mu/browse_slack/convert_slack.py

77 lines
2.9 KiB
Python
Raw Normal View History

2021-08-07 04:01:38 +00:00
# Import JSON from a Slack admin export into a disk image Qemu can load.
#
# Dependencies: python, netpbm
#
2021-08-08 18:15:22 +00:00
# Step 1: download a Slack archive
#
# Step 2: download user avatars to subdirectory images/ and convert them to PPM in subdirectory images/ppm/
# mkdir images
# cd images
2021-08-07 04:01:38 +00:00
# grep image_72 . -r |grep -v users.json |awk '{print $3}' |sort |uniq |sed 's/?.*//' |sed 's,\\,,g' |sed 's/"//' |sed 's/",$//' > images.list
# wget -i images.list --wait=0.1
# # fix mislabeled images: some files named .jpg actually contain PNG data
# for f in $(file *.jpg |grep PNG |sed 's/:.*//'); do mv -i $f $(echo $f |sed 's/\.jpg$/.png/'); done
# #
# mkdir ppm
# for f in *.jpg; do jpegtopnm $f |pnmtopnm -plain > ppm/$(echo $f |sed 's/\.jpg$//').ppm; done
# for f in *.png; do png2pnm -n $f > ppm/$(echo $f |sed 's/\.png$//').ppm; done
#
2021-08-08 18:15:22 +00:00
# Step 3: construct a disk image out of the archives and avatars
# cd ../.. # go back to parent of images/
2021-08-07 04:01:38 +00:00
# dd if=/dev/zero of=data.img count=201600 # 100MB
2021-08-08 18:15:22 +00:00
# python path/to/convert_slack.py |dd of=data.img conv=notrunc
2021-08-07 04:01:38 +00:00
# Currently this process yields errors for ~70 items on the Future of Software
# group. We fail to load those.
#
# Notes on input format:
# Redundant 'type' field that's always 'message'. Probably an "enterprise" feature.
from sys import argv, stderr
import json
from os import listdir
from os.path import isfile, join, basename, splitext
from urllib.parse import urlparse
def look_up_ppm_image(url):
    """Return the text of the locally converted PPM avatar for `url`.

    The avatar is looked up as images/ppm/<stem>.ppm relative to the current
    directory, where <stem> is the last path component of the URL with its
    extension removed (any query string is ignored).

    Returns None when no such file exists.
    """
    file_root = splitext(basename(urlparse(url).path))[0]
    filename = f"images/ppm/{file_root}.ppm"
    if not isfile(filename):
        return None
    with open(filename) as f:
        return f.read()
2021-08-10 11:15:40 +00:00
# Emit one record per user and remember each user's position in users.json,
# so that messages can refer to their author by index.
user_id = {}  # Slack user id -> index into users.json
with open('users.json', encoding='utf-8') as f:
    for idx, user in enumerate(json.load(f)):
        # Not every user record carries a real name; print an empty one then.
        real_name = user.get('real_name', '')
        # Avatar as PPM text, or nothing if no converted image was found.
        avatar = look_up_ppm_image(user['profile']['image_72']) or ''
        print(f"({json.dumps(user['id'])} \"@{user['name']}\" {json.dumps(real_name)} [{avatar}])")
        user_id[user['id']] = idx
# NOTE(review): `items` is not referenced anywhere in the visible script.
items = []
2021-08-10 11:11:51 +00:00
def contents(filename):
    """Yield one record per loadable message in a channel's JSON export file.

    Each record is a dict with keys:
      'name':     "/<ts>" for a top-level post, or "/<thread_ts>/<ts>" for a
                  message that carries a 'thread_ts'
      'contents': the message text
      'by':       the author's index, looked up in the global `user_id`

    Messages missing 'ts', 'text' or 'user', or whose user is not in
    `user_id`, are written to stderr and skipped.
    """
    # JSON is UTF-8 by specification; don't depend on the locale's encoding.
    with open(filename, encoding='utf-8') as f:
        messages = json.load(f)
    for item in messages:
        try:
            if 'thread_ts' in item:
                # comment
                name = f"/{item['thread_ts']}/{item['ts']}"
            else:
                # top-level post
                name = f"/{item['ts']}"
            record = {
                'name': name,
                'contents': item['text'],
                'by': user_id[item['user']],
            }
        except KeyError:
            # Best effort: report the item we can't represent and move on.
            stderr.write(repr(item)+'\n')
            continue
        yield record
2021-08-10 11:28:44 +00:00
# Emit one record per message, channel by channel.
with open('channels.json', encoding='utf-8') as f:
    channels = json.load(f)
for channel in channels:
    # Each channel's messages live in a directory named after the channel.
    # listdir() returns names in arbitrary order, so sort for stable output.
    for filename in sorted(listdir(channel['name'])):
        for item in contents(join(channel['name'], filename)):
            print(f"({json.dumps(item['name'])} {json.dumps(channel['name'])} {item['by']} {json.dumps(item['contents'])})")