mu/browse-slack/convert_slack.py

88 lines
3.5 KiB
Python
Raw Permalink Normal View History

2021-08-10 16:13:59 +00:00
# Import JSON from a Slack admin export into a disk image Mu can load.
2021-08-07 04:01:38 +00:00
#
2021-08-12 02:07:51 +00:00
# Dependencies: python 3.6+ (the script uses f-strings), wget, awk, sed, netpbm
#
2021-08-12 02:07:51 +00:00
# Step 1: download a Slack archive and unpack it to some directory
2021-08-08 18:15:22 +00:00
#
# Step 2: download user avatars to subdirectory images/ and convert them to PPM in subdirectory images/ppm/
2021-08-12 02:07:51 +00:00
# grep image_72 . -r |grep -v users.json |awk '{print $3}' |sort |uniq |sed 's/?.*//' |sed 's,\\,,g' |sed 's/"//' |sed 's/",$//' > images.list
2021-08-08 18:15:22 +00:00
# mkdir images
# cd images
2021-08-12 02:07:51 +00:00
# wget -i ../images.list --wait=0.1
# # fix some lying images
# for f in $(file *.jpg |grep PNG |sed 's/:.*//'); do mv -i $f $(echo $f |sed 's/\.jpg$/.png/'); done
# #
# mkdir ppm
# for f in *.jpg; do jpegtopnm $f |pnmtoplainpnm > ppm/$(echo $f |sed 's/\.jpg$//').ppm; done
# for f in *.png; do pngtopnm $f |pnmtoplainpnm > ppm/$(echo $f |sed 's/\.png$//').ppm; done
#
# (Depending on your OS, you may need to replace pnmtoplainpnm with `pnmtopnm -plain`. Some places also have a pnm2pnm.
# I don't understand it either.)
#
2021-08-08 18:15:22 +00:00
# Step 3: construct a disk image out of the archives and avatars
2021-08-12 02:07:51 +00:00
# cd .. # go back to the top-level archive directory
2021-08-07 04:01:38 +00:00
# dd if=/dev/zero of=data.img count=201600 # 100MB
2021-08-12 02:07:51 +00:00
# python path/to/convert_slack.py > data.out 2> data.err
2021-08-12 01:40:10 +00:00
# dd if=data.out of=data.img conv=notrunc
2021-08-10 16:13:59 +00:00
# Currently this process yields errors for ~300 items (~70 posts and their comments)
# on the Future of Coding group (https://futureofcoding.org/community). We fail to load those.
#
# Notes on input format:
# Redundant 'type' field that's always 'message'. Probably an "enterprise" feature.
from sys import argv, stderr
import json
from os import listdir
from os.path import isfile, join, basename, splitext
from urllib.parse import urlparse
import traceback
def look_up_ppm_image(url):
    """Return the contents of the PPM file corresponding to avatar `url`.

    The file is located by taking the base name of the URL's path, dropping
    its extension, and looking for images/ppm/NAME.ppm relative to the
    current directory. Returns None when that file does not exist.
    """
    image_name = basename(urlparse(url).path)
    stem, _extension = splitext(image_name)
    ppm_path = f"images/ppm/{stem}.ppm"
    if not isfile(ppm_path):
        # no converted avatar on disk for this URL
        return None
    with open(ppm_path) as ppm_file:
        return ppm_file.read()
2021-08-10 11:55:19 +00:00
# Emit one record per user, in users.json order:
#   ("ID" "@NAME" "REAL NAME" [AVATAR])
# where AVATAR is the text of the user's PPM file, or empty if we have none.
# Also remember each user's position in that list, so that items can refer
# to their author by index.
user_idx = {}  # user id -> index of that user's record in the output
# JSON is UTF-8 by spec; don't let the locale's default encoding decide.
with open('users.json', encoding='utf-8') as f:
    for idx, user in enumerate(json.load(f)):
        # some users have no 'real_name'; emit an empty string for them
        user.setdefault('real_name', '')
        print(f"({json.dumps(user['id'])} \"@{user['name']}\" {json.dumps(user['real_name'])} [{look_up_ppm_image(user['profile']['image_72']) or ''}])")
        user_idx[user['id']] = idx
2021-08-10 11:44:43 +00:00
def by(item):
    """Return the index in user_idx of the author of `item`.

    Items with subtype 'bot_message' that carry a 'username' are attributed
    to that username; the first time a username is seen it is assigned the
    next free index in user_idx. All other items are attributed to
    item['user'].

    Raises KeyError if the item has no 'user' or names an unknown user.
    """
    is_bot_post = item.get('subtype') == 'bot_message' and 'username' in item
    if not is_bot_post:
        return user_idx[item['user']]
    bot_name = item['username']
    # register the name on first sight, then reuse its index afterwards
    return user_idx.setdefault(bot_name, len(user_idx))
2021-08-10 11:44:43 +00:00
item_idx = {}  # item 'ts' -> index assigned to that item


def parent(item):
    """Return the index of the item that `item` is a comment on, or -1.

    An item is a comment when it has a 'thread_ts' that differs from its own
    'ts'; its parent is then the item recorded in item_idx under that
    'thread_ts'. Anything else is a top-level post and yields -1.

    Raises KeyError if the parent has not been recorded in item_idx.
    """
    if 'thread_ts' not in item:
        return -1
    thread_ts = item['thread_ts']
    if thread_ts == item['ts']:
        # the item that starts a thread points at itself
        return -1
    return item_idx[thread_ts]
# Gather the items from every file of every channel directory, tagging each
# item with the name of the channel it came from.
items = []
# Use `with` so the handle is closed promptly, and read as UTF-8 (the JSON
# spec's encoding) rather than whatever the locale defaults to.
with open('channels.json', encoding='utf-8') as channels_file:
    channels = json.load(channels_file)
for channel in channels:
    for filename in sorted(listdir(channel['name'])):
        with open(join(channel['name'], filename), encoding='utf-8') as f:
            for item in json.load(f):
                item['channel_name'] = channel['name']
                items.append(item)

# Emit one record per item, ordered by 'ts' (compared as strings):
#   ("TS" PARENT-INDEX "CHANNEL" AUTHOR-INDEX "TEXT")
idx = 0
for item in sorted(items, key=lambda item: item['ts']):
    try:
        print(f"({json.dumps(item['ts'])} {parent(item)} {json.dumps(item['channel_name'])} {by(item)} {json.dumps(item['text'])})")
        item_idx[item['ts']] = idx
        idx += 1  # only increment when actually used and no exception raised
    except KeyError:
        # A key was missing (in the item itself, or while looking up its
        # parent or author): report the item on stderr and leave it out.
        traceback.print_exc(file=stderr)
        stderr.write(repr(item)+'\n')