# mu/browse_slack/convert_slack.py

# Import JSON from a Slack admin export into a disk image Qemu can load.
#
# Dependencies: python 3.6+ (for f-strings), netpbm; Step 2 below also calls png2pnm
#
# Step 1: download a Slack archive
#
# Step 2: download user avatars to subdirectory images/ and convert them to PPM in subdirectory images/ppm/
#   mkdir images
#   cd images
#   grep image_72 . -r |grep -v users.json |awk '{print $3}' |sort |uniq |sed 's/?.*//' |sed 's,\\,,g' |sed 's/"//' |sed 's/",$//' > images.list
#   wget -i images.list --wait=0.1
#   # fix some lying images
#   for f in $(file *.jpg |grep PNG |sed 's/:.*//'); do mv -i $f $(echo $f |sed 's/\.jpg$/.png/'); done
#   #
#   mkdir ppm
#   for f in *.jpg; do jpegtopnm $f |pnmtopnm -plain > ppm/$(echo $f |sed 's/\.jpg$//').ppm; done
#   for f in *.png; do png2pnm -n $f > ppm/$(echo $f |sed 's/\.png$//').ppm; done
#
# Step 3: construct a disk image out of the archives and avatars
#   cd ..  # go back to parent of images/
#   dd if=/dev/zero of=data.img count=201600  # 100MB
#   python path/to/convert_slack.py |dd of=data.img conv=notrunc
# Currently this process yields errors for ~70 items on the Future of Software
# group. We fail to load those.
#
# Notes on input format:
#   Redundant 'type' field that's always 'message'. Probably an "enterprise" feature.

from sys import argv, stderr
import json
from os import listdir
from os.path import isfile, join, basename, splitext
from urllib.parse import urlparse

def look_up_ppm_image(url):
    """Return the text of the locally converted PPM avatar for `url`, if any.

    The avatar is looked up under images/ppm/ (relative to the current
    directory) by the last path component of `url`, with its extension
    replaced by .ppm.  Any query string in `url` is ignored.

    Returns None when no such file exists.
    """
    leaf = basename(urlparse(url).path)
    stem, _extension = splitext(leaf)
    ppm_path = f"images/ppm/{stem}.ppm"
    if not isfile(ppm_path):
        return None
    with open(ppm_path) as ppm:
        return ppm.read()

user_id = {}  # Slack user id -> index of that user in users.json
# Emit one record per user: (id "@name" real-name [avatar PPM text]).
# The export is JSON, so decode it as UTF-8 rather than the locale's default.
with open('users.json', encoding='utf-8') as f:
    for idx, user in enumerate(json.load(f)):
        # Not every user record carries a real_name; emit an empty string then.
        real_name = user.get('real_name', '')
        # Users without a converted avatar on disk get an empty image field.
        avatar = look_up_ppm_image(user['profile']['image_72']) or ''
        print(f"({json.dumps(user['id'])} \"@{user['name']}\" {json.dumps(real_name)} [{avatar}])")
        # Messages refer to their author by this index (see contents()).
        user_id[user['id']] = idx

# NOTE(review): not referenced anywhere else in the visible file.
items = []

def contents(filename):
    """Yield one record per message in a single Slack export file.

    `filename` names a JSON file holding a list of message objects.  Each
    yielded dict has three keys:
      'name':     "/<ts>" for a top-level post, or "/<thread_ts>/<ts>" for a
                  message that carries a 'thread_ts'
      'contents': the message's 'text'
      'by':       the author's index in the module-level user_id table

    Messages that lack 'ts', 'text' or 'user', or whose author is not in
    user_id, are skipped; their repr is written to stderr instead.
    """
    # Exports are JSON, so decode as UTF-8 instead of the locale's default.
    # Load everything up front so the file is closed before the first yield.
    with open(filename, encoding='utf-8') as f:
        messages = json.load(f)

    for item in messages:
        try:
            if 'thread_ts' in item:
                # comment
                # NOTE(review): a thread's root message may itself carry
                # thread_ts == ts, in which case it is named "/<ts>/<ts>" --
                # confirm that is what the reader expects.
                name = f"/{item['thread_ts']}/{item['ts']}"
            else:
                # top-level post
                name = f"/{item['ts']}"
            record = {
                'name': name,
                'contents': item['text'],
                'by': user_id[item['user']],
            }
        except KeyError:
            # Best effort: report the message we couldn't convert and move on.
            stderr.write(repr(item)+'\n')
            continue
        yield record

# Emit one record per message: (name channel author-index text).
# Read the channel list inside a `with` so the file handle is closed promptly,
# and decode it as UTF-8 rather than the locale's default.
with open('channels.json', encoding='utf-8') as f:
    channels = json.load(f)

for channel in channels:
    # The quoted channel name is the same for every message in the channel.
    channel_name = json.dumps(channel['name'])
    # Walk the channel's directory in sorted filename order.
    for filename in sorted(listdir(channel['name'])):
        for item in contents(join(channel['name'], filename)):
            print(f"({json.dumps(item['name'])} {channel_name} {item['by']} {json.dumps(item['contents'])})")
. 2021-08-07 04:01:38 +00:00			`# Import JSON from a Slack admin export into a disk image Qemu can load.`
			`#`
			`# Dependencies: python, netpbm`
beginnings of a Slack archive reader I'm hackily depending on Python (3.something) to prototype the disk image creator. But no non-std libs. Once the disk image is created, I've validated that it can be loaded from disk without too much latency (assuming KVM). 2021-08-07 03:46:48 +00:00			`#`
. 2021-08-08 18:15:22 +00:00			`# Step 1: download a Slack archive`
			`#`
			`# Step 2: download user avatars to subdirectory images/ and convert them to PPM in subdirectory images/ppm/`
			`# mkdir images`
			`# cd images`
. 2021-08-07 04:01:38 +00:00			`# grep image_72 . -r \|grep -v users.json \|awk '{print $3}' \|sort \|uniq \|sed 's/?.*//' \|sed 's,\\,,g' \|sed 's/"//' \|sed 's/",$//' > images.list`
beginnings of a Slack archive reader I'm hackily depending on Python (3.something) to prototype the disk image creator. But no non-std libs. Once the disk image is created, I've validated that it can be loaded from disk without too much latency (assuming KVM). 2021-08-07 03:46:48 +00:00			`# wget -i images.list --wait=0.1`
			`# # fix some lying images`
			`# for f in $(file *.jpg \|grep PNG \|sed 's/:.*//'); do mv -i $f $(echo $f \|sed 's/\.jpg$/.png/'); done`
			`# #`
			`# mkdir ppm`
			`# for f in *.jpg; do jpegtopnm $f \|pnmtopnm -plain > ppm/$(echo $f \|sed 's/\.jpg$//').ppm; done`
			`# for f in *.png; do png2pnm -n $f > ppm/$(echo $f \|sed 's/\.png$//').ppm; done`
			`#`
. 2021-08-08 18:15:22 +00:00			`# Step 3: construct a disk image out of the archives and avatars`
			`# cd ../.. # go back to parent of images/`
. 2021-08-07 04:01:38 +00:00			`# dd if=/dev/zero of=data.img count=201600 # 100MB`
. 2021-08-08 18:15:22 +00:00			`# python path/to/convert_slack.py \|dd of=data.img conv=notrunc`
. 2021-08-07 04:01:38 +00:00			`# Currently this process yields errors for ~70 items on the Future of Software`
			`# group. We fail to load those.`
beginnings of a Slack archive reader I'm hackily depending on Python (3.something) to prototype the disk image creator. But no non-std libs. Once the disk image is created, I've validated that it can be loaded from disk without too much latency (assuming KVM). 2021-08-07 03:46:48 +00:00			`#`
			`# Notes on input format:`
			`# Redundant 'type' field that's always 'message'. Probably an "enterprise" feature.`

			`from sys import argv, stderr`
			`import json`
			`from os import listdir`
			`from os.path import isfile, join, basename, splitext`
			`from urllib.parse import urlparse`

			`def look_up_ppm_image(url):`
			`file_root = splitext(basename(urlparse(url).path))[0]`
. 2021-08-08 18:15:22 +00:00			`filename = f"images/ppm/{file_root}.ppm"`
beginnings of a Slack archive reader I'm hackily depending on Python (3.something) to prototype the disk image creator. But no non-std libs. Once the disk image is created, I've validated that it can be loaded from disk without too much latency (assuming KVM). 2021-08-07 03:46:48 +00:00			`if isfile(filename):`
			`with open(filename) as f:`
			`return f.read()`

. 2021-08-10 11:15:40 +00:00			`user_id = {} # name -> index`
. 2021-08-10 11:18:34 +00:00			`with open('users.json') as f:`
. 2021-08-10 11:22:29 +00:00			`for idx, user in enumerate(json.load(f)):`
. 2021-08-10 11:19:54 +00:00			`if 'real_name' not in user:`
			`user['real_name'] = ''`
			`print(f"({json.dumps(user['id'])} \"@{user['name']}\" {json.dumps(user['real_name'])} [{look_up_ppm_image(user['profile']['image_72']) or ''}])")`
. 2021-08-10 11:22:29 +00:00			`user_id[user['id']] = idx`
beginnings of a Slack archive reader I'm hackily depending on Python (3.something) to prototype the disk image creator. But no non-std libs. Once the disk image is created, I've validated that it can be loaded from disk without too much latency (assuming KVM). 2021-08-07 03:46:48 +00:00
. 2021-08-10 11:16:36 +00:00			`items = []`

. 2021-08-10 11:11:51 +00:00			`def contents(filename):`
			`with open(filename) as f:`
			`for item in json.load(f):`
			`try:`
			`if 'thread_ts' in item:`
			`# comment`
			`yield {`
			`'name': f"/{item['thread_ts']}/{item['ts']}",`
			`'contents': item['text'],`
			`'by': user_id[item['user']],`
			`}`
			`else:`
			`# top-level post`
			`yield {`
			`'name': f"/{item['ts']}",`
			`'contents': item['text'],`
			`'by': user_id[item['user']],`
			`}`
			`except KeyError:`
			`stderr.write(repr(item)+'\n')`

. 2021-08-10 11:28:44 +00:00			`for channel in json.load(open('channels.json')):`
. 2021-08-10 11:34:58 +00:00			`for filename in sorted(listdir(channel['name'])):`
			`for item in contents(join(channel['name'], filename)):`
. 2021-08-10 11:28:44 +00:00			`print(f"({json.dumps(item['name'])} {json.dumps(channel['name'])} {item['by']} {json.dumps(item['contents'])})")`