mu/convert_slack.py


# Import JSON from a Slack admin export.
#
# Images downloaded as follows:
# grep image_72 . -r |grep -v users.json |column 3 |sort |uniq |sed 's/?.*//' |sed 's,\\,,g' |sed 's/"//' |sed 's/", $//' > images.list
# wget -i images.list --wait=0.1
# # fix some lying images (.jpg files that are actually PNGs)
# for f in $(file *.jpg |grep PNG |sed 's/:.*//'); do mv -i $f $(echo $f |sed 's/\.jpg$/.png/'); done
# #
# mkdir ppm
# for f in *.jpg; do jpegtopnm $f |pnmtopnm -plain > ppm/$(echo $f |sed 's/\.jpg$//').ppm; done
# for f in *.png; do png2pnm -n $f > ppm/$(echo $f |sed 's/\.png$//').ppm; done
#
# Dependencies: python, netpbm, and my 'column' perl script
#
# Notes on input format:
# Redundant 'type' field that's always 'message'. Probably an "enterprise" feature.
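#
# Output (stdout): one s-expression-ish line per user and per message, roughly:
#   ("<user id>" "@<username>" "<real name>" [<plain-PPM avatar or empty>])
#   ("/<ts>" "<channel name>" <user index> "<message text>")
# plus each source filename, printed before its messages.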
from sys import argv, stderr
import json
from os import listdir
from os.path import isfile, join, basename, splitext
from urllib.parse import urlparse
channels = {} # channel id -> channel name
user_id = {} # Slack user id -> index (the order users are printed in)
users = []
items = []
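# Yield one record per message in a channel-export JSON file: thread replies
# (those with a 'thread_ts') are named /<thread_ts>/<ts>, top-level posts /<ts>.
# Items missing an expected field (e.g. no 'user') are dumped to stderr and skipped.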
def contents(filename):
    with open(filename) as f:
        for item in json.load(f):
            try:
                if 'thread_ts' in item:
                    # comment
                    yield {
                        'name': f"/{item['thread_ts']}/{item['ts']}",
                        'contents': item['text'],
                        'by': user_id[item['user']],
                        #? 'by': users[user_id[item['user']]]['avatar'][0:100],
                    }
                else:
                    # top-level post
                    yield {
                        'name': f"/{item['ts']}",
                        'contents': item['text'],
                        'by': user_id[item['user']],
                        #? 'by': users[user_id[item['user']]]['avatar'][0:100],
                    }
            except KeyError:
                stderr.write(repr(item)+'\n')
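# Yield the regular files in a channel directory (the export keeps one JSON
# file per day), in sorted order.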
def filenames(dir):
    for filename in sorted(listdir(dir)):
        result = join(dir, filename)
        if isfile(result):
            yield result
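# Return the contents of the locally converted plain-PPM avatar for an image
# URL (see the pipeline in the header), or None if it was never downloaded.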
def look_up_ppm_image(url):
    file_root = splitext(basename(urlparse(url).path))[0]
    filename = f"images2/ppm/{file_root}.ppm"
    if isfile(filename):
        with open(filename) as f:
            return f.read()
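# Print one record per user and remember each user's position in user_id,
# so messages can refer to their author by index.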
def load_users():
    stderr.write('loading users..\n')
    length = 0
    with open('users.json') as f:
        for user in json.load(f):
            #? if user['deleted']:
            #?     continue
            if user['id'] not in user_id:
                if 'real_name' not in user:
                    user['real_name'] = ''
                print(f"({json.dumps(user['id'])} \"@{user['name']}\" {json.dumps(user['real_name'])} [{look_up_ppm_image(user['profile']['image_72']) or ''}])")
                #? users.append({
                #?     'id': user['id'],
                #?     'username': user['name'],
                #?     'name': user['real_name'],
                #?     'avatar': look_up_ppm_image(user['profile']['image_72']),
                #? })
                user_id[user['id']] = length
                length += 1
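# Map channel ids to channel names; each channel's messages live in a
# directory named after the channel.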
def load_channels():
    stderr.write('loading channels..\n')
    with open('channels.json') as f:
        for channel in json.load(f):
            channels[channel['id']] = channel['name']
load_channels()
load_users()
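# Walk every channel directory and print one record per message.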
for dir in channels.values():
    try:
        for filename in filenames(dir):
            print(filename)
            for item in contents(filename):
                print(f"({json.dumps(item['name'])} {json.dumps(dir)} {item['by']} {json.dumps(item['contents'])})")
    except NotADirectoryError:
        pass