# episodegen/transgrab.py

import requests, re, string
from jsonpath import jsonpath
from urllib.parse import quote as raw_quote
import argparse

def quote(s):
	"""Percent-encode *s*, treating no character as safe.

	The underlying ``quote`` leaves ``/`` unescaped by default; passing an
	empty ``safe`` set makes slashes come out percent-encoded as well.
	"""
	no_safe_chars = ""
	return raw_quote(s, safe=no_safe_chars)

def getTranscript(eptitle="Gem Glow", raw=False, dir_style=1, timeout=30):
	"""Download the ``<eptitle>/Transcript`` wiki page through the wiki API.

	Args:
		eptitle: Episode title; it is percent-encoded and ``/Transcript`` is
			appended to form the page title that is queried.
		raw: When true, return the response body text unchanged instead of
			processing it.
		dir_style: Passed through unchanged to ``processTranscript``.
		timeout: Seconds to wait for the HTTP request before giving up.

	Returns:
		The response text when ``raw`` is true, otherwise whatever
		``processTranscript`` returns for the decoded JSON response.

	Raises:
		requests.HTTPError: If the server answers with an error status
			(via ``raise_for_status``).
		requests.Timeout: If the server does not respond within ``timeout``.
	"""
	url = "http://steven-universe.wikia.com/api.php?format=json&action=query&titles={}%2FTranscript&prop=revisions&rvprop=content".format(quote(eptitle))
	# requests applies no timeout unless asked, so an unresponsive server
	# would otherwise block the caller indefinitely.
	response = requests.get(url, timeout=timeout)
	response.raise_for_status()
	if raw:
		return response.text
	return processTranscript(response.json(), dir_style)

def processTranscript(r, dir_style=1):
	"""Reduce a decoded wiki API response to plain transcript lines.

	Args:
		r: The decoded JSON response of the revisions query.
		dir_style: How stage directions are stripped. ``1`` removes paren
			characters and any text wrapped in runs of apostrophes; ``2``
			removes parenthesised text only; any other value leaves
			directions untouched.

	Returns:
		The cleaned lines joined with newlines.
	"""
	# NOTE(review): presumably this path resolves to the first revision
	# object, whose "*" entry holds the page wikitext -- confirm against the
	# jsonpath library in use.
	wikitext = jsonpath(r, "$.query.pages.[0]")[0]["*"]

	lines = []
	# The first two rows and the last row of the page are skipped
	# (presumably wrapper markup -- TODO confirm).
	for row in wikitext.split("\n")[2:-1]:
		if "|" not in row:
			continue
		# Drop music notes and any other non-ASCII characters.
		row = "".join(ch for ch in row if ord(ch) < 128)
		fields = row.split("|", 2)
		# A usable row needs a non-empty second field and a third field.
		# Rows with a single pipe used to raise IndexError; skip them.
		if len(fields) < 3 or not fields[1]:
			continue
		# The last two characters of the third field are cut off
		# (presumably closing markup -- TODO confirm).
		lines.append(fields[2][:-2])

	if dir_style == 1:
		# Non-greedy, so dialogue between two separate marked-up spans on one
		# line survives; '{2,} keeps three-apostrophe runs removed whole.
		lines = [
			re.sub(r"'{2,}.*?'{2,}", "", ln.replace("(", "").replace(")", ""))
			for ln in lines
		]
	elif dir_style == 2:
		lines = [re.sub(r"\([^)]*\)", "", ln).replace("  ", " ") for ln in lines]

	# [[target|label]] -> label. The target part must not cross "]" so that a
	# plain [[link]] earlier on the line is not swallowed together with the
	# text that follows it. Remaining bare brackets are then stripped.
	lines = [
		re.sub(r"\[\[[^|\]]*\|([^\]]+)\]\]", r"\g<1>", ln).replace("[[", "").replace("]]", "")
		for ln in lines
	]
	return "\n".join(lines)

def normalize(t):
	"""Return *t* lower-cased, with ASCII punctuation removed and each space
	replaced by an underscore."""
	strip_punctuation = str.maketrans("", "", string.punctuation)
	lowered = t.lower()
	return lowered.translate(strip_punctuation).replace(" ", "_")

if __name__=="__main__":
	# Command-line entry point: fetch one episode transcript and store it
	# under corpora/, named after the normalized title.
	parser = argparse.ArgumentParser(description="Grabs transcripts.")
	parser.add_argument("--raw","-r",action="store_true",help="Store raw JSON response")
	# With nargs="?", passing -d without a value yields None, which matches
	# neither directive style, so no directions are stripped.
	parser.add_argument("--directive-style","-d",help="Style of directives. 1 = double apostraphe and parens, 2 = just parens",nargs="?",default=1,type=int)
	parser.add_argument("title",help="title of episode")
	args = parser.parse_args()
	# Fetch before opening the output file: opening in "w" mode truncates, so
	# a failed request would otherwise leave an empty or clobbered file.
	transcript = getTranscript(args.title,args.raw,args.directive_style)
	# Explicit encoding so the output does not depend on the platform default.
	with open("corpora/{}.txt".format(normalize(args.title)),"w",encoding="utf-8") as f:
		f.write(transcript)