episodegen/transgrab.py

42 lines
2.0 KiB
Python

import requests, re, string
from jsonpath import jsonpath
from urllib.parse import quote as raw_quote
import argparse
# deals with quote treating slashes as safe by default
def quote(s):
    """Percent-encode s with no characters treated as safe.

    urllib's quote leaves "/" unescaped by default; passing an empty
    ``safe`` set makes slashes get encoded as well.
    """
    encoded = raw_quote(s, safe="")
    return encoded
# gets transcript for episode eptitle, if raw then return raw JSON response, passthrough direction style
def getTranscript(eptitle="Gem Glow",raw=False,dir_style=1,timeout=30):
    """Fetch the transcript page for the episode titled eptitle.

    The title is percent-encoded (slashes included) and "/Transcript" is
    appended to it to form the page title sent to the wiki API.

    raw: when true, return the HTTP response body as text, unprocessed.
    dir_style: passed straight through to processTranscript.
    timeout: seconds to wait on the server. requests has no default
        timeout, so without one a stalled connection would block forever.

    Raises an HTTP error (via raise_for_status) on an error status code.
    """
    url = "http://steven-universe.wikia.com/api.php?format=json&action=query&titles={}%2FTranscript&prop=revisions&rvprop=content".format(quote(eptitle))
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    if raw:
        return response.text
    return processTranscript(response.json(), dir_style)
def processTranscript(r,dir_style=1):
    """Reduce a decoded transcript API response to plain dialogue text.

    r is the decoded JSON response; the page's wikitext is pulled out of it
    with the JSONPath query below and then handled one line at a time.

    dir_style selects how stage directions are stripped:
        1 - parenthesis characters are removed and ''...'' spans deleted
        2 - (...) spans are deleted
        any other value leaves directions untouched.

    Returns the surviving lines joined with newlines.
    """
    wikitext = jsonpath(r,"$.query.pages.[0]")[0]["*"]
    dialogue = []
    # The first two lines and the last line of the page are skipped.
    for row in wikitext.split("\n")[2:-1]:
        # Rows without a pipe (this includes empty lines) carry no fields.
        if "|" not in row:
            continue
        # Strip music notes and any other non-ASCII characters.
        row = "".join(ch for ch in row if ord(ch) < 128)
        fields = row.split("|", 2)
        # Keep only rows that have a third field and a non-empty second
        # field. Rows with a single pipe used to raise IndexError here.
        if len(fields) < 3 or not fields[1]:
            continue
        # Drop the last two characters of the row - presumably the closing
        # "}}" of the line template; TODO confirm against the page markup.
        line = fields[2][:-2]
        if dir_style == 1:
            line = line.replace("(", "").replace(")", "")
            # Non-greedy so that two italic spans on one line do not swallow
            # the dialogue between them; {2,} also consumes ''' bold runs.
            line = re.sub(r"'{2,}.*?'{2,}", "", line)
        elif dir_style == 2:
            # Deleting a parenthetical leaves a double space behind;
            # collapse it back to a single space.
            line = re.sub(r"\([^)]*\)", "", line).replace("  ", " ")
        # [[target|label]] -> label. The target part must not cross a "]",
        # otherwise an earlier unpiped link on the same line gets eaten.
        line = re.sub(r"\[\[[^|\]]*\|([^\]]+)\]\]", r"\g<1>", line)
        # Remaining unpiped links just lose their brackets.
        dialogue.append(line.replace("[[", "").replace("]]", ""))
    return "\n".join(dialogue)
def normalize(t):
    """Lower-case t, delete ASCII punctuation, and turn spaces into underscores."""
    drop_punctuation = str.maketrans("", "", string.punctuation)
    lowered = t.lower()
    return lowered.translate(drop_punctuation).replace(" ", "_")
# Command-line entry point: fetch one episode's transcript and store it
# under corpora/, named after the normalized title.
if __name__=="__main__":
    parser = argparse.ArgumentParser(description="Grabs transcripts.")
    parser.add_argument("--raw","-r",action="store_true",help="Store raw JSON response")
    parser.add_argument("--directive-style","-d",help="Style of directives. 1 = double apostraphe and parens, 2 = just parens",nargs="?",default=1,type=int)
    parser.add_argument("title",help="title of episode")
    args = parser.parse_args()
    # Fetch before opening the output file: opening in "w" mode truncates,
    # so a failed request would otherwise leave an empty (or clobbered) file.
    transcript = getTranscript(args.title,args.raw,args.directive_style)
    # An explicit encoding keeps --raw output, which may contain non-ASCII
    # characters, from depending on the platform's default encoding.
    with open("corpora/{}.txt".format(normalize(args.title)),"w",encoding="utf-8") as f:
        f.write(transcript)