"""Fetch an episode transcript from the Steven Universe wiki and extract lines.

Run as a script, this appends the extracted text (or the raw API response
when ``--raw`` is given) to ``stagedirections.txt``.
"""

import argparse
import re
import string
from urllib.parse import quote as raw_quote

import requests
from jsonpath import jsonpath

# MediaWiki query that returns the current revision of "<title>/Transcript".
_API_URL = (
    "http://steven-universe.wikia.com/api.php?format=json&action=query"
    "&titles={}%2FTranscript&prop=revisions&rvprop=content"
)

# Seconds to wait on the wiki before giving up; without a timeout a stalled
# connection would block the script forever.
_REQUEST_TIMEOUT = 30

# Greedy on purpose (unchanged from the original): removes everything from
# the first '' to the last '' on a line.
_ITALIC_RE = re.compile("''.*''")
# A parenthesised span with no nested parentheses.
_PAREN_RE = re.compile(r"\([^)]*\)")
# [[target|label]]; the target may not contain "]" so the match cannot run
# across an earlier, un-piped [[link]] on the same line.
_PIPED_LINK_RE = re.compile(r"\[\[[^|\]]*\|([^\]]+)\]\]")

_STRIP_PUNCTUATION = str.maketrans("", "", string.punctuation)


def quote(s):
    """Percent-encode *s*, escaping ``/`` as well.

    ``urllib.parse.quote`` treats slashes as safe by default; passing
    ``safe=''`` turns that off.
    """
    return raw_quote(s, safe='')


def getTranscript(eptitle="Gem Glow", raw=False, dir_style=1):
    """Download the transcript page for episode *eptitle*.

    Args:
        eptitle: Episode title; it is URL-quoted and ``/Transcript`` is
            appended by the query URL.
        raw: When true, return the response body text unprocessed.
        dir_style: Passed through to :func:`processTranscript`.

    Returns:
        The raw response text if *raw* is true, otherwise the string
        produced by :func:`processTranscript`.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer in time.
    """
    r = requests.get(_API_URL.format(quote(eptitle)), timeout=_REQUEST_TIMEOUT)
    r.raise_for_status()
    if raw:
        return r.text
    return processTranscript(r.json(), dir_style)


def processTranscript(r, dir_style=1):
    """Extract text lines from a decoded API response.

    Only pipe-delimited lines whose second field is empty are kept; for
    each, the text after the second pipe is used.

    Args:
        r: Decoded JSON response. The first match of the jsonpath
            expression below must be a mapping with the page text under
            the ``"*"`` key.
        dir_style: ``1`` drops all parentheses and anything between
            double apostrophes; ``2`` drops parenthesised spans; any other
            value leaves the text untouched.

    Returns:
        The extracted lines joined with newlines.
    """
    wikitext = jsonpath(r, "$.query.pages.[0]")[0]["*"]

    text = []
    # The first two lines and the last line are discarded (presumably
    # wrapper markup around the transcript -- confirm against the wiki).
    for line in wikitext.split("\n")[2:-1]:
        if "|" not in line:
            # Also gets rid of empty lines.
            continue
        # Filter music notes and other non-ASCII characters.
        line = "".join(ch for ch in line if ord(ch) < 128)
        fields = line.split("|", 2)
        # Keep only lines with an empty second field. A line with a single
        # pipe has no third field; skip it instead of raising IndexError.
        if fields[1] or len(fields) < 3:
            continue
        # Drop the trailing two characters (presumably a closing "}}" --
        # confirm against the wiki markup).
        text.append(fields[2][:-2])

    if dir_style == 1:
        text = [
            _ITALIC_RE.sub("", x.replace("(", "").replace(")", ""))
            for x in text
        ]
    elif dir_style == 2:
        # Removing a parenthetical leaves two adjacent spaces; collapse them.
        # NOTE(review): the original replaced " " with " " (a no-op), most
        # likely whitespace damage to the literal -- confirm the intent.
        text = [_PAREN_RE.sub("", x).replace("  ", " ") for x in text]

    # Reduce [[target|label]] to its label, then unwrap plain [[links]].
    text = [
        _PIPED_LINK_RE.sub(r"\g<1>", x).replace("[[", "").replace("]]", "")
        for x in text
    ]
    return "\n".join(text)


def normalize(t):
    """Lowercase *t*, remove ASCII punctuation, and turn spaces into ``_``."""
    return t.lower().translate(_STRIP_PUNCTUATION).replace(" ", "_")


def main():
    """Parse the command line and append the transcript to the output file."""
    parser = argparse.ArgumentParser(description="Grabs transcripts.")
    parser.add_argument("--raw", "-r", action="store_true",
                        help="Store raw JSON response")
    # Disabled option, kept for reference:
    # parser.add_argument("--directive-style","-d",help="Style of directives. 1 = double apostraphe and parens, 2 = just parens",nargs="?",default=1,type=int)
    parser.add_argument("title", help="title of episode")
    args = parser.parse_args()
    # Style 3 matches neither stripping branch, so the text is written with
    # its parentheses and apostrophes intact. UTF-8 is explicit because the
    # raw response can contain non-ASCII characters.
    with open("stagedirections.txt", "a", encoding="utf-8") as f:
        f.write(getTranscript(args.title, args.raw, 3))


if __name__ == "__main__":
    main()