2018-08-03 05:26:39 +00:00
import requests , re , string
2018-08-03 03:39:56 +00:00
from jsonpath import jsonpath
from urllib . parse import quote as raw_quote
import argparse
# Wrapper around urllib's quote, which treats slashes as safe by default.
def quote(s):
    """Percent-encode ``s`` for use as a single URL component.

    ``urllib.parse.quote`` leaves ``/`` unescaped by default.  Passing an
    empty ``safe`` set makes it encode slashes (and spaces) as well, so the
    value cannot alter the structure of the URL it is inserted into.

    Args:
        s: Text to encode.

    Returns:
        The percent-encoded string.
    """
    return raw_quote(s, safe="")
# Gets the transcript for episode `eptitle`. If `raw` is set, the raw JSON response is returned instead; `dir_style` is passed through to processTranscript.
def getTranscript(eptitle="Gem Glow", raw=False, dir_style=1, timeout=30):
    """Download the ``<eptitle>/Transcript`` page from steven-universe.wikia.com.

    Args:
        eptitle: Episode title; it is percent-encoded with ``quote`` before
            being placed in the query string.
        raw: When true, return the response body text unprocessed.
        dir_style: Passed straight through to ``processTranscript``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        The response body as text when ``raw`` is true, otherwise the result
        of ``processTranscript`` applied to the decoded JSON.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: No response arrived within ``timeout`` seconds.
    """
    url = (
        "http://steven-universe.wikia.com/api.php"
        "?format=json&action=query&titles={}%2FTranscript"
        "&prop=revisions&rvprop=content"
    ).format(quote(eptitle))
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    if raw:
        return response.text
    return processTranscript(response.json(), dir_style)
def processTranscript(r, dir_style=1):
    """Reduce a decoded API response to plain transcript text.

    Args:
        r: Decoded JSON response; the page text is read from the ``"*"``
            entry of the first match of ``$.query.pages.[0]``.
        dir_style: How stage directions are removed.
            ``1`` drops parenthesis characters and then deletes every span
            wrapped in two or more apostrophes; ``2`` deletes ``(...)`` spans
            and collapses the double spaces left behind; any other value
            leaves directions untouched.

    Returns:
        The kept lines joined with newlines.
    """
    # NOTE(review): jsonpath presumably returns False when nothing matches, so
    # a missing page would surface here as a TypeError -- confirm.
    wikitext = jsonpath(r, "$.query.pages.[0]")[0]["*"]

    lines = []
    # The first two lines and the last line of the page are skipped.
    for row in wikitext.split("\n")[2:-1]:
        if "|" not in row:
            continue
        # Filter music notes and any other non-ASCII characters.
        row = "".join(ch for ch in row if ord(ch) < 128)
        fields = row.split("|", 2)
        # Rows with an empty second field are dropped; rows with fewer than
        # three fields are skipped instead of raising IndexError.
        if len(fields) < 3 or not fields[1]:
            continue
        # The last two characters of the third field are trimmed off.
        lines.append(fields[2][:-2])

    if dir_style == 1:
        # Non-greedy match: a greedy ".*" would also delete any dialogue that
        # sits between two separate directions on the same line.
        lines = [
            re.sub(r"'{2,}.*?'{2,}", "", ln.replace("(", "").replace(")", ""))
            for ln in lines
        ]
    elif dir_style == 2:
        lines = [re.sub(r"\([^)]*\)", "", ln).replace("  ", " ") for ln in lines]

    # [[target|label]] -> label.  The target part excludes "]" so the match
    # cannot start at an earlier, unrelated [[link]].  Any brackets that
    # remain (plain [[links]]) are then stripped.
    lines = [
        re.sub(r"\[\[[^|\]]*\|([^\]]+)\]\]", r"\g<1>", ln)
        .replace("[[", "")
        .replace("]]", "")
        for ln in lines
    ]
    return "\n".join(lines)
def normalize(t):
    """Turn a title into a lowercase, punctuation-free, underscore-separated name.

    Args:
        t: Title text.

    Returns:
        ``t`` lowercased, with every character found in
        ``string.punctuation`` removed and each space replaced by ``_``.
    """
    kept = "".join(ch for ch in t.lower() if ch not in string.punctuation)
    return kept.replace(" ", "_")
2018-08-03 03:39:56 +00:00
# Command-line entry point: fetch one episode transcript and store it under
# corpora/<normalized title>.txt.
if __name__ == "__main__":
    import os

    parser = argparse.ArgumentParser(description="Grabs transcripts.")
    parser.add_argument("--raw", "-r", action="store_true", help="Store raw JSON response")
    parser.add_argument(
        "--directive-style",
        "-d",
        help="Style of directives. 1 = double apostraphe and parens, 2 = just parens",
        nargs="?",
        const=1,  # a bare "-d" falls back to style 1 instead of None
        default=1,
        type=int,
    )
    parser.add_argument("title", help="title of episode")
    args = parser.parse_args()

    # Fetch first: opening the output file before the request would leave an
    # empty (or truncated) file behind whenever the download fails.
    transcript = getTranscript(args.title, args.raw, args.directive_style)

    os.makedirs("corpora", exist_ok=True)
    # Explicit encoding: the raw response may contain non-ASCII characters.
    with open("corpora/{}.txt".format(normalize(args.title)), "w", encoding="utf-8") as f:
        f.write(transcript)