# bragi/markchainer.py

import markovify
import nltk
import json
import re
import time
import os
import language_check
import string

class POSifiedText(markovify.Text):
    """Markov text model whose tokens carry a part-of-speech tag.

    Each token is stored as ``"<word>::<tag>"``, with the tag supplied by
    ``nltk.pos_tag``; the tag is stripped again when a sentence is rebuilt.
    """

    def word_split(self, sentence):
        """Split *sentence* into ``"word::TAG"`` tokens.

        The sentence is split with ``self.word_split_pattern`` and the
        resulting words are tagged with ``nltk.pos_tag``.
        """
        tokens = re.split(self.word_split_pattern, sentence)
        tagged_tokens = nltk.pos_tag(tokens)
        return ["::".join(pair) for pair in tagged_tokens]

    def word_join(self, words):
        """Join ``"word::TAG"`` tokens into a space-separated sentence.

        Only the text before the first ``"::"`` of each token is kept.
        """
        bare_words = [entry.partition("::")[0] for entry in words]
        return " ".join(bare_words)


# Directory scanned for corpus files; only names ending in ".std" are used.
corpus_path = "./corpus/prose/"
# Directory that receives one serialized Markov chain file per corpus file.
chains_path = "./corpus/prose/chains/"

# Module-level placeholders; nothing in this section gives them a real value.
tool = None
matches = None
combined_model = None
# Holds the serialized form of the most recently built model (set in the loop).
model_json = None

# Take the listing first so that a missing corpus directory still fails here,
# before the output directory (which lives inside it) gets created.
corpus_files = os.listdir(corpus_path)
# Make sure the output directory exists so the first write cannot fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(chains_path, exist_ok=True)

for corpus_name in corpus_files:
    if not corpus_name.endswith(".std"):
        continue

    # The whole file is read into memory because the model is built from a
    # single string.
    with open(os.path.join(corpus_path, corpus_name)) as f:
        text = f.read()

    # Collapse every run of whitespace into one space and trim both ends.
    # The pattern is a raw string: "\s" in a plain literal is an invalid
    # escape sequence.
    text = re.sub(r"\s+", " ", text).strip()

    # NOTE (original author): this step has been seen to fail with too many
    # files in corpus_path or with long filenames.
    # For very large corpora, build the model with
    # markovify.Text(text, retain_original=False) instead.
    model = markovify.Text(text)
    model_json = model.to_json()

    chainfile = corpus_name + ".mkdch"
    # NOTE(review): model_json is the value returned by to_json(); it is passed
    # through json.dump unchanged to keep the existing on-disk format, so
    # readers need json.load() first -- confirm against the code that loads
    # these files.
    with open(os.path.join(chains_path, chainfile), "w") as outfile:
        json.dump(model_json, outfile)

    # Pause between corpus files (kept from the original script).
    time.sleep(5)