bragi/markchainer.py

43 lines
1.3 KiB
Python

import markovify
import nltk
import json
import re
import time
import os
import language_check
import string
class POSifiedText(markovify.Text):
    """markovify text model whose tokens carry part-of-speech tags.

    Every word is represented as ``"<word>::<tag>"``, with the tag taken
    from ``nltk.pos_tag``. The tags are dropped again when a list of tokens
    is turned back into a plain sentence.
    """

    def word_split(self, sentence):
        """Split *sentence* into a list of ``"word::TAG"`` tokens.

        The sentence is cut with ``self.word_split_pattern`` and each piece
        is joined to its NLTK tag using ``"::"`` as the separator.
        """
        tokens = re.split(self.word_split_pattern, sentence)
        tagged = nltk.pos_tag(tokens)
        return ["::".join(pair) for pair in tagged]

    def word_join(self, words):
        """Rebuild a plain sentence from ``"word::TAG"`` tokens.

        Only the text before the first ``"::"`` of each token is kept; the
        pieces are joined with single spaces.
        """
        plain = [token.partition("::")[0] for token in words]
        return " ".join(plain)
# Build one markovify chain per ".std" corpus file and save each chain under
# chains_path as "<source file name>.mkdch".
corpus_path = "./corpus/prose/"
chains_path = "./corpus/prose/chains/"

# These placeholders are not used in this part of the script; they are kept
# because other code may rely on the module-level names existing.
tool = None
matches = None
combined_model = None
model_json = None

# List the corpus first so that a missing corpus directory still raises here,
# then make sure the output directory exists before any model is built.
corpus_files = os.listdir(corpus_path)
os.makedirs(chains_path, exist_ok=True)

for file in corpus_files:
    if not file.endswith(".std"):
        continue

    # Only the read needs the handle; close it before the expensive work.
    with open(os.path.join(corpus_path, file)) as f:
        text = f.read()

    # Collapse every run of whitespace to one space and trim both ends.
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text).strip()

    # NOTE(review): the original author reported failures at this call with
    # many corpus files or long file names; cause not visible here.
    # For very large corpora, markovify.Text(text, retain_original=False)
    # was suggested as the alternative.
    model = markovify.Text(text)

    # NOTE(review): to_json() appears to return a JSON string already, so
    # json.dump() stores it as a JSON-encoded string. Left unchanged so that
    # existing readers of the .mkdch files keep working.
    model_json = model.to_json()
    chainfile = file + ".mkdch"
    with open(os.path.join(chains_path, chainfile), 'w') as outfile:
        json.dump(model_json, outfile)

    # Pause between files; kept from the original (purpose not documented).
    time.sleep(5)