# Build part-of-speech-tagged Markov chain models (markovify + nltk) from a
# prose corpus and serialize each model as JSON into the chains directory.
import markovify
|
|
import nltk
|
|
import json
|
|
import re
|
|
import time
|
|
import os
|
|
import language_check
|
|
import string
|
|
|
|
class POSifiedText(markovify.Text):
    """Markov text model whose chain states carry part-of-speech tags.

    Tokens are stored internally as 'word::TAG' so the chain distinguishes
    homographs by their grammatical role; the tag is stripped on output.
    """

    def word_split(self, sentence):
        """Split *sentence* into tokens and suffix each with its POS tag.

        Returns a list of 'word::TAG' strings.
        """
        tokens = re.split(self.word_split_pattern, sentence)
        tagged = nltk.pos_tag(tokens)
        return ["::".join(pair) for pair in tagged]

    def word_join(self, words):
        """Rejoin 'word::TAG' tokens into a plain sentence, dropping the tags."""
        plain = [token.split("::")[0] for token in words]
        return " ".join(plain)
|
|
|
|
|
|
# Filesystem layout: raw corpus files ("*.std") live in corpus_path; the
# serialized chain files are written into chains_path (assumed to exist).
corpus_path = "./corpus/prose/"
chains_path = "./corpus/prose/chains/"

# Module-level placeholders. tool/matches look like leftovers for a
# language_check pass and combined_model for merging models — none are used
# below (TODO confirm and remove). Kept so any importer relying on these
# names keeps working; fixed the inconsistent `matches =None` spacing.
tool = None
matches = None
combined_model = None
model_json = None
|
|
|
|
# Build a Markov model from every ".std" file in the corpus directory and
# persist each model's JSON serialization as "<name>.std.mkdch" in chains_path.
for file in os.listdir(corpus_path):
    if not file.endswith(".std"):
        continue
    # Read the whole corpus file, then close it before the expensive model
    # build so the handle isn't held open longer than needed.
    with open(corpus_path + file) as f:
        text = f.read()
    # Collapse every run of whitespace to a single space and trim the ends.
    # Raw string r'\s+' avoids the invalid-escape DeprecationWarning that
    # '\s+' raises on modern Python; the matched pattern is identical.
    text = re.sub(r'\s+', ' ', text).strip()
    # NOTE(review): reportedly fails with too many files in corpus_path or
    # long filenames — confirm. For very large corpora,
    # markovify.Text(text, retain_original=False) is lighter on memory.
    model = markovify.Text(text)
    model_json = model.to_json()
    chainfile = file + ".mkdch"
    with open(chains_path + chainfile, 'w') as outfile:
        # NOTE(review): to_json() already returns a JSON *string*, so
        # json.dump() double-encodes it (the file holds a JSON-quoted
        # string). Readers must json.load() then Text.from_json(). Kept
        # as-is so existing ".mkdch" files stay readable; consider
        # outfile.write(model_json) if the format can change.
        json.dump(model_json, outfile)
    # NOTE(review): purpose of the pause is unclear — possibly throttling
    # disk/CPU between model builds; confirm before removing.
    time.sleep(5)
|