heebie jeebies

This commit is contained in:
terris Station 2019-11-14 23:50:15 -05:00
commit c83edd6d4b
9 changed files with 438 additions and 0 deletions

View File

@ -0,0 +1 @@
here will be the markdown chains generated by markchainer.py and used by mark8.py

3
corpus/prose/readme.md Normal file
View File

@ -0,0 +1,3 @@
# README
Here will be the source files already processed by stdtxt.sh, ready as filename.std for markchainer to generate models from.

160
mark8.py Normal file
View File

@ -0,0 +1,160 @@
import markovify
import nltk
import json
import re
import time
import os
import random
import language_check
import textwrap
# Modifying mark7.py to re-introduce language-tool to fix the sentences.
# it's not clear that it's actually doing anything! (╯°□°)╯︵ ┻━┻ ...¯\_(ツ)_/¯
# still to do - selectable corpuses
# expand corpus collection
# re-introduce cosmic.voyage corpus (after removing message headers and non alphanumerics?)
#
# notes:
# the time.sleep() lines below are there just to slow down the processing to keep
# the server burden below 100% on one or more CPUs. These were added, when the supernice
# alias just wasn't enough (in my opinion) to run on tilde.team without possibly affecting
# other users. The time.sleep() lines are not necessary, they are there for shared pubnix love.
#
# there are probably one or more import lines above that do not need to be there.
#
# the lines toward the end are messy and difficult to read. And they insert some
# indicators and numbers into the printed text that were added for debugging when
# language-check kept crashing if markovify would return a blank sentence and language-check
# doesn't like variables of type None. So the product of the markovify call was str()'d, and
# the call itself was placed in a while loop until it actually returned something that isn't 'None'
# an intermediate 'matches' variable was reintroduced.
# all that code is subject to change.
#
# --- module-level state shared by the generator functions below ---
tool = None  # language_check.LanguageTool instance; created after the models are loaded
text = None
matches = None
combined_model = None  # markovify model combined from every saved chain file on disk
model_json = None
couplets = random.randint(2,14) # use this var to generate a random number of couplets between X,Y
title = None # generate a title for the poem/song
wrapper = textwrap.TextWrapper(width=69, subsequent_indent=" ")  # hanging indent for wrapped verse lines
#def genHeaders():
# head1 = '''\
# [processing by miniVISR.autoComms @ relay station VHE-0j0-η ]
# [recv'd on 100Hz+-15Hz, 112.358Hz, 112.358KHz, 33.33MHz, 66.66MHz ]
# [radio 112.358MHz, 121.5MHz, 130.167MHz, 143.625MHz, 244.30MHz ]
# [freqs. 358.13MHz, 581.321MHz, 633.9MHz, 922.7MHz, 2113.853MHz ]
# [signals quality: poor. signals reconstruction confidence: 65% ]
# [message transcription relayed to Earthsys QEC for general delivery ]
# [begin transcript ]
# '''.format()
# head2 = '''\
#
# ...able, animal, and mineral,
# I know the skalds of old Solsys, and I quote the verse atypical
# Synechdoche and kenna til, disorder quite limerickal;
# I'm very well disposed as well, to matters a-esthetical,
# I sing love songs to moons both in circle and elliptical,
# About their craters and their phases with my lyrics I'm quite loose,
# Bragi's here in your system to sing, and to share whatever you choose.
# '''.format()
# return 0
#def genFooters():
# countid = str(125925)#{file open stuff - keep a counter in a file?}
#
# foot = '''\
# Bragi's here in your system to sing, and to share whatever you choose.
# Broadcasting on many frequencies every 86,400 seconds. Bragi-{ordinal}
# out.
#
# [end transcript ]
# END MESSAGE."
# '''.format(ordinal=countid)
# print()
# print(foot)
# return 0
def genTitle():
    """Build a one-to-three word title by sampling words from a freshly
    generated sentence, then trimming stray punctuation off the ends."""
    pool = str(combined_model.make_short_sentence(150, min_chars=80)).split()
    picked = [random.choice(pool) for _ in range(random.randint(1, 3))]
    return " ".join(picked).strip(' ,:"')
def genLongLine(minl, maxl):
    """
    give me a min length and a max length and i'll return a short
    sentence between those lengths, hopefully. and check it for grammar
    errors, and if some found, run it through language-check.correct()
    """
    candidate = None
    # keep asking the model until it produces something other than None
    while str(candidate) == 'None':
        candidate = str(combined_model.make_short_sentence(maxl, min_chars=minl))
        time.sleep(1)
    problems = tool.check(candidate)
    if problems:
        return str(" " + language_check.correct(candidate, problems))
    return str(" " + candidate)
def genShortLine(minl, maxl):
    """
    give me a min length and a max length and i'll return a short
    sentence between those lengths, hopefully. and check it for grammar
    errors, and if some found, run it through language-check.correct()

    This was a byte-for-byte duplicate of genLongLine(); it now simply
    delegates to it so the generation/grammar-check logic lives in one
    place. The name is kept because the caller uses the short/long pair
    to shape the couplets.
    """
    return genLongLine(minl, maxl)
# same basic structure as in markchainer.py, but this uses saved models
# (model generation procedure separate from verse generation procedure
# for shared pubnix love and) to be able to work with growing corpora
# this searches a fixed relative path for already-created models.
# reads the model(json) in, converts from json, combines the models
# together.
# Load every saved markov chain (*.mkdch JSON file) from the chains
# directory and fold them into one combined model.
for file in os.listdir("./corpus/prose/chains/"):
    if file.endswith(".mkdch"):
        with open("./corpus/prose/chains/" + file) as f:
            model = markovify.Text.from_json(json.load(f))
        time.sleep(5)  # throttle: shared-host courtesy, not required for correctness
        if combined_model:
            time.sleep(5)
            combined_model = markovify.combine(models=[combined_model, model])
        else:
            # first model loaded becomes the seed for combining
            combined_model = model
# Grammar checker used by genLongLine/genShortLine above.
tool = language_check.LanguageTool('en-US')
# disabling spellchecking, didn't like some of the 'fixes' from testing. using archaic language in testing, will use novel words in the future. to re-enable spellchecking, comment out the following line:
tool.disable_spellchecking()
print("~*~") # This start delimiter is here for testing each run
# genHeaders() #maybe do this in the future
print(" ")
#print("title: " + genTitle())
print(genTitle())
print(" ")
#print("couplets: " + str(couplets))
# Emit the poem body: each couplet is one long line plus one short line,
# word-wrapped to 69 columns by the module-level TextWrapper.
for x in range(couplets):
    print(wrapper.fill(genLongLine(30,80)))
    print(wrapper.fill(genShortLine(20,56)))
    # print(wrapper.fill(" l--" + genLongLine(30,80)))
    # print(wrapper.fill(" s--" + genShortLine(20,56)))
print(" ")
print("~!~") # this ending delimiter is here for testing each run
# genFooters() #maybe do this in the future

42
markchainer.py Normal file
View File

@ -0,0 +1,42 @@
import markovify
import nltk
import json
import re
import time
import os
import language_check
import string
class POSifiedText(markovify.Text):
    """markovify.Text subclass whose model state tags each word with its
    part of speech ("word::TAG"), so chains are built over (word, POS)
    pairs instead of bare words."""

    def word_split(self, sentence):
        tagged = nltk.pos_tag(re.split(self.word_split_pattern, sentence))
        return ["::".join(pair) for pair in tagged]

    def word_join(self, words):
        # Strip the POS tag back off each token when rebuilding a sentence.
        return " ".join(token.split("::")[0] for token in words)
# Paths: .std files (already cleaned by stdtxt.sh) go in, .mkdch JSON
# chain files come out.
corpus_path = "./corpus/prose/"
chains_path = "./corpus/prose/chains/"
tool = None
matches = None
combined_model = None
model_json = None
# Build one markov model per preprocessed corpus file and persist it as
# JSON so mark8.py can recombine them later without re-reading the corpus.
for file in os.listdir(corpus_path):
    if file.endswith(".std"):
        with open(corpus_path + file) as f:
            text = f.read()
        # Collapse all whitespace runs to single spaces and trim the ends.
        # Raw string r'\s+' avoids the invalid-escape-sequence warning that
        # the bare '\s+' literal triggers on modern Python.
        text = re.sub(r'\s+', ' ', text).strip()
        model = markovify.Text(text)  # this fails here with too many files in the corpus_path or long filenames.
        # model = markovify.Text(text, retain_original=False) # use this one for very large corpora
        model_json = model.to_json()
        chainfile = file + ".mkdch"
        with open(chains_path + chainfile, 'w') as outfile:
            json.dump(model_json, outfile)
        time.sleep(5)  # throttle: shared-host courtesy, not required for correctness

53
readme.md Normal file
View File

@ -0,0 +1,53 @@
```
* * * * * * * * * * * * * * * * * * * * * * * *
* ___ _ __ _____ ___ *
* / _ )_______ ____ _(_) / |/ / /__ ( _ ) *
* / _ / __/ _ `/ _ `/ / / /|_/ / '_// _ | *
* /____/_/ \_,_/\_, /_/ /_/ /_/_/\_(_)___/ *
* /___/ *
* * * * * * * * * * * * * * * * * * * * * * * *
```
This is a project (work in progress) to generate verses
that look like poetry (using markov chains) for the ship
stjörnuvagn Bragi on https://cosmic.voyage/
It is presented here without text-sources or markdown chains
because you can get the sources from project gutenberg like
I did. And besides, I don't want to distribute gutenberg
texts without the license verbiage (I had to remove it before
generating the models). Support Project Gutenberg! Great
old texts are not just for mining, they are also for reading.
https://www.gutenberg.org/
# Requirements
> I did all this in a virtualenv, and installed the following packages with pip3:
> * [markovify](https://github.com/jsvine/markovify)
> * nltk
> * [language-check](https://github.com/myint/language-check) - installs languagetool, which requires java
# Included
* mark8.py - the main generator proof of concept
* markchainer.py - generates models from text files already processed by:
* stdtxt.sh - sed pipeline to clean up the text (numbers, blank lines, underscores, brackets)
* samples/mark8test.txt - rough-looking samples produced by rough-looking code during debugging.
# Procedure
1. download some large textfiles from project gutenberg or from https://www.archive.org
1a. alternatively build your own large corpus through other means (web scraping, download corpora archives, etc.)
2. trim each text file as needed, so it contains the kinds of things you want to generate text from
3. use iconv or other means to make sure the texts are all of the same kind of encoding. (utf-8, ascii were tested)
4. use stdtxt.sh on the main input files. this should produce something like inputfile.txt.std
5. supernice python3 markchainer.py (this will look in './corpus/prose/' for *.std files, and generate a model for each; the output will be found in './corpus/prose/chains' called something like inputfile.txt.std.mkdch)
6. supernice python3 mark8.py >> output.txt
* supernice is just a bash alias:
```
alias supernice='nice -n 19 ionice -c 3'
```
...to help reduce load on the server from running this toy. the markovify package (esp. when using nltk stuff) can consume a lot of resources (especially when combined with language-check/LanguageTool!) so the python scripts were slowed down even more using time.sleep().

131
samples/mark8sample.txt Normal file
View File

@ -0,0 +1,131 @@
Linux 4.15.0-54-generic (tilde) 11/14/2019 _x86_64_ (6 CPU)
11:01:24 PM UID PID %usr %system %guest %wait %CPU CPU Command
11:01:26 PM xxxx 9743 11.5% 2.0% 0.0% 0.0% 13.5% 5 python3
11:01:28 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
11:01:30 PM xxxx 9743 16.5% 3.0% 0.0% 0.0% 19.5% 5 python3
11:01:32 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
11:01:34 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
11:01:36 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
11:01:38 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
11:01:40 PM xxxx 9743 10.0% 2.5% 0.0% 0.0% 12.5% 4 python3
11:01:42 PM xxxx 9743 45.0% 6.5% 0.0% 0.5% 51.5% 1 python3
11:01:44 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
11:01:46 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
11:01:48 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
11:01:50 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
11:01:52 PM xxxx 9743 28.5% 2.0% 0.0% 0.5% 30.5% 3 python3
11:01:54 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 3 python3
11:01:56 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 3 python3
11:01:58 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 4 python3
11:02:00 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 4 python3
11:02:02 PM xxxx 9743 9.0% 0.0% 0.0% 4.0% 9.0% 0 python3
11:02:04 PM xxxx 9743 20.0% 0.5% 0.0% 2.5% 20.5% 2 python3
11:02:06 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 2 python3
11:02:08 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 2 python3
11:02:10 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 2 python3
11:02:12 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 2 python3
11:02:14 PM xxxx 9743 33.5% 1.5% 0.0% 0.5% 35.0% 4 python3
11:02:16 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 4 python3
11:02:18 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 4 python3
11:02:20 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
11:02:22 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
11:02:24 PM xxxx 9743 22.5% 1.5% 0.0% 0.0% 24.0% 5 python3
11:02:26 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
11:02:28 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
11:02:30 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 0 python3
11:02:32 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 0 python3
11:02:34 PM xxxx 9743 9.0% 0.0% 0.0% 1.0% 9.0% 2 python3
11:02:36 PM xxxx 9743 16.0% 1.5% 0.0% 0.0% 17.5% 0 python3
11:02:38 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 0 python3
11:02:40 PM xxxx 9743 1.5% 0.0% 0.0% 0.0% 1.5% 3 python3
11:02:42 PM xxxx 9743 6.0% 0.0% 0.0% 0.5% 6.0% 0 python3
11:02:44 PM xxxx 9743 2.0% 0.5% 0.0% 0.0% 2.5% 1 python3
11:02:46 PM xxxx 9743 16.5% 0.0% 0.0% 0.5% 16.5% 0 python3
11:02:48 PM xxxx 9743 0.5% 0.0% 0.0% 0.0% 0.5% 3 python3
11:02:50 PM xxxx 9743 5.0% 0.0% 0.0% 0.0% 5.0% 1 python3
11:02:52 PM xxxx 9743 4.5% 0.0% 0.0% 0.5% 4.5% 1 python3
~*~
title: Yes me
couplets: 7
l-- They came up to the problem, or he will advise us for all that
was Friday.
s-- We ourselves want to do with violence.
l-- I had taken her away from the carriage rattled away.
s-- The Most Happy must be simple, precise, terse.
l-- Within a radius of 100 meters unless they take a few hundred
yards.
s-- Of course, it is not yet scientifically formulated.
l-- He raised her veil as she could, for the abatement of rashes.
s-- The night was cold under my disguise.
l-- At the second letter of mine, and James acted as agent.
s-- It was not accustomed to succeed with them.
l-- We are in a word--and it shall be able to take me back on
Ganymede.
s-- We had no difficulty about this.
l-- It's not quite the same terrible winter.
s-- I fear that he had called.
~!~
Average: xxxx 9743 5.9% 0.5% 0.0% 0.2% 6.3% - python3
(.env) terris@tilde:~/genwrite$ pidstat --human 2 -e nice -n 19 ionice -c 3 python3 mark8.py >> samples/mark8sample.txt
(.env) terris@tilde:~/genwrite$ supernice python3 mark8.py >> samples/mark8sample.txt
~*~
gazing
As we stepped into the lake a bag and went downstairs.
The whole incident left a trace of her husband.
For example, I began by using his machinery in motion than that of
the darkness.
She tore from the crater, of the prehistoric people.
~!~
~*~
outside
Well I knew from his writing is a man in England are getting £100 a
year.
On the night at 16, Godolphin Street, Westminster.
He stole in and nowhere to hide.
A further knowledge of the art.
I then endeavored to convey the final discussion of his daily
journey.
He was a letter in a late visit to Birmingham.
~!~
~*~
your is
Well, Watson, you have one sorrow and untimely death.
What's the matter up.
Then on the phone Hook I'm back in a gold coronet.
Her age was not literally true.
At the far end, with a provision that a party Yes we're going for a
while.
D. It is I, McMurdo.
She then called for such combinations of events is pretty basic, and
guess what?
Next day I am right or I am a dangerous rival.
Since then I suddenly felt that he was a prank--upon me.
My wife came out he gave a shrill whine.
By means of a good situation for anyone.
So now, my dear Mr. Mac.
Anyway, if you look at it this evening.
And we may expect us early in the blackest shade.
We are glad to go out together and chuckled.
We had tried to force her confidence.
It has been drifting by about half.
There were several other letters.
~!~

21
sedtest.txt Normal file
View File

@ -0,0 +1,21 @@
234.
115:
Linux
3344:
Solaris
41. How I left
Ubuntu
(parenthetical)
[bracketed]
_underscored_
{braced}
55873
Fedora
159.
RedHat
Junk
1223,
Gunk
stones

14
sedtest.txt.std Normal file
View File

@ -0,0 +1,14 @@
Linux
Solaris
How I left
Ubuntu
parenthetical
bracketed
underscored
braced
55873
Fedora
RedHat
Junk
Gunk
stones

13
stdtxt.sh Executable file
View File

@ -0,0 +1,13 @@
#!/bin/bash
# Clean a plain-text corpus file so markchainer.py can build models from it:
#  - strip leading numbers followed by punctuation (with or without leading whitespace)
#  - strip brackets, braces, parentheses and underscores
#  - drop blank lines
# Output is appended to "<filename>.std".
# usage: 'stdtxt.sh filename'
# still need to figure out how to also do iconv (to us-ascii? to utf-8?)
# NOTE: "$1" is quoted so filenames containing spaces (or other shell
# metacharacters) are handled correctly.
sed 's/^[0-9]*[[:punct:]]//' "$1" \
  | sed 's/^[[:space:]]*[0-9]*[[:punct:]]//' \
  | sed 's/[][}{)(_]//g' \
  | sed '/^ *$/d' >> "$1.std"