heebie jeebies
This commit is contained in:
commit
c83edd6d4b
|
@ -0,0 +1 @@
|
|||
Here will be the Markov chains (saved models) generated by markchainer.py and used by mark8.py
|
|
@ -0,0 +1,3 @@
|
|||
# README
|
||||
|
||||
Here will be the source files already processed by stdtxt.sh, ready as filename.std for markchainer to generate models from.
|
|
@ -0,0 +1,160 @@
|
|||
import markovify
|
||||
import nltk
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import os
|
||||
import random
|
||||
import language_check
|
||||
import textwrap
|
||||
|
||||
# Modifying mark7.py to re-introduce language-tool to fix the sentences.
|
||||
# it's not clear that it's actually doing anything! (╯°□°)╯︵ ┻━┻ ...¯\_(ツ)_/¯
|
||||
# still to do - selectable corpuses
|
||||
# expand corpus collection
|
||||
# re-introduce cosmic.voyage corpus (after removing message headers and non alphanumerics?)
|
||||
#
|
||||
# notes:
|
||||
# the time.sleep() lines below are there just to slow down the processing to keep
|
||||
# the server burden below 100% on one or more CPUs. These were added, when the supernice
|
||||
# alias just wasn't enough (in my opinion) to run on tilde.team without possibly affecting
|
||||
# other users. The time.sleep() lines are not necessary, they are there for shared pubnix love.
|
||||
#
|
||||
# there are probably one or more import lines above that do not need to be there.
|
||||
#
|
||||
# the lines toward the end are messy and difficult to read. And they insert some
|
||||
# indicators and numbers into the printed text that were added for debugging when
|
||||
# language-check kept crashing if markovify would return a blank sentence and language-check
|
||||
# doesn't like variables of type None. So the product of the markovify call was str()'d, and
|
||||
# the call itself was placed in a while loop until it actually returned something that isn't 'None'
|
||||
# an intermediate 'matches' variable was reintroduced.
|
||||
# all that code is subject to change.
|
||||
#
|
||||
|
||||
# Module-level placeholders. `combined_model` and `tool` receive their real
# values further down, after the saved chains are loaded and LanguageTool is
# started; the others are kept as empty slots.
tool = text = matches = combined_model = model_json = None

# How many couplets this run prints: one random draw between 2 and 14.
couplets = random.randint(2, 14)

# Slot for a generated poem/song title.
title = None

# Printed lines are wrapped at 69 columns; wrapped continuation lines are
# prefixed with the indent string below.
wrapper = textwrap.TextWrapper(subsequent_indent=" ", width=69)
|
||||
|
||||
#def genHeaders():
|
||||
# head1 = '''\
|
||||
# [processing by miniVISR.autoComms @ relay station VHE-0j0-η ]
|
||||
# [recv'd on 100Hz+-15Hz, 112.358Hz, 112.358KHz, 33.33MHz, 66.66MHz ]
|
||||
# [radio 112.358MHz, 121.5MHz, 130.167MHz, 143.625MHz, 244.30MHz ]
|
||||
# [freqs. 358.13MHz, 581.321MHz, 633.9MHz, 922.7MHz, 2113.853MHz ]
|
||||
# [signals quality: poor. signals reconstruction confidence: 65% ]
|
||||
# [message transcription relayed to Earthsys QEC for general delivery ]
|
||||
# [begin transcript ]
|
||||
# '''.format()
|
||||
# head2 = '''\
|
||||
#
|
||||
# ...able, animal, and mineral,
|
||||
# I know the skalds of old Solsys, and I quote the verse atypical
|
||||
# Synechdoche and kenna til, disorder quite limerickal;
|
||||
# I'm very well disposed as well, to matters a-esthetical,
|
||||
# I sing love songs to moons both in circle and elliptical,
|
||||
# About their craters and their phases with my lyrics I'm quite loose,
|
||||
# Bragi's here in your system to sing, and to share whatever you choose.
|
||||
# '''.format()
|
||||
# return 0
|
||||
|
||||
#def genFooters():
|
||||
# countid = str(125925)#{file open stuff - keep a counter in a file?}
|
||||
#
|
||||
# foot = '''\
|
||||
# Bragi's here in your system to sing, and to share whatever you choose.
|
||||
# Broadcasting on many frequencies every 86,400 seconds. Bragi-{ordinal}
|
||||
# out.
|
||||
#
|
||||
# [end transcript ]
|
||||
# END MESSAGE."
|
||||
# '''.format(ordinal=countid)
|
||||
# print()
|
||||
# print(foot)
|
||||
|
||||
# return 0
|
||||
|
||||
def genTitle():
    """
    build a title out of 1 to 3 words picked at random from one generated
    sentence (80 to 150 characters long) of the combined model.

    markovify can return None when it fails to build a sentence; the old
    code str()'d that result, which made the title the literal word "None".
    Now the call is retried (with a one second pause, for shared pubnix
    love) until a real sentence comes back.

    returns the title with spaces, commas, colons and double quotes
    stripped from both ends.
    """
    sentence = combined_model.make_short_sentence(150, min_chars=80)
    while sentence is None:
        time.sleep(1)
        sentence = combined_model.make_short_sentence(150, min_chars=80)

    word_list = sentence.split()
    # the word count is drawn first, then that many words are chosen
    # (repeats are possible)
    wtitle = " ".join(
        random.choice(word_list) for _ in range(random.randint(1, 3))
    )
    return wtitle.strip(' ,:"')
|
||||
|
||||
def genLongLine(minl, maxl):
    """
    give me a min length and a max length and i'll return a short
    sentence between those lengths, hopefully. and check it for grammar
    errors, and if some are found, run it through language_check.correct()

    minl -- minimum number of characters (passed as min_chars)
    maxl -- maximum number of characters
    returns the (possibly corrected) sentence with one leading space.
    """
    sentence = None
    # markovify hands back None when it cannot build a sentence, and
    # language-check cannot handle None, so keep asking until we get text.
    while sentence is None:
        sentence = combined_model.make_short_sentence(maxl, min_chars=minl)
        time.sleep(1)  # slow each attempt down for shared pubnix love

    matches = tool.check(sentence)
    if matches:
        # at least one rule matched: apply LanguageTool's suggestions
        sentence = language_check.correct(sentence, matches)
    return " " + sentence
|
||||
|
||||
def genShortLine(minl, maxl):
    """
    give me a min length and a max length and i'll return a short
    sentence between those lengths, hopefully. and check it for grammar
    errors, and if some are found, run it through language_check.correct()

    minl -- minimum number of characters
    maxl -- maximum number of characters
    returns the (possibly corrected) sentence with one leading space.

    this was a line-for-line copy of genLongLine(); only the lengths the
    caller passes in differ, so it now simply delegates to genLongLine()
    instead of keeping a second copy of the same code in sync.
    """
    return genLongLine(minl, maxl)
|
||||
|
||||
# same basic structure as in markchainer.py, but this uses saved models
|
||||
# (model generation procedure separate from verse generation procedure
|
||||
# for shared pubnix love and) to be able to work with growing corpora
|
||||
# this searches a fixed relative path for already-created models.
|
||||
# reads the model(json) in, converts from json, combines the models
|
||||
# together.
|
||||
# Read every saved chain (*.mkdch) from the fixed relative chains directory
# and fold them, one at a time, into a single combined model.
chains_dir = "./corpus/prose/chains/"
for file in os.listdir(chains_dir):
    if not file.endswith(".mkdch"):
        continue  # skip anything that is not a saved chain (e.g. the README)
    with open(os.path.join(chains_dir, file)) as f:
        # markchainer.py json.dump()s the string returned by to_json(), so
        # json.load() gives that string back and from_json() parses it.
        model = markovify.Text.from_json(json.load(f))
    time.sleep(5)  # pause between files, for shared pubnix love
    if combined_model is None:
        # first chain found: nothing to combine with yet
        combined_model = model
    else:
        time.sleep(5)
        combined_model = markovify.combine(models=[combined_model, model])

# Without at least one chain there is no model to generate from; stop here
# with a readable message instead of failing later on a None model.
if combined_model is None:
    raise SystemExit(
        "no .mkdch chains found in ./corpus/prose/chains/ - "
        "run markchainer.py first"
    )
|
||||
|
||||
# Start LanguageTool for US English. Spellchecking is disabled because some
# of its 'fixes' during testing were not liked (archaic language was used in
# testing, novel words will be used in the future). To re-enable
# spellchecking, comment out the disable_spellchecking() call.
tool = language_check.LanguageTool('en-US')
tool.disable_spellchecking()

print("~*~")  # start delimiter, here for testing each run
# genHeaders()  # maybe do this in the future
print(" ")
# print("title: " + genTitle())
print(genTitle())
print(" ")
# print("couplets: " + str(couplets))

# Each couplet is one long line (30-80 chars) followed by one short line
# (20-56 chars), both wrapped before printing.
for _ in range(couplets):
    for make_line, (shortest, longest) in (
        (genLongLine, (30, 80)),
        (genShortLine, (20, 56)),
    ):
        print(wrapper.fill(make_line(shortest, longest)))
    # debugging variants that tag each line with its kind:
    # print(wrapper.fill(" l--" + genLongLine(30,80)))
    # print(wrapper.fill(" s--" + genShortLine(20,56)))

print(" ")
print("~!~")  # end delimiter, here for testing each run
# genFooters()  # maybe do this in the future
|
|
@ -0,0 +1,42 @@
|
|||
import markovify
|
||||
import nltk
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import os
|
||||
import language_check
|
||||
import string
|
||||
|
||||
class POSifiedText(markovify.Text):
    """
    markovify.Text variant that works on part-of-speech tagged words.

    Words are stored as "word::TAG" (tags come from nltk.pos_tag) and the
    tags are removed again when a sentence is put back together.
    NOTE(review): nothing in this script instantiates the class; the models
    below are built with plain markovify.Text.
    """

    def word_split(self, sentence):
        """Split the sentence into words and return them as "word::TAG"."""
        tokens = re.split(self.word_split_pattern, sentence)
        tagged = []
        for token, pos in nltk.pos_tag(tokens):
            tagged.append(token + "::" + pos)
        return tagged

    def word_join(self, words):
        """Join "word::TAG" items into a sentence, keeping only the words."""
        # everything from the first "::" onward is dropped from each item
        bare_words = [item.split("::")[0] for item in words]
        return " ".join(bare_words)
|
||||
|
||||
|
||||
# Where the cleaned-up source texts (*.std) are read from, and where the
# generated chains (*.mkdch) are written to.
corpus_path = "./corpus/prose/"
chains_path = "./corpus/prose/chains/"

tool = None
matches = None
combined_model = None
model_json = None

# Build one model per *.std file and save it as <filename>.mkdch.
for file in os.listdir(corpus_path):
    if not file.endswith(".std"):
        continue
    with open(os.path.join(corpus_path, file)) as f:
        text = f.read()
    # collapse every run of whitespace (newlines included) into one space
    # and trim both ends; raw string so that \s is not an invalid escape
    text = re.sub(r'\s+', ' ', text).strip()
    # author's note: this has failed here with too many files in the
    # corpus_path or long filenames.
    model = markovify.Text(text)
    # model = markovify.Text(text, retain_original=False)  # use this one for very large corpora
    model_json = model.to_json()
    chainfile = file + ".mkdch"
    with open(os.path.join(chains_path, chainfile), 'w') as outfile:
        # to_json() already returns a JSON string, so this stores it as a
        # JSON-encoded string; mark8.py json.load()s it back to that string
        # before calling from_json(). Keep both sides in step.
        json.dump(model_json, outfile)
    time.sleep(5)  # pause between files, for shared pubnix love
|
|
@ -0,0 +1,53 @@
|
|||
```
|
||||
* * * * * * * * * * * * * * * * * * * * * * * *
|
||||
* ___ _ __ _____ ___ *
|
||||
* / _ )_______ ____ _(_) / |/ / /__ ( _ ) *
|
||||
* / _ / __/ _ `/ _ `/ / / /|_/ / '_// _ | *
|
||||
* /____/_/ \_,_/\_, /_/ /_/ /_/_/\_(_)___/ *
|
||||
* /___/ *
|
||||
* * * * * * * * * * * * * * * * * * * * * * * *
|
||||
```
|
||||
|
||||
This is a project (work in progress) to generate verses
|
||||
that look like poetry (using markov chains) for the ship
|
||||
stjörnuvagn Bragi on https://cosmic.voyage/
|
||||
|
||||
It is presented here without text-sources or Markov chains
|
||||
because you can get the sources from project gutenberg like
|
||||
I did. And besides, I don't want to distribute gutenberg
|
||||
texts without the license verbiage (I had to remove it before
|
||||
generating the models). Support Project Gutenberg! Great
|
||||
old texts are not just for mining, they are also for reading.
|
||||
https://www.gutenberg.org/
|
||||
|
||||
# Requirements
|
||||
|
||||
> I did all this in a virtualenv, and installed the following packages with pip3:
|
||||
> * [markovify](https://github.com/jsvine/markovify)
|
||||
> * nltk
|
||||
> * [language-check](https://github.com/myint/language-check) - installs languagetool, which requires java
|
||||
|
||||
# Included
|
||||
|
||||
* mark8.py - the main generator proof of concept
|
||||
* markchainer.py - generates models from text files already processed by:
|
||||
* stdtxt.sh - sed pipeline to clean up the text (numbers, blank lines, underscores, brackets)
|
||||
* samples/mark8test.txt - rough-looking samples produced by rough-looking code during debugging.
|
||||
|
||||
# Procedure
|
||||
|
||||
1. download some large textfiles from project gutenberg or from https://www.archive.org
|
||||
1a. alternatively build your own large corpus through other means (web scraping, download corpora archives, etc.)
|
||||
2. trim each text file as needed, so it contains the kinds of things you want to generate text from
|
||||
3. use iconv or other means to make sure the texts are all of the same kind of encoding. (utf-8, ascii were tested)
|
||||
4. use stdtxt.sh on the main input files. this should produce something like inputfile.txt.std
|
||||
5. supernice python3 markchainer.py (this will look in './corpus/prose/' for *.std files and generate a model for each; the models will be found in './corpus/prose/chains', called something like inputfile.txt.std.mkdch)
|
||||
6. supernice python3 mark8.py >> output.txt
|
||||
|
||||
* supernice is just a bash alias:
|
||||
```
|
||||
alias supernice='nice -n 19 ionice -c 3'
|
||||
```
|
||||
...to help reduce load on the server from running this toy. the markovify package (esp. when using nltk stuff) can consume a lot of resources (especially when combined with language-check/LanguageTool!) so the python scripts were slowed down even more using time.sleep().
|
||||
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
Linux 4.15.0-54-generic (tilde) 11/14/2019 _x86_64_ (6 CPU)
|
||||
|
||||
11:01:24 PM UID PID %usr %system %guest %wait %CPU CPU Command
|
||||
11:01:26 PM xxxx 9743 11.5% 2.0% 0.0% 0.0% 13.5% 5 python3
|
||||
11:01:28 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
|
||||
11:01:30 PM xxxx 9743 16.5% 3.0% 0.0% 0.0% 19.5% 5 python3
|
||||
11:01:32 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
|
||||
11:01:34 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
|
||||
11:01:36 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
|
||||
11:01:38 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
|
||||
11:01:40 PM xxxx 9743 10.0% 2.5% 0.0% 0.0% 12.5% 4 python3
|
||||
11:01:42 PM xxxx 9743 45.0% 6.5% 0.0% 0.5% 51.5% 1 python3
|
||||
11:01:44 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
|
||||
11:01:46 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
|
||||
11:01:48 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
|
||||
11:01:50 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
|
||||
11:01:52 PM xxxx 9743 28.5% 2.0% 0.0% 0.5% 30.5% 3 python3
|
||||
11:01:54 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 3 python3
|
||||
11:01:56 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 3 python3
|
||||
11:01:58 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 4 python3
|
||||
11:02:00 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 4 python3
|
||||
11:02:02 PM xxxx 9743 9.0% 0.0% 0.0% 4.0% 9.0% 0 python3
|
||||
11:02:04 PM xxxx 9743 20.0% 0.5% 0.0% 2.5% 20.5% 2 python3
|
||||
11:02:06 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 2 python3
|
||||
11:02:08 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 2 python3
|
||||
11:02:10 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 2 python3
|
||||
11:02:12 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 2 python3
|
||||
11:02:14 PM xxxx 9743 33.5% 1.5% 0.0% 0.5% 35.0% 4 python3
|
||||
11:02:16 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 4 python3
|
||||
11:02:18 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 4 python3
|
||||
11:02:20 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
|
||||
11:02:22 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
|
||||
11:02:24 PM xxxx 9743 22.5% 1.5% 0.0% 0.0% 24.0% 5 python3
|
||||
11:02:26 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
|
||||
11:02:28 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
|
||||
11:02:30 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 0 python3
|
||||
11:02:32 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 0 python3
|
||||
11:02:34 PM xxxx 9743 9.0% 0.0% 0.0% 1.0% 9.0% 2 python3
|
||||
11:02:36 PM xxxx 9743 16.0% 1.5% 0.0% 0.0% 17.5% 0 python3
|
||||
11:02:38 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 0 python3
|
||||
11:02:40 PM xxxx 9743 1.5% 0.0% 0.0% 0.0% 1.5% 3 python3
|
||||
11:02:42 PM xxxx 9743 6.0% 0.0% 0.0% 0.5% 6.0% 0 python3
|
||||
11:02:44 PM xxxx 9743 2.0% 0.5% 0.0% 0.0% 2.5% 1 python3
|
||||
11:02:46 PM xxxx 9743 16.5% 0.0% 0.0% 0.5% 16.5% 0 python3
|
||||
11:02:48 PM xxxx 9743 0.5% 0.0% 0.0% 0.0% 0.5% 3 python3
|
||||
11:02:50 PM xxxx 9743 5.0% 0.0% 0.0% 0.0% 5.0% 1 python3
|
||||
11:02:52 PM xxxx 9743 4.5% 0.0% 0.0% 0.5% 4.5% 1 python3
|
||||
~*~
|
||||
|
||||
title: Yes me
|
||||
|
||||
couplets: 7
|
||||
l-- They came up to the problem, or he will advise us for all that
|
||||
was Friday.
|
||||
s-- We ourselves want to do with violence.
|
||||
l-- I had taken her away from the carriage rattled away.
|
||||
s-- The Most Happy must be simple, precise, terse.
|
||||
l-- Within a radius of 100 meters unless they take a few hundred
|
||||
yards.
|
||||
s-- Of course, it is not yet scientifically formulated.
|
||||
l-- He raised her veil as she could, for the abatement of rashes.
|
||||
s-- The night was cold under my disguise.
|
||||
l-- At the second letter of mine, and James acted as agent.
|
||||
s-- It was not accustomed to succeed with them.
|
||||
l-- We are in a word--and it shall be able to take me back on
|
||||
Ganymede.
|
||||
s-- We had no difficulty about this.
|
||||
l-- It's not quite the same terrible winter.
|
||||
s-- I fear that he had called.
|
||||
|
||||
~!~
|
||||
|
||||
Average: xxxx 9743 5.9% 0.5% 0.0% 0.2% 6.3% - python3
|
||||
|
||||
(.env) terris@tilde:~/genwrite$ pidstat --human 2 -e nice -n 19 ionice -c 3 python3 mark8.py >> samples/mark8sample.txt
|
||||
|
||||
|
||||
|
||||
(.env) terris@tilde:~/genwrite$ supernice python3 mark8.py >> samples/mark8sample.txt
|
||||
|
||||
~*~
|
||||
|
||||
gazing
|
||||
|
||||
As we stepped into the lake a bag and went downstairs.
|
||||
The whole incident left a trace of her husband.
|
||||
For example, I began by using his machinery in motion than that of
|
||||
the darkness.
|
||||
She tore from the crater, of the prehistoric people.
|
||||
|
||||
~!~
|
||||
~*~
|
||||
|
||||
outside
|
||||
|
||||
Well I knew from his writing is a man in England are getting £100 a
|
||||
year.
|
||||
On the night at 16, Godolphin Street, Westminster.
|
||||
He stole in and nowhere to hide.
|
||||
A further knowledge of the art.
|
||||
I then endeavored to convey the final discussion of his daily
|
||||
journey.
|
||||
He was a letter in a late visit to Birmingham.
|
||||
|
||||
~!~
|
||||
~*~
|
||||
|
||||
your is
|
||||
|
||||
Well, Watson, you have one sorrow and untimely death.
|
||||
What's the matter up.
|
||||
Then on the phone Hook I'm back in a gold coronet.
|
||||
Her age was not literally true.
|
||||
At the far end, with a provision that a party Yes we're going for a
|
||||
while.
|
||||
D. It is I, McMurdo.
|
||||
She then called for such combinations of events is pretty basic, and
|
||||
guess what?
|
||||
Next day I am right or I am a dangerous rival.
|
||||
Since then I suddenly felt that he was a prank--upon me.
|
||||
My wife came out he gave a shrill whine.
|
||||
By means of a good situation for anyone.
|
||||
So now, my dear Mr. Mac.
|
||||
Anyway, if you look at it this evening.
|
||||
And we may expect us early in the blackest shade.
|
||||
We are glad to go out together and chuckled.
|
||||
We had tried to force her confidence.
|
||||
It has been drifting by about half.
|
||||
There were several other letters.
|
||||
|
||||
~!~
|
|
@ -0,0 +1,21 @@
|
|||
234.
|
||||
115:
|
||||
Linux
|
||||
3344:
|
||||
Solaris
|
||||
41. How I left
|
||||
Ubuntu
|
||||
(parenthetical)
|
||||
[bracketed]
|
||||
_underscored_
|
||||
{braced}
|
||||
55873
|
||||
Fedora
|
||||
159.
|
||||
RedHat
|
||||
|
||||
Junk
|
||||
1223,
|
||||
|
||||
Gunk
|
||||
stones
|
|
@ -0,0 +1,14 @@
|
|||
Linux
|
||||
Solaris
|
||||
How I left
|
||||
Ubuntu
|
||||
parenthetical
|
||||
bracketed
|
||||
underscored
|
||||
braced
|
||||
55873
|
||||
Fedora
|
||||
RedHat
|
||||
Junk
|
||||
Gunk
|
||||
stones
|
|
@ -0,0 +1,13 @@
|
|||
#!/bin/bash
# Cleans up a text file and appends the result to "<filename>.std".
# No longer removes all punctuation. For each line it:
#   1. removes leading digits (if any) plus the one punctuation character
#      that follows them, at the very start of the line
#   2. does the same again, this time allowing leading whitespace
#   3. removes brackets, braces, parentheses and underscores everywhere
#   4. drops lines that are empty or contain only spaces
# still need to figure out how to also do iconv (to us-ascii? to utf-8?)
# usage: 'stdtxt.sh filename'
#
# earlier versions, kept for reference:
#tr -d [:punct:] < $1 | sed '/^\s*$/d' >> "$1.nopunct"
#sed 's/^[0-9]*[[:punct:]]//' $1 | sed 's/^ *[0-9]*[[:punct:]]//' | sed '/^ *$/d' >> "$1.std"

if [ "$#" -lt 1 ]; then
    echo "usage: stdtxt.sh filename" >&2
    exit 1
fi

# One sed process runs the four steps in order on every line, which gives
# the same result as piping four seds together. "$1" is quoted so that
# filenames containing spaces work. Note that >> appends: running this twice
# on the same file doubles the contents of the .std file.
sed -e 's/^[0-9]*[[:punct:]]//' \
    -e 's/^[[:space:]]*[0-9]*[[:punct:]]//' \
    -e 's/[][}{)(_]//g' \
    -e '/^ *$/d' \
    "$1" >> "$1.std"
|
||||
|
Loading…
Reference in New Issue