heebie jeebies

This commit is contained in:
terris Station 2019-11-14 23:50:15 -05:00
commit c83edd6d4b
9 changed files with 438 additions and 0 deletions

View File

@ -0,0 +1 @@
here will be the markdown chains generated by markchainer.py and used by mark8.py

3
corpus/prose/readme.md Normal file
View File

@ -0,0 +1,3 @@
# README
Here will be the source files already processed by stdtxt.sh, ready as filename.std for markchainer to generate models from.

160
mark8.py Normal file
View File

@ -0,0 +1,160 @@
import markovify
import nltk
import json
import re
import time
import os
import random
import language_check
import textwrap
# Modifying mark7.py to re-introduce language-tool to fix the sentences.
# it's not clear that it's actually doing anything! (╯°□°)╯︵ ┻━┻ ...¯\_(ツ)_/¯
# still to do - selectable corpuses
# expand corpus collection
# re-introduce cosmic.voyage corpus (after removing message headers and non alphanumerics?)
#
# notes:
# the time.sleep() lines below are there just to slow down the processing to keep
# the server burden below 100% on one or more CPUs. These were added, when the supernice
# alias just wasn't enough (in my opinion) to run on tilde.team without possibly affecting
# other users. The time.sleep() lines are not necessary, they are there for shared pubnix love.
#
# there are probably one or more import lines above that do not need to be there.
#
# the lines toward the end are messy and difficult to read. And they insert some
# indicators and numbers into the printed text that were added for debugging when
# language-check kept crashing if markovify would return a blank sentence and language-check
# doesn't like variables of type None. So the product of the markovify call was str()'d, and
# the call itself was placed in a while loop until it actually returned something that isn't 'None'
# an intermediate 'matches' variable was reintroduced.
# all that code is subject to change.
#
# --- module-level state shared by the generator functions below ---
tool = None  # language_check.LanguageTool instance; created after the models are loaded
text = None
matches = None
combined_model = None  # markovify model combined from every saved chain file on disk
model_json = None
couplets = random.randint(2,14) # use this var to generate a random number of couplets between X,Y
title = None # generate a title for the poem/song
wrapper = textwrap.TextWrapper(width=69, subsequent_indent=" ")  # hanging indent for wrapped verse lines
#def genHeaders():
# head1 = '''\
# [processing by miniVISR.autoComms @ relay station VHE-0j0-η ]
# [recv'd on 100Hz+-15Hz, 112.358Hz, 112.358KHz, 33.33MHz, 66.66MHz ]
# [radio 112.358MHz, 121.5MHz, 130.167MHz, 143.625MHz, 244.30MHz ]
# [freqs. 358.13MHz, 581.321MHz, 633.9MHz, 922.7MHz, 2113.853MHz ]
# [signals quality: poor. signals reconstruction confidence: 65% ]
# [message transcription relayed to Earthsys QEC for general delivery ]
# [begin transcript ]
# '''.format()
# head2 = '''\
#
# ...able, animal, and mineral,
# I know the skalds of old Solsys, and I quote the verse atypical
# Synechdoche and kenna til, disorder quite limerickal;
# I'm very well disposed as well, to matters a-esthetical,
# I sing love songs to moons both in circle and elliptical,
# About their craters and their phases with my lyrics I'm quite loose,
# Bragi's here in your system to sing, and to share whatever you choose.
# '''.format()
# return 0
#def genFooters():
# countid = str(125925)#{file open stuff - keep a counter in a file?}
#
# foot = '''\
# Bragi's here in your system to sing, and to share whatever you choose.
# Broadcasting on many frequencies every 86,400 seconds. Bragi-{ordinal}
# out.
#
# [end transcript ]
# END MESSAGE."
# '''.format(ordinal=countid)
# print()
# print(foot)
# return 0
def genTitle():
    """Build a one-to-three word title by sampling words from a freshly
    generated sentence, then trimming stray punctuation off the ends."""
    pool = str(combined_model.make_short_sentence(150, min_chars=80)).split()
    picked = [random.choice(pool) for _ in range(random.randint(1, 3))]
    return " ".join(picked).strip(' ,:"')
def genLongLine(minl, maxl):
    """
    give me a min length and a max length and i'll return a short
    sentence between those lengths, hopefully. and check it for grammar
    errors, and if some found, run it through language-check.correct()
    """
    candidate = None
    # keep asking the model until it produces something other than None
    while str(candidate) == 'None':
        candidate = str(combined_model.make_short_sentence(maxl, min_chars=minl))
        time.sleep(1)
    problems = tool.check(candidate)
    if problems:
        return str(" " + language_check.correct(candidate, problems))
    return str(" " + candidate)
def genShortLine(minl, maxl):
    """
    give me a min length and a max length and i'll return a short
    sentence between those lengths, hopefully. and check it for grammar
    errors, and if some found, run it through language-check.correct()

    This was a byte-for-byte duplicate of genLongLine(); it now simply
    delegates to it so the generation/grammar-check logic lives in one
    place. The name is kept because the caller uses the short/long pair
    to shape the couplets.
    """
    return genLongLine(minl, maxl)
# same basic structure as in markchainer.py, but this uses saved models
# (model generation procedure separate from verse generation procedure
# for shared pubnix love and) to be able to work with growing corpora
# this searches a fixed relative path for already-created models.
# reads the model(json) in, converts from json, combines the models
# together.
# Load every saved markov chain (*.mkdch JSON file) from the chains
# directory and fold them into one combined model.
for file in os.listdir("./corpus/prose/chains/"):
    if file.endswith(".mkdch"):
        with open("./corpus/prose/chains/" + file) as f:
            model = markovify.Text.from_json(json.load(f))
        time.sleep(5)  # throttle: shared-host courtesy, not required for correctness
        if combined_model:
            time.sleep(5)
            combined_model = markovify.combine(models=[combined_model, model])
        else:
            # first model loaded becomes the seed for combining
            combined_model = model
# Grammar checker used by genLongLine/genShortLine above.
tool = language_check.LanguageTool('en-US')
# disabling spellchecking, didn't like some of the 'fixes' from testing. using archaic language in testing, will use novel words in the future. to re-enable spellchecking, comment out the following line:
tool.disable_spellchecking()
print("~*~") # This start delimiter is here for testing each run
# genHeaders() #maybe do this in the future
print(" ")
#print("title: " + genTitle())
print(genTitle())
print(" ")
#print("couplets: " + str(couplets))
# Emit the poem body: each couplet is one long line plus one short line,
# word-wrapped to 69 columns by the module-level TextWrapper.
for x in range(couplets):
    print(wrapper.fill(genLongLine(30,80)))
    print(wrapper.fill(genShortLine(20,56)))
    # print(wrapper.fill(" l--" + genLongLine(30,80)))
    # print(wrapper.fill(" s--" + genShortLine(20,56)))
print(" ")
print("~!~") # this ending delimiter is here for testing each run
# genFooters() #maybe do this in the future

42
markchainer.py Normal file
View File

@ -0,0 +1,42 @@
import markovify
import nltk
import json
import re
import time
import os
import language_check
import string
class POSifiedText(markovify.Text):
    """markovify.Text subclass whose model state tags each word with its
    part of speech ("word::TAG"), so chains are built over (word, POS)
    pairs instead of bare words."""

    def word_split(self, sentence):
        tagged = nltk.pos_tag(re.split(self.word_split_pattern, sentence))
        return ["::".join(pair) for pair in tagged]

    def word_join(self, words):
        # Strip the POS tag back off each token when rebuilding a sentence.
        return " ".join(token.split("::")[0] for token in words)
# Paths: .std files (already cleaned by stdtxt.sh) go in, .mkdch JSON
# chain files come out.
corpus_path = "./corpus/prose/"
chains_path = "./corpus/prose/chains/"
tool = None
matches = None
combined_model = None
model_json = None
# Build one markov model per preprocessed corpus file and persist it as
# JSON so mark8.py can recombine them later without re-reading the corpus.
for file in os.listdir(corpus_path):
    if file.endswith(".std"):
        with open(corpus_path + file) as f:
            text = f.read()
        # Collapse all whitespace runs to single spaces and trim the ends.
        # Raw string r'\s+' avoids the invalid-escape-sequence warning that
        # the bare '\s+' literal triggers on modern Python.
        text = re.sub(r'\s+', ' ', text).strip()
        model = markovify.Text(text)  # this fails here with too many files in the corpus_path or long filenames.
        # model = markovify.Text(text, retain_original=False) # use this one for very large corpora
        model_json = model.to_json()
        chainfile = file + ".mkdch"
        with open(chains_path + chainfile, 'w') as outfile:
            json.dump(model_json, outfile)
        time.sleep(5)  # throttle: shared-host courtesy, not required for correctness

53
readme.md Normal file
View File

@ -0,0 +1,53 @@
```
* * * * * * * * * * * * * * * * * * * * * * * *
* ___ _ __ _____ ___ *
* / _ )_______ ____ _(_) / |/ / /__ ( _ ) *
* / _ / __/ _ `/ _ `/ / / /|_/ / '_// _ | *
* /____/_/ \_,_/\_, /_/ /_/ /_/_/\_(_)___/ *
* /___/ *
* * * * * * * * * * * * * * * * * * * * * * * *
```
This is a project (work in progress) to generate verses
that look like poetry (using markov chains) for the ship
stjörnuvagn Bragi on https://cosmic.voyage/
It is presented here without text-sources or markdown chains
because you can get the sources from project gutenberg like
I did. And besides, I don't want to distribute gutenberg
texts without the license verbiage (I had to remove it before
generating the models). Support Project Gutenberg! Great
old texts are not just for mining, they are also for reading.
https://www.gutenberg.org/
# Requirements
> I did all this in a virtualenv, and installed the following packages with pip3:
> * [markovify](https://github.com/jsvine/markovify)
> * nltk
> * [language-check](https://github.com/myint/language-check) - installs languagetool, which requires java
# Included
* mark8.py - the main generator proof of concept
* markchainer.py - generates models from text files already processed by:
* stdtxt.sh - sed pipeline to clean up the text (numbers, blank lines, underscores, brackets)
* samples/mark8test.txt - rough-looking samples produced by rough-looking code during debugging.
# Procedure
1. download some large textfiles from project gutenberg or from https://www.archive.org
1a. alternatively build your own large corpus through other means (web scraping, download corpora archives, etc.)
2. trim each text file as needed, so it contains the kinds of things you want to generate text from
3. use iconv or other means to make sure the texts are all of the same kind of encoding. (utf-8, ascii were tested)
4. use stdtxt.sh on the main input files. this should produce something like inputfile.txt.std
5. supernice python3 markchainer.py (this will look in './corpus/prose/' for *.std files, and generate a model for each; the output will be found in './corpus/prose/chains' called something like inputfile.txt.std.mkdch)
6. supernice python3 mark8.py >> output.txt
* supernice is just a bash alias:
```
alias supernice='nice -n 19 ionice -c 3'
```
...to help reduce load on the server from running this toy. the markovify package (esp. when using nltk stuff) can consume a lot of resources (especially when combined with language-check/LanguageTool!) so the python scripts were slowed down even more using time.sleep().

131
samples/mark8sample.txt Normal file
View File

@ -0,0 +1,131 @@
Linux 4.15.0-54-generic (tilde) 11/14/2019 _x86_64_ (6 CPU)
11:01:24 PM UID PID %usr %system %guest %wait %CPU CPU Command
11:01:26 PM xxxx 9743 11.5% 2.0% 0.0% 0.0% 13.5% 5 python3
11:01:28 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
11:01:30 PM xxxx 9743 16.5% 3.0% 0.0% 0.0% 19.5% 5 python3
11:01:32 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
11:01:34 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
11:01:36 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
11:01:38 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
11:01:40 PM xxxx 9743 10.0% 2.5% 0.0% 0.0% 12.5% 4 python3
11:01:42 PM xxxx 9743 45.0% 6.5% 0.0% 0.5% 51.5% 1 python3
11:01:44 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
11:01:46 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
11:01:48 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
11:01:50 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 1 python3
11:01:52 PM xxxx 9743 28.5% 2.0% 0.0% 0.5% 30.5% 3 python3
11:01:54 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 3 python3
11:01:56 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 3 python3
11:01:58 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 4 python3
11:02:00 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 4 python3
11:02:02 PM xxxx 9743 9.0% 0.0% 0.0% 4.0% 9.0% 0 python3
11:02:04 PM xxxx 9743 20.0% 0.5% 0.0% 2.5% 20.5% 2 python3
11:02:06 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 2 python3
11:02:08 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 2 python3
11:02:10 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 2 python3
11:02:12 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 2 python3
11:02:14 PM xxxx 9743 33.5% 1.5% 0.0% 0.5% 35.0% 4 python3
11:02:16 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 4 python3
11:02:18 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 4 python3
11:02:20 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
11:02:22 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
11:02:24 PM xxxx 9743 22.5% 1.5% 0.0% 0.0% 24.0% 5 python3
11:02:26 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
11:02:28 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 5 python3
11:02:30 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 0 python3
11:02:32 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 0 python3
11:02:34 PM xxxx 9743 9.0% 0.0% 0.0% 1.0% 9.0% 2 python3
11:02:36 PM xxxx 9743 16.0% 1.5% 0.0% 0.0% 17.5% 0 python3
11:02:38 PM xxxx 9743 0.0% 0.0% 0.0% 0.0% 0.0% 0 python3
11:02:40 PM xxxx 9743 1.5% 0.0% 0.0% 0.0% 1.5% 3 python3
11:02:42 PM xxxx 9743 6.0% 0.0% 0.0% 0.5% 6.0% 0 python3
11:02:44 PM xxxx 9743 2.0% 0.5% 0.0% 0.0% 2.5% 1 python3
11:02:46 PM xxxx 9743 16.5% 0.0% 0.0% 0.5% 16.5% 0 python3
11:02:48 PM xxxx 9743 0.5% 0.0% 0.0% 0.0% 0.5% 3 python3
11:02:50 PM xxxx 9743 5.0% 0.0% 0.0% 0.0% 5.0% 1 python3
11:02:52 PM xxxx 9743 4.5% 0.0% 0.0% 0.5% 4.5% 1 python3
~*~
title: Yes me
couplets: 7
l-- They came up to the problem, or he will advise us for all that
was Friday.
s-- We ourselves want to do with violence.
l-- I had taken her away from the carriage rattled away.
s-- The Most Happy must be simple, precise, terse.
l-- Within a radius of 100 meters unless they take a few hundred
yards.
s-- Of course, it is not yet scientifically formulated.
l-- He raised her veil as she could, for the abatement of rashes.
s-- The night was cold under my disguise.
l-- At the second letter of mine, and James acted as agent.
s-- It was not accustomed to succeed with them.
l-- We are in a word--and it shall be able to take me back on
Ganymede.
s-- We had no difficulty about this.
l-- It's not quite the same terrible winter.
s-- I fear that he had called.
~!~
Average: xxxx 9743 5.9% 0.5% 0.0% 0.2% 6.3% - python3
(.env) terris@tilde:~/genwrite$ pidstat --human 2 -e nice -n 19 ionice -c 3 python3 mark8.py >> samples/mark8sample.txt
(.env) terris@tilde:~/genwrite$ supernice python3 mark8.py >> samples/mark8sample.txt
~*~
gazing
As we stepped into the lake a bag and went downstairs.
The whole incident left a trace of her husband.
For example, I began by using his machinery in motion than that of
the darkness.
She tore from the crater, of the prehistoric people.
~!~
~*~
outside
Well I knew from his writing is a man in England are getting £100 a
year.
On the night at 16, Godolphin Street, Westminster.
He stole in and nowhere to hide.
A further knowledge of the art.
I then endeavored to convey the final discussion of his daily
journey.
He was a letter in a late visit to Birmingham.
~!~
~*~
your is
Well, Watson, you have one sorrow and untimely death.
What's the matter up.
Then on the phone Hook I'm back in a gold coronet.
Her age was not literally true.
At the far end, with a provision that a party Yes we're going for a
while.
D. It is I, McMurdo.
She then called for such combinations of events is pretty basic, and
guess what?
Next day I am right or I am a dangerous rival.
Since then I suddenly felt that he was a prank--upon me.
My wife came out he gave a shrill whine.
By means of a good situation for anyone.
So now, my dear Mr. Mac.
Anyway, if you look at it this evening.
And we may expect us early in the blackest shade.
We are glad to go out together and chuckled.
We had tried to force her confidence.
It has been drifting by about half.
There were several other letters.
~!~

21
sedtest.txt Normal file
View File

@ -0,0 +1,21 @@
234.
115:
Linux
3344:
Solaris
41. How I left
Ubuntu
(parenthetical)
[bracketed]
_underscored_
{braced}
55873
Fedora
159.
RedHat
Junk
1223,
Gunk
stones

14
sedtest.txt.std Normal file
View File

@ -0,0 +1,14 @@
Linux
Solaris
How I left
Ubuntu
parenthetical
bracketed
underscored
braced
55873
Fedora
RedHat
Junk
Gunk
stones

13
stdtxt.sh Executable file
View File

@ -0,0 +1,13 @@
#!/bin/bash
# Clean a plain-text corpus file so markchainer.py can build models from it:
#  - strip leading numbers followed by punctuation (with or without leading whitespace)
#  - strip brackets, braces, parentheses and underscores
#  - drop blank lines
# Output is appended to "<filename>.std".
# usage: 'stdtxt.sh filename'
# still need to figure out how to also do iconv (to us-ascii? to utf-8?)
# NOTE: "$1" is quoted so filenames containing spaces (or other shell
# metacharacters) are handled correctly.
sed 's/^[0-9]*[[:punct:]]//' "$1" \
  | sed 's/^[[:space:]]*[0-9]*[[:punct:]]//' \
  | sed 's/[][}{)(_]//g' \
  | sed '/^ *$/d' >> "$1.std"