142 lines
4.7 KiB
Python
Executable File
142 lines
4.7 KiB
Python
Executable File
#!/usr/bin/python
|
|
import argparse
import calendar
import collections
import fileinput
import json
import logging
import math
import re
import shutil
import sys
import time
|
|
|
|
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

# When -timestart is not given, look back this many seconds from -timeend
# (sixteen hours of recent chat).
DEFAULT_WINDOW_SECONDS = 16 * 60 * 60

# Punctuation and digits that are turned into spaces before a message is
# split into words. Replacing them with spaces (instead of deleting them)
# avoids gluing neighbouring words together. The literal backslash is part
# of the set: the previous non-raw pattern tried to list it but the escape
# collapsed into an escaped semicolon.
SEPARATOR_PATTERN = re.compile(r"""['"`/\\;:,.?!*&^\-()<>{}|_\[\]0-9]""")


def build_parser():
    """Return the argument parser describing the script's command line."""
    parser = argparse.ArgumentParser(
        description="Generate word cloud data based off of irc chat logs"
    )
    parser.add_argument(
        "-logfile",
        help="irc log file to read from",
        default="/home/archangelic/irc/log",
    )
    parser.add_argument("-outfile", help="output file to write to", default="")
    parser.add_argument(
        "-timeend",
        type=int,
        help="end time of the word cloud (in epoch time)",
        default=calendar.timegm(time.gmtime()),
    )
    parser.add_argument(
        "-timestart",
        type=int,
        help="start time of the word cloud (in epoch time)",
        default=-1,
    )
    parser.add_argument(
        "-bannedUsersFile",
        help="file containing list of banned users",
        default="/home/krowbar/Code/python/bannedUsers",
    )
    parser.add_argument(
        "-bannedWordsFile",
        help="file containing list of banned words",
        default="/home/krowbar/Code/python/bannedWords",
    )
    parser.add_argument(
        "-minLength",
        type=int,
        help="minimum size of words to include in the cloud",
        default=3,
    )
    parser.add_argument(
        "-minOccurrence",
        type=int,
        help="the minimum occurence of a word to include it in the cloud",
        default=3,
    )
    parser.add_argument(
        "-timestamp",
        help="what kind of time stamp should be inserted into the chat cloud. valid values are none, start, end, month, and full",
        default="none",
    )
    return parser


def load_banned(path):
    """Return the lines of the file at *path* as a set.

    The file is closed before returning; a set keeps the per-word and
    per-user membership tests constant time.
    """
    with open(path) as handle:
        return set(handle.read().splitlines())


def split_words(message):
    """Return the lower-cased words of *message* with separators removed."""
    return SEPARATOR_PATTERN.sub(" ", message).lower().split()


def count_words(
    lines, time_start, time_end, banned_users=(), banned_words=(), min_length=3
):
    """Count the words spoken in *lines* within a time window.

    Each line is expected to look like ``<epoch>\\t<user>\\t<message>``.
    Lines that cannot be parsed that way are skipped. Only the first two
    tabs are treated as field separators, so a message that itself
    contains tabs is still counted.

    Args:
        lines: iterable of raw log lines.
        time_start: first epoch second to include (inclusive).
        time_end: last epoch second to include (inclusive).
        banned_users: container of user names whose lines are ignored.
        banned_words: container of words that are never counted.
        min_length: words shorter than this are never counted.

    Returns:
        A ``(counts, first_time, last_time)`` tuple. ``counts`` is a
        ``collections.Counter`` keyed by word, in first-seen order. The two
        times are the epoch stamps of the first and last accepted line, or
        ``None`` when no line fell inside the window.
    """
    counts = collections.Counter()
    first_time = None
    last_time = None
    for line in lines:
        try:
            stamp, user, message = line.split("\t", 2)
            stamp = int(stamp)
        except ValueError:
            # There are some bad lines in the log file; ignore what we
            # cannot parse.
            continue
        if user in banned_users:
            continue  # We don't care what they say
        if not time_start <= stamp <= time_end:
            continue
        if first_time is None:
            first_time = stamp
        last_time = stamp
        counts.update(
            word
            for word in split_words(message)
            if len(word) >= min_length and word not in banned_words
        )
    return counts, first_time, last_time


def filter_rare(counts, min_occurrence):
    """Return a plain dict of the words seen at least *min_occurrence* times."""
    return {word: seen for word, seen in counts.items() if seen >= min_occurrence}


def format_stamp(kind, first_time, last_time):
    """Return the timestamp label for the cloud.

    *kind* is one of ``start``, ``end``, ``full`` or ``month``; any other
    value yields the literal label ``"DATE"``. Times are epoch seconds and
    are rendered in UTC.
    """
    if kind == "start":
        return time.strftime("%B %d, %Y", time.gmtime(first_time))
    if kind == "end":
        return time.strftime("%B %d, %Y", time.gmtime(last_time))
    if kind == "full":
        return "{} to {}".format(
            time.strftime("%b %d, %Y", time.gmtime(first_time)),
            time.strftime("%b %d, %Y", time.gmtime(last_time)),
        )
    if kind == "month":
        # use the month of one day before the last day
        return time.strftime("%B %Y", time.gmtime(last_time - 86400))
    return "DATE"


def add_timestamp(word_data, kind, first_time, last_time):
    """Insert a timestamp entry into *word_data* and return ``(stamp, size)``.

    The entry is sized a bit bigger than the most frequent word so it
    stands out in the rendered cloud. *word_data* must not be empty.
    """
    stamp = format_stamp(kind, first_time, last_time)
    size = int(math.ceil(max(word_data.values()) * 1.3)) + 1
    word_data[stamp] = size
    # Logged (stderr) rather than printed so that the JSON written to
    # stdout when no -outfile is given stays parseable.
    logging.info('Added timestamp "%s" of size %s', stamp, size)
    return stamp, size


def write_output(word_data, outfile):
    """Emit *word_data* as JSON to stdout, or to *outfile* when it is set.

    When writing to a file the data first goes to ``<outfile>.tmp`` and is
    then moved into place, so readers never see a half-written file.
    """
    payload = json.dumps(word_data)
    if outfile == "":
        print(payload)
        return
    tmp_path = outfile + ".tmp"
    with open(tmp_path, "w") as tmp_file:
        tmp_file.write(payload)
    shutil.move(tmp_path, outfile)
    print("Dumped {} words to {}".format(len(word_data), outfile))


def main(argv=None):
    """Run the generator; *argv* defaults to the process command line."""
    args = build_parser().parse_args(argv)

    # we only care about recent chats unless an explicit start was given
    if args.timestart == -1:
        args.timestart = args.timeend - DEFAULT_WINDOW_SECONDS
    logging.info(
        "Generating word cloud based off words from %s to %s",
        args.timestart,
        args.timeend,
    )

    banned_words = load_banned(args.bannedWordsFile)
    banned_users = load_banned(args.bannedUsersFile)

    with open(args.logfile, "r") as log:
        counts, first_time, last_time = count_words(
            log,
            args.timestart,
            args.timeend,
            banned_users,
            banned_words,
            args.minLength,
        )

    word_data = filter_rare(counts, args.minOccurrence)
    if not word_data:
        word_data = {"NOTHING": 1, "INTERESTING": 1, "TODAY": 1}
    elif args.timestamp != "none":
        add_timestamp(word_data, args.timestamp, first_time, last_time)

    write_output(word_data, args.outfile)


if __name__ == "__main__":
    main()
|