tilde-projects/Code/python/chatcloud2.py

142 lines
4.7 KiB
Python
Executable File

#!/usr/bin/python
import fileinput
import json
import time
import calendar
import re
import shutil
import argparse
import logging, sys
import math
# Diagnostic logging goes to stderr, with everything from DEBUG upwards shown.
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

# Command-line interface. Every option is optional and uses a single leading
# dash; the parsed values are read as attributes named after the option
# (for example -minLength becomes minLength).
parser = argparse.ArgumentParser(
    description="Generate word cloud data based off of irc chat logs"
)
parser.add_argument(
    "-logfile", help="irc log file to read from", default="/home/archangelic/irc/log"
)
# The default output file is the empty string.
parser.add_argument("-outfile", help="output file to write to", default="")
parser.add_argument(
    "-timeend",
    type=int,
    help="end time of the word cloud (in epoch time)",
    # Evaluated once, when the parser is built: the current UTC epoch second.
    default=calendar.timegm(time.gmtime()),
)
parser.add_argument(
    "-timestart",
    type=int,
    help="start time of the word cloud (in epoch time)",
    # -1 is a sentinel meaning "no explicit start time was given".
    default=-1,
)
parser.add_argument(
    "-bannedUsersFile",
    help="file containing list of banned users",
    default="/home/krowbar/Code/python/bannedUsers",
)
parser.add_argument(
    "-bannedWordsFile",
    help="file containing list of banned words",
    default="/home/krowbar/Code/python/bannedWords",
)
parser.add_argument(
    "-minLength",
    type=int,
    help="minimum size of words to include in the cloud",
    default=3,
)
parser.add_argument(
    "-minOccurrence",
    type=int,
    # Help text spelling corrected ("occurence" -> "occurrence").
    help="the minimum occurrence of a word to include it in the cloud",
    default=3,
)
parser.add_argument(
    "-timestamp",
    help="what kind of time stamp should be inserted into the chat cloud. valid values are none, start, end, month, and full",
    default="none",
)
args = parser.parse_args()

wordData = {}  # maps each word to the number of times it has been counted

# When no explicit start time was given (the -1 sentinel), only look at the
# sixteen hours leading up to the end time.
if args.timestart == -1:
    args.timestart = args.timeend - (16 * 60 * 60)

logging.info(
    "Generating word cloud based off words from %s to %s",
    args.timestart,
    args.timeend,
)

# Read both ban lists (one entry per line) inside context managers so the
# file handles are closed as soon as the contents are loaded. The entries are
# kept as sets because the script only ever tests membership against them.
with open(args.bannedWordsFile) as bannedWordsHandle:
    bannedWords = set(bannedWordsHandle.read().splitlines())
with open(args.bannedUsersFile) as bannedUsersHandle:
    bannedUsers = set(bannedUsersHandle.read().splitlines())
# Every character matched here is turned into a space before the message is
# split, so punctuation and digits separate words instead of compounding them.
# The pattern is a raw string: in the earlier non-raw literal the sequence
# "\\;" collapsed into the regex escape "\;", which meant a literal backslash
# was never replaced, and the other escapes were invalid string escapes.
wordSeparators = re.compile(r"""['"`/\\;:,.?!*&^\-()<>{}|_\[\]0-9]""")

# Timestamps of the first and last accepted log lines; tracked for the
# optional timestamp entry that may be added to the cloud afterwards.
firstTime = None
lastTime = None

with open(args.logfile, "r") as log:
    for line in log:
        try:
            # Expected layout: <epoch time> TAB <user> TAB <message>.
            # Split on the first two tabs only, so a message that itself
            # contains tabs stays whole instead of failing to unpack and
            # being thrown away.
            mtime, user, message = line.split("\t", 2)
            mtime = int(mtime)
        except ValueError:
            continue  # lines that cannot be parsed are ignored
        if user in bannedUsers:
            continue  # banned users contribute nothing to the cloud
        if not (args.timestart <= mtime <= args.timeend):
            continue  # outside the requested time window
        if firstTime is None:
            firstTime = mtime
        lastTime = mtime
        for word in wordSeparators.sub(" ", message).lower().split():
            if len(word) < args.minLength or word in bannedWords:
                continue  # too short, or explicitly banned
            wordData[word] = wordData.get(word, 0) + 1
# Keep only the words that were seen at least -minOccurrence times.
wordData = {
    word: count for word, count in wordData.items() if count >= args.minOccurrence
}
if not wordData:
    # Nothing survived the filter: emit a fixed placeholder cloud instead.
    wordData = {"NOTHING": 1, "INTERESTING": 1, "TODAY": 1}
elif args.timestamp != "none":
    # Compare by value, not identity: a "none" typed on the command line is
    # not guaranteed to be the same object as this literal.
    stamp = "DATE"  # fallback label for an unrecognised -timestamp value
    if args.timestamp == "start":
        stamp = time.strftime("%B %d, %Y", time.gmtime(firstTime))
    elif args.timestamp == "end":
        stamp = time.strftime("%B %d, %Y", time.gmtime(lastTime))
    elif args.timestamp == "full":
        stamp = "{} to {}".format(
            time.strftime("%b %d, %Y", time.gmtime(firstTime)),
            time.strftime("%b %d, %Y", time.gmtime(lastTime)),
        )
    elif args.timestamp == "month":
        # use the month of one day before the last day
        stamp = time.strftime("%B %Y", time.gmtime(lastTime - 86400))
    # Make the timestamp a bit bigger than everything else: 1.3 times the
    # highest word count, rounded up, plus one.
    size = int(math.ceil(max(wordData.values()) * 1.3)) + 1
    wordData[stamp] = size
    print("Added timestamp \"{}\" of size {}".format(stamp, size))
# Serialise once; the same JSON text is used for either destination.
cloudJson = json.dumps(wordData)
if not args.outfile:
    # No output file requested: print the JSON on stdout.
    print(cloudJson)
else:
    # Write to a sibling ".tmp" file first, then move it over the target path.
    tmpPath = args.outfile + ".tmp"
    with open(tmpPath, "w") as tmpFile:
        tmpFile.write(cloudJson)
    shutil.move(tmpPath, args.outfile)
    print("Dumped {} words to {}".format(len(wordData), args.outfile))