#!/usr/bin/python
"""Generate word cloud data (a JSON object of word -> count) from an IRC log.

The log is expected to hold one message per line in the form
``<epoch seconds> TAB <user> TAB <message>``. Lines that cannot be parsed are
skipped. Only messages inside the requested time window, written by users who
are not banned, contribute words to the cloud.

The JSON is printed to stdout, or written atomically to ``-outfile`` when one
is given. All diagnostics go to stderr so stdout stays valid JSON.
"""
import fileinput
import json
import time
import calendar
import re
import shutil
import argparse
import logging
import sys
import math
from collections import Counter

# Default look-back window when -timestart is not supplied: sixteen hours.
DEFAULT_WINDOW_SECONDS = 16 * 60 * 60

# Symbols and digits that are turned into spaces before splitting a message.
# Replacing (rather than deleting) them avoids gluing neighbouring words
# together.  Raw string so every backslash reaches the regex engine as written;
# ``\\`` puts the backslash character itself into the class.
SYMBOLS = re.compile(r"""['"`/\\;:,.?!*&^\-()<>{}|_\[\]0-9]""")

# Cloud emitted when no word reaches the minimum occurrence.
EMPTY_CLOUD = {"NOTHING": 1, "INTERESTING": 1, "TODAY": 1}


def build_parser():
    """Return the argument parser describing the command-line interface."""
    parser = argparse.ArgumentParser(
        description="Generate word cloud data based off of irc chat logs"
    )
    parser.add_argument(
        "-logfile",
        help="irc log file to read from",
        default="/home/archangelic/irc/log",
    )
    parser.add_argument("-outfile", help="output file to write to", default="")
    parser.add_argument(
        "-timeend",
        type=int,
        help="end time of the word cloud (in epoch time)",
        default=calendar.timegm(time.gmtime()),
    )
    parser.add_argument(
        "-timestart",
        type=int,
        help="start time of the word cloud (in epoch time)",
        default=-1,
    )
    parser.add_argument(
        "-bannedUsersFile",
        help="file containing list of banned users",
        default="/home/krowbar/Code/python/bannedUsers",
    )
    parser.add_argument(
        "-bannedWordsFile",
        help="file containing list of banned words",
        default="/home/krowbar/Code/python/bannedWords",
    )
    parser.add_argument(
        "-minLength",
        type=int,
        help="minimum size of words to include in the cloud",
        default=3,
    )
    parser.add_argument(
        "-minOccurrence",
        type=int,
        help="the minimum occurence of a word to include it in the cloud",
        default=3,
    )
    parser.add_argument(
        "-timestamp",
        help="what kind of time stamp should be inserted into the chat cloud. "
        "valid values are none, start, end, month, and full",
        default="none",
    )
    return parser


def read_list(path):
    """Return the lines of the file at *path* (without line endings)."""
    with open(path) as handle:
        return handle.read().splitlines()


def count_words(
    lines, timestart, timeend, banned_users=(), banned_words=(), min_length=3
):
    """Count the words said in *lines* between *timestart* and *timeend*.

    lines        -- iterable of raw log lines (time, user, message separated
                    by tabs)
    timestart    -- inclusive lower bound of the window, in epoch seconds
    timeend      -- inclusive upper bound of the window, in epoch seconds
    banned_users -- users whose lines are ignored entirely
    banned_words -- (lower-case) words that are never counted
    min_length   -- words shorter than this are never counted

    Returns ``(counts, first_time, last_time)`` where *counts* maps each
    lower-cased word to its number of occurrences and the two times are the
    timestamps of the first and last accepted lines (``None`` if there were
    none).
    """
    # Sets give O(1) membership tests inside the per-word loop.
    banned_users = frozenset(banned_users)
    banned_words = frozenset(banned_words)
    counts = Counter()
    first_time = None
    last_time = None
    for line in lines:
        try:
            # maxsplit=2 keeps tabs inside the message from producing a
            # fourth field, which would make the unpacking fail and drop an
            # otherwise valid line.
            mtime, user, message = line.split("\t", 2)
            mtime = int(mtime)
        except ValueError:
            # There are some bad lines in the log file; ignore what we
            # cannot parse.
            continue
        if user in banned_users:
            continue
        if not timestart <= mtime <= timeend:
            continue
        if first_time is None:
            first_time = mtime
        last_time = mtime
        for word in SYMBOLS.sub(" ", message).lower().split():
            if len(word) < min_length or word in banned_words:
                continue
            counts[word] += 1
    return dict(counts), first_time, last_time


def make_stamp(kind, first_time, last_time):
    """Return the timestamp label for *kind* (start, end, full or month).

    Times are epoch seconds and are rendered in UTC. Any other *kind* yields
    the placeholder ``"DATE"``.
    """
    if kind == "start":
        return time.strftime("%B %d, %Y", time.gmtime(first_time))
    if kind == "end":
        return time.strftime("%B %d, %Y", time.gmtime(last_time))
    if kind == "full":
        return "{} to {}".format(
            time.strftime("%b %d, %Y", time.gmtime(first_time)),
            time.strftime("%b %d, %Y", time.gmtime(last_time)),
        )
    if kind == "month":
        # Use the month of one day before the last day.
        return time.strftime("%B %Y", time.gmtime(last_time - 86400))
    return "DATE"


def build_cloud(
    counts, min_occurrence, timestamp="none", first_time=None, last_time=None
):
    """Turn raw word *counts* into the final cloud dictionary.

    Words seen fewer than *min_occurrence* times are dropped. If nothing is
    left a fixed placeholder cloud is returned. Otherwise, unless *timestamp*
    equals ``"none"``, a timestamp label is added with a size a bit larger
    than the most frequent word.
    """
    cloud = {word: n for word, n in counts.items() if n >= min_occurrence}
    if not cloud:
        return dict(EMPTY_CLOUD)
    # Compare by value: a "none" that arrives from the command line is a
    # different object from the literal, so an identity test would be wrong.
    if timestamp != "none":
        stamp = make_stamp(timestamp, first_time, last_time)
        # Make the timestamp a bit bigger than everything else.
        size = int(math.ceil(max(cloud.values()) * 1.3)) + 1
        cloud[stamp] = size
        # Logged (stderr) rather than printed so stdout remains pure JSON.
        logging.info('Added timestamp "%s" of size %s', stamp, size)
    return cloud


def main(argv=None):
    """Run the generator; *argv* defaults to the process arguments."""
    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
    args = build_parser().parse_args(argv)

    # We only care about recent chats unless an explicit start is given.
    if args.timestart == -1:
        args.timestart = args.timeend - DEFAULT_WINDOW_SECONDS
    logging.info(
        "Generating word cloud based off words from %s to %s",
        args.timestart,
        args.timeend,
    )

    banned_words = read_list(args.bannedWordsFile)
    banned_users = read_list(args.bannedUsersFile)

    with open(args.logfile, "r") as log:
        counts, first_time, last_time = count_words(
            log,
            args.timestart,
            args.timeend,
            banned_users=banned_users,
            banned_words=banned_words,
            min_length=args.minLength,
        )

    cloud = build_cloud(
        counts, args.minOccurrence, args.timestamp, first_time, last_time
    )

    if args.outfile == "":
        print(json.dumps(cloud))
    else:
        # Write to a temporary file first and move it into place so readers
        # never observe a half-written output file.
        tmp_path = args.outfile + ".tmp"
        with open(tmp_path, "w") as tmp_file:
            tmp_file.write(json.dumps(cloud))
        shutil.move(tmp_path, args.outfile)
        print("Dumped {} words to {}".format(len(cloud), args.outfile))


if __name__ == "__main__":
    main()