#!/usr/bin/python import fileinput import json import time import calendar import re import shutil import argparse import logging, sys logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) parser = argparse.ArgumentParser(description='Generate word cloud data based off of irc chat logs') parser.add_argument('-logfile', help='irc log file to read from', default='/home/archangelic/irc/log') parser.add_argument('-outfile', help='output file to write to', default='') parser.add_argument('-timeend', type=int, help='end time of the word cloud (in epoch time)', default=calendar.timegm(time.gmtime())) parser.add_argument('-timestart', type=int, help='start time of the word cloud (in epoch time)', default=-1) parser.add_argument('-bannedUsersFile', help='file containing list of banned users', default='/home/krowbar/Code/python/bannedUsers') parser.add_argument('-bannedWordsFile', help='file containing list of banned words', default='/home/krowbar/Code/python/bannedWords') parser.add_argument('-minLength', type=int, help='minimum size of words to include in the cloud', default=3) parser.add_argument('-minOccurrence', type=int, help='the minimum occurence of a word to include it in the cloud', default=3) args = parser.parse_args() wordData = {} # keyed by "word" that contains a count #we only care about recent chats, let's say for the past sixteen hours args.timestart = args.timestart if args.timestart != -1 else args.timeend - (16 * 60 * 60) #timeCutoff = calendar.timegm(time.strptime("1 Oct 16", "%d %b %y")) logging.info("Generating word cloud based off words from " + str(args.timestart) + " to " + str(args.timeend)) bannedWords = open(args.bannedWordsFile).read().splitlines() bannedUsers = open(args.bannedUsersFile).read().splitlines() with open(args.logfile, "r") as log: for line in log: try: time, user, message = line.split("\t", 3) time = int(time) except ValueError: continue #There are some bad lines in the log file that we'll ignore if we can't parse if user in bannedUsers: continue #We don't care what they say if time >= args.timestart and time <= args.timeend: #print "Processing line from " + user + " at " + str(time) for word in re.sub('[\'\"\`\/\\;:,.?!*&^\-()<>\{\}|_\[\]0-9]', ' ', message).lower().split(): #changing symbols into spaces instead of stripping them avoids compounded words if len(word) < args.minLength or word in bannedWords: #print "Rejecting " + word continue #if the word already exists in the list if word in wordData: wordData[word] += 1 else: #if they are new wordData[word] = 1 #print "Added word: " + word wordData = {i:wordData[i] for i in wordData if wordData[i] >= args.minOccurrence } if len(wordData) == 0: wordData = {"NOTHING": 1, "INTERESTING": 1, "TODAY": 1} if args.outfile == '': print json.dumps(wordData) else: with open(args.outfile + ".tmp", "w") as tmpFile: tmpFile.write(json.dumps(wordData)) shutil.move(args.outfile + ".tmp", args.outfile)