#!/usr/bin/env python3
|
2019-12-05 01:14:10 +00:00
|
|
|
"""This module takes input and returns link_data, the data structure linkulator works from"""
|
2019-12-11 01:37:46 +00:00
|
|
|
from time import time
|
2019-12-05 01:14:10 +00:00
|
|
|
from pathlib import PurePath
|
|
|
|
from glob import glob
|
2019-12-05 02:20:23 +00:00
|
|
|
import re
|
2019-12-05 01:14:10 +00:00
|
|
|
|
2019-12-11 01:37:46 +00:00
|
|
|
# regex for removing escape characters from https://stackoverflow.com/a/14693789
ESCAPE_CHARS = re.compile(r"\x1B[@-_][0-?]*[ -/]*[@-~]")

# control whitespace (tab, CR, LF, FF, VT) stripped from records;
# "+" instead of "*" avoids a zero-length match at every position
BAD_CHARS = re.compile(r"[\t\r\n\f\v]+")
|
2019-12-05 02:20:23 +00:00
|
|
|
|
|
|
|
|
|
|
|
def is_well_formed_line(line: str) -> bool:
    """Return True when *line* is a valid linkulator record, False otherwise.

    A properly formatted line in linkulator.data has exactly four pipe
    delimiters (i.e. five fields).
    """
    expected_pipes = 4
    return line.count("|") == expected_pipes
|
|
|
|
|
|
|
|
|
2019-12-11 01:37:46 +00:00
|
|
|
def is_valid_time(timestamp: str) -> bool:
    """Return True when *timestamp* (a Unix-epoch string) is in the past.

    Future-dated timestamps are invalid.  A non-numeric *timestamp* raises
    ValueError from float(); callers handle ValueError.
    """
    now = time()
    return float(timestamp) < now
|
2019-12-05 01:14:10 +00:00
|
|
|
|
|
|
|
|
2019-12-11 01:37:46 +00:00
|
|
|
def wash_line(line: str) -> str:
    """Strip ANSI escape sequences and control whitespace from *line*."""
    for pattern in (ESCAPE_CHARS, BAD_CHARS):
        line = pattern.sub("", line)
    return line
|
2019-12-05 02:20:23 +00:00
|
|
|
|
|
|
|
|
|
|
|
def process(line: str, file_owner: str):
    """Validate *line* and split it on the pipe delimiter into a record list.

    The returned list has *file_owner* prepended as its first element.
    Raises ValueError when the line has the wrong number of fields or
    carries a future-dated timestamp.
    """
    if not is_well_formed_line(line):
        raise ValueError("Not a well formed record")

    fields = wash_line(line).split("|")

    # first field is the timestamp; an empty timestamp is tolerated
    timestamp = fields[0]
    if timestamp and not is_valid_time(timestamp):
        raise ValueError("Invalid date")

    return [file_owner] + fields
|
|
|
|
|
|
|
|
|
|
|
|
def get(config, ignore_names):
    """Read data files for non-ignored users and return valid linkulator data.

    Args:
        config: object exposing config.PATHS.all_homedir_pattern,
            config.PATHS.datadir and config.PATHS.datafile, combined into a
            glob pattern locating every user's data file.
        ignore_names: collection of usernames whose files are skipped.

    Returns:
        Tuple of (link_data, categories, category_counts) where each
        link_data row is [id, username, datestamp, parent-id, category,
        link-url, link-title], categories lists the category names seen,
        and category_counts maps category name -> record count.
    """
    link_data = []
    categories = []
    category_counts = {}
    # BUG FIX: the original re-bound ignore_names to [] here, silently
    # discarding the caller-supplied ignore list so nobody was ever ignored.

    ## WHENEVER THIS FUNCTION IS CALLED, THE DATA IS REFRESHED FROM FILES. SINCE
    ## DISK IO IS PROBABLY THE HEAVIEST PART OF THIS SCRIPT, DON'T DO THIS OFTEN.

    files_pattern = str(
        PurePath(config.PATHS.all_homedir_pattern).joinpath(
            config.PATHS.datadir, config.PATHS.datafile
        )
    )
    linkulator_files = glob(files_pattern)

    id_iterator = 1

    for filename in linkulator_files:
        # get file owner username from path (…/<user>/<datadir>/<datafile>)
        file_owner = PurePath(filename).parent.parent.name
        if file_owner in ignore_names:
            # skip users listed in the ignore file, without opening their data
            continue

        with open(filename) as cfile:
            for line in cfile:
                try:
                    split_line = process(line, file_owner)
                except ValueError:
                    # malformed or future-dated record: drop it
                    continue

                # assign parent items (links) an ID; replies get an empty slot
                if split_line[2] == "":
                    split_line.insert(0, id_iterator)
                    id_iterator += 1
                else:
                    split_line.insert(0, "")

                link_data.append(split_line)

    # sort links by creation date, newest first
    link_data.sort(key=lambda x: x[2], reverse=True)

    # generate categories list and category counts from sorted link data
    for record in link_data:
        cat = record[4]
        if cat not in categories and cat != "":
            categories.append(cat)
            category_counts[cat] = 1
        elif cat in categories:
            category_counts[cat] += 1

    return link_data, categories, category_counts
|