linkulator2/data.py

96 lines
3.2 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""This module takes input and returns link_data, the data structure linkulator works from"""
import time
from pathlib import PurePath
from glob import glob
import re
# NOTE: despite its name, this pattern matches a string consisting solely of
# one or more characters in the range space through tilde (printable ASCII).
# In the code visible here its only reference is a commented-out line in
# wash_line, so it is currently unused.
BADCHARS = re.compile(r"^[ -~]+$")
def is_well_formed_line(line: str) -> bool:
    """Return True if *line* has the shape of a linkulator.data record.

    A properly formatted record contains exactly four pipe characters,
    i.e. splitting it on "|" yields exactly five fields.
    """
    expected_fields = 5  # four pipe delimiters separate five fields
    return len(line.split("|")) == expected_fields
def is_valid_time(split_line) -> bool:
    """Return True if the record's timestamp is present and not in the future.

    Args:
        split_line: the fields of a record; the first element is the
            timestamp, given as text parseable by ``float``.

    Returns:
        False when the timestamp field is empty or is not earlier than the
        current time, True otherwise.

    Raises:
        ValueError: if the timestamp field is non-empty but not numeric.
    """
    timestamp = split_line[0]
    if not timestamp:
        # A blank timestamp is invalid; return a real bool rather than the
        # empty string itself.
        return False
    return float(timestamp) < time.time()
def wash_line(line):
    """Return *line* with any trailing carriage returns and line feeds removed.

    Only the line ending is stripped; all other characters are kept as they
    are. (Filtering through BADCHARS is currently disabled.)
    """
    line_endings = "\r\n"
    washed = line.rstrip(line_endings)
    return washed
def process(line: str, file_owner: str):
    """Turn one raw data-file line into a record list.

    The line is checked for the expected number of pipe delimiters, stripped
    of its line ending and split on "|". The returned list is *file_owner*
    followed by the fields of the line.

    Raises:
        ValueError: if the line is not well formed, or if its timestamp
            (the first field) is rejected by ``is_valid_time``.
    """
    if not is_well_formed_line(line):
        raise ValueError("Not a well formed record")

    fields = wash_line(line).split("|")

    if not is_valid_time(fields):
        raise ValueError("Invalid date")

    # Owner goes first, ahead of the fields read from the file.
    return [file_owner, *fields]
def get(config, ignore_names):
    """Read the data files of all non-ignored users and return the valid records.

    Every call re-reads the files from disk. Disk I/O is probably the heaviest
    part of the program, so avoid calling this more often than necessary.

    Each returned record is a list laid out as
    ``[post_id, username, timestamp, parent_id, category, link_url, link_title]``.
    ``post_id`` is a sequential integer starting at 1 for records whose
    parent-id field is empty, and ``""`` for all other records (replies).
    The remaining fields are strings.

    Args:
        config: object exposing ``PATHS.all_homedir_pattern``,
            ``PATHS.datadir`` and ``PATHS.datafile``; joined together they
            form the glob pattern used to locate the data files.
        ignore_names: usernames whose data files are skipped. ``None`` or an
            empty collection ignores nobody.

    Returns:
        A tuple ``(link_data, categories, category_counts)`` where
        ``link_data`` is the list of records sorted newest first,
        ``categories`` lists each non-empty category in order of first
        appearance in ``link_data``, and ``category_counts`` maps each of
        those categories to the number of records carrying it.
    """
    # Honour the caller's ignore list (it must not be overwritten here); a set
    # gives constant-time membership tests.
    ignored = set(ignore_names or ())

    files_pattern = str(
        PurePath(config.PATHS.all_homedir_pattern).joinpath(
            config.PATHS.datadir, config.PATHS.datafile
        )
    )

    link_data = []
    for filename in glob(files_pattern):
        # The owner's username is taken from the directory two levels above
        # the data file.
        file_owner = PurePath(filename).parent.parent.name
        if file_owner in ignored:
            # Skip ignored users before touching their file at all.
            continue
        with open(filename) as data_file:
            for line in data_file:
                try:
                    record = process(line, file_owner)
                except ValueError:
                    # Malformed or future-dated record: drop it.
                    continue
                link_data.append(record)

    # Prepend the post id. At this point index 2 is still the parent-id field;
    # an empty parent id marks a top-level post, which gets the next number.
    next_post_id = 1
    for record in link_data:
        if record[2] == "":
            record.insert(0, next_post_id)
            next_post_id += 1
        else:
            record.insert(0, "")

    # Newest first. After the insert above the timestamp sits at index 2.
    # Compare numerically: comparing the raw strings misorders timestamps
    # of different lengths or notations. Every timestamp has already been
    # parsed by float() during validation, so this cannot raise.
    link_data.sort(key=lambda record: float(record[2]), reverse=True)

    # Tally non-empty categories (index 4). A dict preserves insertion order,
    # so its keys double as the list of categories in first-seen order.
    category_counts = {}
    for record in link_data:
        category = record[4]
        if category:
            category_counts[category] = category_counts.get(category, 0) + 1
    categories = list(category_counts)

    return link_data, categories, category_counts