From 0e8d5a4379e4f39baab0be54a8b66cd2c5038fc6 Mon Sep 17 00:00:00 2001
From: Lucidiot
Date: Sat, 20 Mar 2021 22:37:39 +0000
Subject: [PATCH] Rewrite fprss.py

- Only adds .jpg for Imgur links, closes #2
- Performs HTTP requests to retrieve the MIME type and size of all images,
  falling back to a set of known MIME types and a zero length for failing
  requests, and adds caching to avoid flooding, closes #3
- Drops the use of `mensch` and instead reads straight from the IRC log
  without a week filter, closes #4
- Adds mod_syndication and skipDays, closes #5
---
 .gitignore |   1 +
 README.md  |   9 ++-
 fprss.py   | 192 +++++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 151 insertions(+), 51 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6e4266f..ac61029 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 __pycache__/
 *.py[cod]
 *$py.class
+cache.json
diff --git a/README.md b/README.md
index fa9676d..cc9d340 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,14 @@ A simple script to make a `#fridaypostcard` RSS feed.
 
 * Python 3.6+
 * [xmltodict](https://github.com/martinblech/xmltodict)
+* [requests](https://requests.readthedocs.io/en/master/)
 
 ## Usage
 
-Run [`fprss.py`](fprss.py) to build the #fridaypostcard RSS feed. You can add this script to CRON to build the feeds regularly.
+Run [`fprss.py`](fprss.py) to build the #fridaypostcard RSS feed.
+
+You can add this script to cron to rebuild the feed regularly; here is the crontab entry I use to regenerate it every hour on Friday:
+
+```
+0 * * * 5 nice -n 19 /home/lucidiot/dev/fprss/fprss.py
+```
diff --git a/fprss.py b/fprss.py
index ccbc66b..953b8f1 100755
--- a/fprss.py
+++ b/fprss.py
@@ -1,102 +1,194 @@
 #!/usr/bin/env python3
 from collections import namedtuple
 from datetime import datetime, timezone
-import os.path
+from pathlib import Path
+from requests.exceptions import HTTPError
+from typing import Iterator, Optional
+from urllib.parse import urlparse
+import json
 import re
-import subprocess
-import sys
+import requests
 import xmltodict
 
-URL_REGEX = re.compile(r'(?P<url>https?://[^\s]+)')
-Postcard = namedtuple('Postcard', ['timestamp', 'username', 'url', 'message'])
+RSS_DATE_FORMAT = '%a, %d %b %Y %H:%M:%S %z'
+SANITIZE_REGEX = re.compile(r'(?:[\002\017\021\026\035\036\037]|\003(?:[0-9]{1,2}(?:,[0-9]{1,2})?)?|\004(?:[0-9A-F]{,6})?)', re.IGNORECASE)
+URL_REGEX = re.compile(r'https?://[A-Za-z0-9-._~:/?#[\]%@!$&\'()*+,;=]+', re.IGNORECASE)
+LOG_FILE = Path('~archangelic/irc/log').expanduser()
+CACHE_FILE = Path(__file__).absolute().parent / 'cache.json'
+OUTPUT_PATH = Path('~/public_html/fridaypostcard.xml').expanduser()
+
+IGNORE_USERNAMES = {'quote_bot'}
+# We cannot safely assume we will know all image extensions, but there are some obvious and common extensions that we can ignore.
+IGNORE_EXTENSIONS = {'.html', '.htm', '.xml', '.json'}
+KNOWN_MIME_TYPES = {
+    '.jpg': 'image/jpeg',
+    '.jpeg': 'image/jpeg',
+    '.jp2': 'image/jp2',
+    '.bmp': 'image/bmp',
+    '.png': 'image/png',
+    '.gif': 'image/gif',
+    '.svg': 'image/svg+xml',
+    '.webp': 'image/webp',
+}
+
+Postcard = namedtuple('Postcard', ['timestamp', 'username', 'url', 'message', 'mime_type', 'length'])
+
+# MIME type and length cache to avoid making hundreds of requests each time
+cache = {}
 
 
-def parse_log(log: str) -> Postcard:
+def get_logs() -> Iterator[str]:
+    with LOG_FILE.open('r') as f:
+        for line in f:
+            if '#fridaypostcard' in line:
+                yield line
+
+
+def sanitize_message(message: str) -> str:
+    return SANITIZE_REGEX.sub('', message)
+
+
+def parse_log(log: str) -> Optional[Postcard]:
     timestamp, username, message = log.split("\t", 3)
-    url = URL_REGEX.search(message).group("url")
-    message = message.replace(url, '').replace('#fridaypostcard', '').strip()
+
+    if username in IGNORE_USERNAMES:
+        return
+
+    message = sanitize_message(message)
+    match = URL_REGEX.search(message)
+    # Ignore messages with invalid URLs
+    if not match:
+        return
+    url_str = match.group()
+
+    message = message.replace(url_str, '').replace('#fridaypostcard', '').strip()
+
+    try:
+        url = urlparse(url_str)
+    except ValueError:
+        return
+
+    extension = Path(url.path).suffix.lower()
+    if extension in IGNORE_EXTENSIONS:
+        return
+
     # Force-replace https with http to ensure PSP compatibility
-    url = url.replace('https', 'http') \
-        .replace('http://imgur', 'http://i.imgur')
-    if not any(map(url.lower().endswith, (
-        '.jpg',
-        '.gif',
-        '.png',
-        '.svg',
-        '.webp',
-        '.bmp',
-        '.tif'
-    ))):
-        url = url + '.jpg'
+    url_str = url_str.replace('https://', 'http://', 1)
+
+    # Turn Imgur links into direct links
+    if url.netloc == 'imgur.com':
+        url_str = url_str.replace('http://imgur', 'http://i.imgur')
+        if extension not in KNOWN_MIME_TYPES:
+            url_str += '.jpg'
+
+    mime_type, length = cache.get(url_str, ['', '0'])
+
+    if not mime_type:
+        try:
+            with requests.get(url_str, allow_redirects=True, stream=True, timeout=5) as resp:
+                resp.raise_for_status()
+                length = resp.headers.get('Content-Length', '0')
+                mime_type = resp.headers.get('Content-Type', KNOWN_MIME_TYPES.get(extension, ''))
+        except HTTPError as e:
+            # Dirty hack to avoid repeating lots of requests for images that are now broken.
+            if 400 <= e.response.status_code < 500:
+                mime_type = KNOWN_MIME_TYPES.get(extension, 'image/x-error')
+                length = '0'
+                cache[url_str] = [mime_type, length]
+            return
+        except Exception:
+            return
+        cache[url_str] = [mime_type, length]
+
     return Postcard(
         timestamp=int(timestamp),
         username=username,
-        url=url,
+        url=url_str,
         message=message,
+        mime_type=mime_type,
+        length=length,
     )
 
 
-def is_current_week(postcard):
-    return (
-        datetime.utcnow() - datetime.fromtimestamp(postcard.timestamp)
-    ).days < 7
-
-
 def to_item(postcard):
     if postcard.message:
         title = postcard.message
-        description = postcard.message + '\n~' + postcard.username
+        description = f'{postcard.message}\n~{postcard.username}'
     else:
-        title = 'Postcard from ~' + postcard.username
-        description = '~' + postcard.username
+        title = f'Postcard from ~{postcard.username}'
+        # An empty list makes xmltodict omit the tag entirely
+        description = []
     return {
         "title": title,
         "description": description,
         "link": postcard.url,
-        "author": postcard.username,
+        "guid": postcard.url,
+        "author": f"{postcard.username}@tilde.town ({postcard.username})",
         "pubDate": datetime.fromtimestamp(postcard.timestamp, timezone.utc)
-            .strftime('%a, %d %b %Y %T %z'),
+            .strftime(RSS_DATE_FORMAT),
         "enclosure": {
             "@url": postcard.url,
+            "@type": postcard.mime_type,
+            "@length": postcard.length,
         },
-        # TODO: media:thumbnail, MIME type, content length
     }
 
 
 def main():
-    logs = subprocess.run(
-        args=[os.path.expanduser('~karlen/bin/mensch'), '-f'],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        check=True,
-    ).stdout.decode('utf-8').splitlines()
-
+    global cache
+    if CACHE_FILE.is_file():
+        cache = json.loads(CACHE_FILE.read_text())
     output = {
         "rss": {
             "@version": "2.0",
+            "@xmlns:atom": "http://www.w3.org/2005/Atom",
+            "@xmlns:sy": "http://purl.org/rss/1.0/modules/syndication/",
             "channel": {
                 "title": "#fridaypostcard",
                 "description": "to contribute, share a link to an image on "
                                "irc with the text #fridaypostcard. updated "
                                "every friday",
                 "link": "http://tilde.town/~jumblesale/fp.html",
+                "atom:link": {
+                    "@rel": "self",
+                    "@type": "application/rss+xml",
+                    "@href": "http://tilde.town/~lucidiot/fridaypostcard.xml"
+                },
                 "language": "en",
                 "pubDate": datetime.now(timezone.utc)
-                    .strftime('%a, %d %b %Y %T %z'),
-                "docs": "https://cyber.harvard.edu/rss/rss.html",
+                    .strftime(RSS_DATE_FORMAT),
+                "docs": "https://www.rssboard.org/rss-specification",
+                "webMaster": "lucidiot@tilde.town (~lucidiot)",
                 "generator": "fprss",
-                "item": list(map(to_item, filter(is_current_week,
-                                                 map(parse_log, logs)))),
+                "skipDays": {
+                    "day": [
+                        "Monday",
+                        "Tuesday",
+                        "Wednesday",
+                        "Thursday",
+                        "Saturday",
+                        "Sunday"
+                    ]
+                },
+                "sy:updatePeriod": "weekly",
+                "sy:updateFrequency": "1",
+                "sy:updateBase": "1971-01-01T18:00+00:00",
+                "item": list({
+                    # Deduplicate items by GUID
+                    item["guid"]: item
+                    for item in map(to_item, filter(None, map(parse_log, get_logs())))
+                }.values()),
             }
         }
     }
-    with open(os.path.expanduser('~/public_html/fridaypostcard.xml'), 'w') as f:
-        f.write(xmltodict.unparse(
-            output,
-            pretty=True,
-            short_empty_elements=True,
-        ))
+    xml = xmltodict.unparse(
+        output,
+        pretty=True,
+        short_empty_elements=True,
+    )
+    OUTPUT_PATH.write_text(xml)
+    CACHE_FILE.write_text(json.dumps(cache))
 
 
 if __name__ == '__main__':
     main()
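
A note on the dict-to-XML mapping used throughout `to_item()` and `main()`: `xmltodict.unparse` turns `@`-prefixed keys into attributes, emits one element per item for list values (which is why an empty `description` list drops the tag entirely), and collapses childless elements when `short_empty_elements=True` is passed. Here is a minimal, runnable sketch of that convention; it is not part of the patch, and the postcard values are invented for illustration:

```
import xmltodict

# '@'-prefixed keys become attributes; plain keys become child elements;
# a list value emits one element per item, so [] produces no tag at all.
doc = {
    "item": {
        "title": "Postcard from ~example",  # invented sample values
        "description": [],                  # omitted from the output
        "enclosure": {
            "@url": "http://i.imgur.com/abc123.jpg",
            "@type": "image/jpeg",
            "@length": "0",
        },
    }
}

# full_document=False skips the <?xml ...?> declaration for this fragment.
print(xmltodict.unparse(doc, pretty=True, short_empty_elements=True,
                        full_document=False))
# <item>
# 	<title>Postcard from ~example</title>
# 	<enclosure url="http://i.imgur.com/abc123.jpg" type="image/jpeg" length="0"/>
# </item>
```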