Rewrite fprss.py

- Only adds .jpg for Imgur links, closes #2
- Performs HTTP requests to retrieve the MIME type and size of all
  images, falling back to a set of known MIME types and a zero length
  for failed requests, and adds caching to avoid flooding, closes #3
- Drops the use of `mensch` and instead reads straight from the
  IRC log without a week filter, closes #4
- Adds mod_syndication and skipDays, closes #5
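
A minimal sketch of the probe-and-cache approach (simplified from the
diff below; `probe` is an illustrative name, not a function in fprss.py):

```
import requests

cache = {}  # url -> [mime_type, length], persisted to cache.json

def probe(url: str):
    if url not in cache:
        # stream=True fetches the headers without downloading the image body
        with requests.get(url, stream=True, timeout=5) as resp:
            resp.raise_for_status()
            cache[url] = [resp.headers.get('Content-Type', ''),
                          resp.headers.get('Content-Length', '0')]
    return cache[url]
```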
Lucidiot 2021-03-20 22:37:39 +00:00
parent 655eb5aa1a
commit 0e8d5a4379
Signed by: lucidiot
GPG Key ID: 3358C1CA6906FB8D
3 changed files with 151 additions and 51 deletions

.gitignore

@@ -1,3 +1,4 @@
__pycache__/
*.py[cod]
*$py.class
cache.json

README.md

@@ -6,7 +6,14 @@ A simple script to make a `#fridaypostcard` RSS feed.
* Python 3.6+
* [xmltodict](https://github.com/martinblech/xmltodict)
* [requests](https://requests.readthedocs.io/en/master/)
## Usage
Run [`fprss.py`](fprss.py) to build the #fridaypostcard RSS feed. You can add this script to CRON to build the feeds regularly.
Run [`fprss.py`](fprss.py) to build the #fridaypostcard RSS feed.
You can add this script to cron to rebuild the feed regularly; here is what I use to regenerate it every hour on Friday:
```
0 * * * 5 nice -n 19 /home/lucidiot/dev/fprss/fprss.py
```
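The feed is written to `~/public_html/fridaypostcard.xml`, and HTTP probe results are cached in `cache.json` next to the script, so repeated runs do not re-request every image.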

fprss.py

@@ -1,102 +1,194 @@
#!/usr/bin/env python3
from collections import namedtuple
from datetime import datetime, timezone
import os.path
from pathlib import Path
from requests.exceptions import HTTPError
from typing import Iterator, Optional
from urllib.parse import urlparse
import json
import re
import subprocess
import sys
import requests
import xmltodict
URL_REGEX = re.compile(r'(?P<url>https?://[^\s]+)')
Postcard = namedtuple('Postcard', ['timestamp', 'username', 'url', 'message'])
RSS_DATE_FORMAT = '%a, %d %b %Y %T %z'
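# Strips mIRC formatting codes: bold, reset, monospace, reverse, italics,
# strikethrough, underline, and indexed (\003) or hex (\004) colours,
# e.g. '\x0304,07hi\x03 \x02town\x02' sanitizes to 'hi town'.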
SANITIZE_REGEX = re.compile(r'(?:[\002\017\021\026\035\036\037]|\003(?:[0-9]{1,2}(?:,[0-9]{1,2})?)?|\004(?:[0-9A-F]{,6})?)', re.IGNORECASE)
URL_REGEX = re.compile(r'https?://[A-Za-z0-9-._~:/?#[\]%@!$&\'()*+,;=]+', re.IGNORECASE)
LOG_FILE = Path('~archangelic/irc/log').expanduser()
CACHE_FILE = Path(__file__).absolute().parent / 'cache.json'
OUTPUT_PATH = Path('~/public_html/fridaypostcard.xml').expanduser()
IGNORE_USERNAMES = {'quote_bot'}
# We cannot safely assume we will know all image extensions, but there are some obvious and common extensions that we can ignore.
IGNORE_EXTENSIONS = {'.html', '.htm', '.xml', '.json'}
KNOWN_MIME_TYPES = {
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.jp2': 'image/jp2',
    '.bmp': 'image/bmp',
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.svg': 'image/svg+xml',
    '.webp': 'image/webp',
}
Postcard = namedtuple('Postcard', ['timestamp', 'username', 'url', 'message', 'mime_type', 'length'])
# MIME type and length cache to avoid making hundreds of requests each time
cache = {}
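# Example cache.json entry (hypothetical URL, for illustration):
#   {"http://i.imgur.com/abc123.jpg": ["image/jpeg", "48213"]}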
def parse_log(log: str) -> Postcard:
def get_logs() -> Iterator[str]:
    with LOG_FILE.open('r') as f:
        for line in f:
            if '#fridaypostcard' in line:
                yield line
def sanitize_message(message: str) -> str:
    return SANITIZE_REGEX.sub('', message)
def parse_log(log: str) -> Optional[Postcard]:
    # maxsplit=2: keep any tabs inside the message itself intact
    timestamp, username, message = log.split("\t", 2)
    url = URL_REGEX.search(message).group("url")
    message = message.replace(url, '').replace('#fridaypostcard', '').strip()
    if username in IGNORE_USERNAMES:
        return
    message = sanitize_message(message)
    match = URL_REGEX.search(message)
    # Ignore messages without a valid URL
    if not match:
        return
    url_str = match.group()
    message = message.replace(url_str, '').replace('#fridaypostcard', '').strip()
    try:
        url = urlparse(url_str)
    except ValueError:
        return
    extension = Path(url.path).suffix
    if extension in IGNORE_EXTENSIONS:
        return
    # Force-replace https with http to ensure PSP compatibility
    url = url.replace('https', 'http') \
        .replace('http://imgur', 'http://i.imgur')
    if not any(map(url.lower().endswith, (
        '.jpg',
        '.gif',
        '.png',
        '.svg',
        '.webp',
        '.bmp',
        '.tif'
    ))):
        url = url + '.jpg'
    url_str = url_str.replace('https', 'http')
    # Turn Imgur links into direct links
    if url.netloc == 'imgur.com':
        url_str = url_str.replace('http://imgur', 'http://i.imgur')
        if extension not in KNOWN_MIME_TYPES:
            url_str += '.jpg'
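    # Check the probe cache first; an empty MIME type means a request is still needed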
    mime_type, length = cache.get(url_str, ['', '0'])
    if not mime_type:
        try:
            with requests.get(url_str, allow_redirects=True, stream=True, timeout=5) as resp:
                resp.raise_for_status()
                length = resp.headers.get('Content-Length', '0')
                mime_type = resp.headers.get('Content-Type', KNOWN_MIME_TYPES.get(extension, ''))
        except HTTPError as e:
            # Dirty hack to avoid repeating lots of requests for images that are
            # now broken: cache a fallback MIME type for 4xx client errors,
            # which are likely permanent.
            if 400 <= e.response.status_code < 500:
                mime_type = KNOWN_MIME_TYPES.get(extension, 'image/x-error')
                length = '0'
                cache[url_str] = [mime_type, length]
            return
        except Exception:
            return
    cache[url_str] = [mime_type, length]
    return Postcard(
        timestamp=int(timestamp),
        username=username,
        url=url,
        url=url_str,
        message=message,
        mime_type=mime_type,
        length=length,
    )
def is_current_week(postcard):
    return (
        datetime.utcnow() - datetime.fromtimestamp(postcard.timestamp)
    ).days < 7
def to_item(postcard):
    if postcard.message:
        title = postcard.message
        description = postcard.message + '\n~' + postcard.username
        description = f'{postcard.message}<br />~{postcard.username}'
    else:
        title = 'Postcard from ~' + postcard.username
        description = '~' + postcard.username
        title = f'Postcard from ~{postcard.username}'
        # An empty list causes xmltodict to omit the tag entirely
        description = []
    return {
        "title": title,
        "description": description,
        "link": postcard.url,
        "author": postcard.username,
        "guid": postcard.url,
        "author": f"{postcard.username}@tilde.town ({postcard.username})",
        "pubDate": datetime.fromtimestamp(postcard.timestamp, timezone.utc)
            .strftime('%a, %d %b %Y %T %z'),
            .strftime(RSS_DATE_FORMAT),
        "enclosure": {
            "@url": postcard.url,
            "@type": postcard.mime_type,
            "@length": postcard.length,
        },
        # TODO: media:thumbnail, MIME type, content length
    }
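# A generated <item> then looks roughly like this (illustrative values):
#   <item>
#     <title>greetings from the beach</title>
#     <description>greetings from the beach&lt;br /&gt;~someone</description>
#     <link>http://i.imgur.com/abc123.jpg</link>
#     <guid>http://i.imgur.com/abc123.jpg</guid>
#     <author>someone@tilde.town (someone)</author>
#     <pubDate>Fri, 19 Mar 2021 18:00:00 +0000</pubDate>
#     <enclosure url="http://i.imgur.com/abc123.jpg" type="image/jpeg" length="48213"/>
#   </item>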
def main():
    logs = subprocess.run(
        args=[os.path.expanduser('~karlen/bin/mensch'), '-f'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
    ).stdout.decode('utf-8').splitlines()
    global cache
    if CACHE_FILE.is_file():
        cache = json.loads(CACHE_FILE.read_text())
    output = {
        "rss": {
            "@version": "2.0",
            "@xmlns:atom": "http://www.w3.org/2005/Atom",
            "@xmlns:sy": "http://purl.org/rss/1.0/modules/syndication/",
            "channel": {
                "title": "#fridaypostcard",
                "description": "to contribute, share a link to an image on "
                               "irc with the text #fridaypostcard. updated "
                               "every friday",
                "link": "http://tilde.town/~jumblesale/fp.html",
                "atom:link": {
                    "@rel": "self",
                    "@type": "application/rss+xml",
                    "@href": "http://tilde.town/~lucidiot/fridaypostcard.xml"
                },
                "language": "en",
                "pubDate": datetime.now(timezone.utc)
                    .strftime('%a, %d %b %Y %T %z'),
                "docs": "https://cyber.harvard.edu/rss/rss.html",
                    .strftime(RSS_DATE_FORMAT),
                "docs": "https://www.rssboard.org/rss-specification",
                "webMaster": "lucidiot@tilde.town (~lucidiot)",
                "generator": "fprss",
                "item": list(map(to_item, filter(is_current_week,
                                                 map(parse_log, logs)))),
                "skipDays": {
                    "day": [
                        "Monday",
                        "Tuesday",
                        "Wednesday",
                        "Thursday",
                        "Saturday",
                        "Sunday"
                    ]
                },
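                # mod_syndication: 1971-01-01 was a Friday, so a weekly period
                # from this base marks updates at 18:00 UTC every Friday.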
"sy:updatePeriod": "weekly",
"sy:updateFrequency": "1",
"sy:updateBase": "1971-01-01T18:00+00:00",
"item": list({
# Unique by GUID
item["guid"]: item
for item in map(to_item, filter(None, map(parse_log, get_logs())))
}.values()),
}
}
}
    with open(os.path.expanduser('~/public_html/fridaypostcard.xml'), 'w') as f:
        f.write(xmltodict.unparse(
            output,
            pretty=True,
            short_empty_elements=True,
        ))
    output = xmltodict.unparse(
        output,
        pretty=True,
        short_empty_elements=True,
    )
    OUTPUT_PATH.write_text(output)
    CACHE_FILE.write_text(json.dumps(cache))
if __name__ == '__main__':
    main()