Rewrite fprss.py

- Only adds .jpg for Imgur links, closes #2
- Performs HTTP requests to retrieve the MIME type and size of all
  images, falling back to a set of known MIME types and a zero length
  for failed requests, and adds caching to avoid flooding, closes #3
- Drops the use of `mensch` and instead reads straight from the
  IRC log without a week filter, closes #4
- Adds mod_syndication and skipDays, closes #5
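
A minimal sketch of the probe-and-cache approach (simplified from the
diff below; `probe` is an illustrative name, not a function in fprss.py):

```
import requests

cache = {}  # url -> [mime_type, length], persisted to cache.json

def probe(url: str):
    if url not in cache:
        # stream=True fetches the headers without downloading the image body
        with requests.get(url, stream=True, timeout=5) as resp:
            resp.raise_for_status()
            cache[url] = [resp.headers.get('Content-Type', ''),
                          resp.headers.get('Content-Length', '0')]
    return cache[url]
```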
Lucidiot 2021-03-20 22:37:39 +00:00
parent 655eb5aa1a
commit 0e8d5a4379
Signed by: lucidiot
GPG Key ID: 3358C1CA6906FB8D
3 changed files with 151 additions and 51 deletions

.gitignore

@@ -1,3 +1,4 @@
__pycache__/
*.py[cod]
*$py.class
cache.json

README.md

@@ -6,7 +6,14 @@ A simple script to make a `#fridaypostcard` RSS feed.
* Python 3.6+
* [xmltodict](https://github.com/martinblech/xmltodict)
* [requests](https://requests.readthedocs.io/en/master/)
## Usage
Run [`fprss.py`](fprss.py) to build the #fridaypostcard RSS feed. You can add this script to CRON to build the feeds regularly.
Run [`fprss.py`](fprss.py) to build the #fridaypostcard RSS feed.
You can add this script to cron to rebuild the feed regularly; here is what I use to regenerate it every hour on Friday:
```
0 * * * 5 nice -n 19 /home/lucidiot/dev/fprss/fprss.py
```
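The feed is written to `~/public_html/fridaypostcard.xml`, and HTTP probe results are cached in `cache.json` next to the script, so repeated runs do not re-request every image.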

fprss.py

@@ -1,102 +1,194 @@
#!/usr/bin/env python3
from collections import namedtuple
from datetime import datetime, timezone
import os.path
from pathlib import Path
from requests.exceptions import HTTPError
from typing import Iterator, Optional
from urllib.parse import urlparse
import json
import re
import subprocess
import sys
import requests
import xmltodict
URL_REGEX = re.compile(r'(?P<url>https?://[^\s]+)')
Postcard = namedtuple('Postcard', ['timestamp', 'username', 'url', 'message'])
RSS_DATE_FORMAT = '%a, %d %b %Y %T %z'
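# Strips mIRC formatting codes: bold, reset, monospace, reverse, italics,
# strikethrough, underline, and indexed (\003) or hex (\004) colours,
# e.g. '\x0304,07hi\x03 \x02town\x02' sanitizes to 'hi town'.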
SANITIZE_REGEX = re.compile(r'(?:[\002\017\021\026\035\036\037]|\003(?:[0-9]{1,2}(?:,[0-9]{1,2})?)?|\004(?:[0-9A-F]{,6})?)', re.IGNORECASE)
URL_REGEX = re.compile(r'https?://[A-Za-z0-9-._~:/?#[\]%@!$&\'()*+,;=]+', re.IGNORECASE)
LOG_FILE = Path('~archangelic/irc/log').expanduser()
CACHE_FILE = Path(__file__).absolute().parent / 'cache.json'
OUTPUT_PATH = Path('~/public_html/fridaypostcard.xml').expanduser()
IGNORE_USERNAMES = {'quote_bot'}
# We cannot safely assume we will know all image extensions, but there are some obvious and common extensions that we can ignore.
IGNORE_EXTENSIONS = {'.html', '.htm', '.xml', '.json'}
KNOWN_MIME_TYPES = {
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.jp2': 'image/jp2',
    '.bmp': 'image/bmp',
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.svg': 'image/svg+xml',
    '.webp': 'image/webp',
}
Postcard = namedtuple('Postcard', ['timestamp', 'username', 'url', 'message', 'mime_type', 'length'])
# MIME type and length cache to avoid making hundreds of requests each time
cache = {}
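# Example cache.json entry (hypothetical URL, for illustration):
#   {"http://i.imgur.com/abc123.jpg": ["image/jpeg", "48213"]}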
def parse_log(log: str) -> Postcard:
def get_logs() -> Iterator[str]:
    with LOG_FILE.open('r') as f:
        for line in f:
            if '#fridaypostcard' in line:
                yield line
def sanitize_message(message: str) -> str:
    return SANITIZE_REGEX.sub('', message)
def parse_log(log: str) -> Optional[Postcard]:
    # maxsplit=2: keep any tabs inside the message itself intact
    timestamp, username, message = log.split("\t", 2)
    url = URL_REGEX.search(message).group("url")
    message = message.replace(url, '').replace('#fridaypostcard', '').strip()
    if username in IGNORE_USERNAMES:
        return
    message = sanitize_message(message)
    match = URL_REGEX.search(message)
    # Ignore messages without a valid URL
    if not match:
        return
    url_str = match.group()
    message = message.replace(url_str, '').replace('#fridaypostcard', '').strip()
    try:
        url = urlparse(url_str)
    except ValueError:
        return
    extension = Path(url.path).suffix
    if extension in IGNORE_EXTENSIONS:
        return
    # Force-replace https with http to ensure PSP compatibility
    url = url.replace('https', 'http') \
        .replace('http://imgur', 'http://i.imgur')
    if not any(map(url.lower().endswith, (
        '.jpg',
        '.gif',
        '.png',
        '.svg',
        '.webp',
        '.bmp',
        '.tif'
    ))):
        url = url + '.jpg'
    url_str = url_str.replace('https', 'http')
    # Turn Imgur links into direct links
    if url.netloc == 'imgur.com':
        url_str = url_str.replace('http://imgur', 'http://i.imgur')
        if extension not in KNOWN_MIME_TYPES:
            url_str += '.jpg'
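    # Check the probe cache first; an empty MIME type means a request is still needed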
    mime_type, length = cache.get(url_str, ['', '0'])
    if not mime_type:
        try:
            with requests.get(url_str, allow_redirects=True, stream=True, timeout=5) as resp:
                resp.raise_for_status()
                length = resp.headers.get('Content-Length', '0')
                mime_type = resp.headers.get('Content-Type', KNOWN_MIME_TYPES.get(extension, ''))
        except HTTPError as e:
            # Dirty hack to avoid repeating lots of requests for images that are
            # now broken: cache a fallback MIME type for 4xx client errors,
            # which are likely permanent.
            if 400 <= e.response.status_code < 500:
                mime_type = KNOWN_MIME_TYPES.get(extension, 'image/x-error')
                length = '0'
                cache[url_str] = [mime_type, length]
            return
        except Exception:
            return
    cache[url_str] = [mime_type, length]
    return Postcard(
        timestamp=int(timestamp),
        username=username,
        url=url,
        url=url_str,
        message=message,
        mime_type=mime_type,
        length=length,
    )
def is_current_week(postcard):
    return (
        datetime.utcnow() - datetime.fromtimestamp(postcard.timestamp)
    ).days < 7
def to_item(postcard):
    if postcard.message:
        title = postcard.message
        description = postcard.message + '\n~' + postcard.username
        description = f'{postcard.message}<br />~{postcard.username}'
    else:
        title = 'Postcard from ~' + postcard.username
        description = '~' + postcard.username
        title = f'Postcard from ~{postcard.username}'
        # An empty list causes xmltodict to omit the tag entirely
        description = []
    return {
        "title": title,
        "description": description,
        "link": postcard.url,
        "author": postcard.username,
        "guid": postcard.url,
        "author": f"{postcard.username}@tilde.town ({postcard.username})",
        "pubDate": datetime.fromtimestamp(postcard.timestamp, timezone.utc)
            .strftime('%a, %d %b %Y %T %z'),
            .strftime(RSS_DATE_FORMAT),
        "enclosure": {
            "@url": postcard.url,
            "@type": postcard.mime_type,
            "@length": postcard.length,
        },
        # TODO: media:thumbnail, MIME type, content length
    }
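# A generated <item> then looks roughly like this (illustrative values):
#   <item>
#     <title>greetings from the beach</title>
#     <description>greetings from the beach&lt;br /&gt;~someone</description>
#     <link>http://i.imgur.com/abc123.jpg</link>
#     <guid>http://i.imgur.com/abc123.jpg</guid>
#     <author>someone@tilde.town (someone)</author>
#     <pubDate>Fri, 19 Mar 2021 18:00:00 +0000</pubDate>
#     <enclosure url="http://i.imgur.com/abc123.jpg" type="image/jpeg" length="48213"/>
#   </item>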
def main():
    logs = subprocess.run(
        args=[os.path.expanduser('~karlen/bin/mensch'), '-f'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
    ).stdout.decode('utf-8').splitlines()
    global cache
    if CACHE_FILE.is_file():
        cache = json.loads(CACHE_FILE.read_text())
    output = {
        "rss": {
            "@version": "2.0",
            "@xmlns:atom": "http://www.w3.org/2005/Atom",
            "@xmlns:sy": "http://purl.org/rss/1.0/modules/syndication/",
            "channel": {
                "title": "#fridaypostcard",
                "description": "to contribute, share a link to an image on "
                               "irc with the text #fridaypostcard. updated "
                               "every friday",
                "link": "http://tilde.town/~jumblesale/fp.html",
                "atom:link": {
                    "@rel": "self",
                    "@type": "application/rss+xml",
                    "@href": "http://tilde.town/~lucidiot/fridaypostcard.xml"
                },
                "language": "en",
                "pubDate": datetime.now(timezone.utc)
                    .strftime('%a, %d %b %Y %T %z'),
                "docs": "https://cyber.harvard.edu/rss/rss.html",
                    .strftime(RSS_DATE_FORMAT),
                "docs": "https://www.rssboard.org/rss-specification",
                "webMaster": "lucidiot@tilde.town (~lucidiot)",
                "generator": "fprss",
                "item": list(map(to_item, filter(is_current_week,
                                                 map(parse_log, logs)))),
                "skipDays": {
                    "day": [
                        "Monday",
                        "Tuesday",
                        "Wednesday",
                        "Thursday",
                        "Saturday",
                        "Sunday"
                    ]
                },
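                # mod_syndication: 1971-01-01 was a Friday, so a weekly period
                # from this base marks updates at 18:00 UTC every Friday.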
"sy:updatePeriod": "weekly",
"sy:updateFrequency": "1",
"sy:updateBase": "1971-01-01T18:00+00:00",
"item": list({
# Unique by GUID
item["guid"]: item
for item in map(to_item, filter(None, map(parse_log, get_logs())))
}.values()),
}
}
}
    with open(os.path.expanduser('~/public_html/fridaypostcard.xml'), 'w') as f:
        f.write(xmltodict.unparse(
            output,
            pretty=True,
            short_empty_elements=True,
        ))
    output = xmltodict.unparse(
        output,
        pretty=True,
        short_empty_elements=True,
    )
    OUTPUT_PATH.write_text(output)
    CACHE_FILE.write_text(json.dumps(cache))
if __name__ == '__main__':
    main()