fprss/fprss.py

229 lines
6.8 KiB
Python
Executable File

#!/usr/bin/env python3
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Iterator, NamedTuple, Optional, Tuple
from urllib.parse import urlparse
import requests
import xmltodict # type: ignore
from requests.exceptions import HTTPError
# RFC 822-style date used by RSS <pubDate> elements.
# NOTE: '%T' is a glibc extension that Python's strftime does not guarantee
# on every platform; '%H:%M:%S' is the portable, documented equivalent.
RSS_DATE_FORMAT = '%a, %d %b %Y %H:%M:%S %z'
# Strips mIRC formatting control codes (bold, colour, hex colour,
# reverse, etc.) from chat messages before they enter the feed.
SANITIZE_REGEX = re.compile(
    r'(?:[\002\017\021\026\035\036\037]'
    r'|\003(?:[0-9]{1,2}(?:,[0-9]{1,2})?)?'
    r'|\004(?:[0-9A-F]{,6})?)',
    re.IGNORECASE
)
# Matches the first http(s) URL in a message.
URL_REGEX = re.compile(
    r'https?://[A-Za-z0-9-._~:/?#[\]%@!$&\'()*+,;=]+',
    re.IGNORECASE
)
# IRC log the feed is built from; one tab-separated entry per line.
LOG_FILE = Path('~archangelic/irc/log').expanduser()
# JSON cache file stored next to this script.
CACHE_FILE = Path(__file__).absolute().parent / 'cache.json'
# Generated RSS feed, served from the user's public_html.
OUTPUT_PATH = Path('~/public_html/fridaypostcard.xml').expanduser()
# Users whose messages must never become feed items.
IGNORE_USERNAMES = {'quote_bot'}
# We cannot safely assume we will know all image extensions,
# but there are some obvious and common extensions that we can ignore.
IGNORE_EXTENSIONS = {'.html', '.htm', '.xml', '.json'}
# Fallback MIME types used when an HTTP response cannot tell us better.
KNOWN_MIME_TYPES = {
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.jp2': 'image/jp2',
    '.bmp': 'image/bmp',
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.svg': 'image/svg+xml',
    '.webp': 'image/webp',
}
# MIME type and length cache to avoid making hundreds of requests each time.
# Maps URL -> (mime_type, content_length); persisted as JSON in CACHE_FILE.
cache: Dict[str, Tuple[str, str]] = {}
class Postcard(NamedTuple):
    """One parsed #fridaypostcard submission, ready to become an RSS item."""
    timestamp: int  # Unix timestamp of the IRC message
    username: str   # IRC nick of the submitter
    url: str        # direct (http) link to the image
    message: str    # message text with the URL and hashtag stripped
    mime_type: str  # MIME type of the image (from HTTP response or cache)
    length: str     # Content-Length as a string; '0' when unknown
def get_logs() -> Iterator[str]:
    """Yield every raw log line that mentions the #fridaypostcard tag."""
    with LOG_FILE.open('r') as log_file:
        yield from (entry for entry in log_file if '#fridaypostcard' in entry)
def sanitize_message(message: str) -> str:
    """Return *message* with all mIRC formatting control codes removed."""
    cleaned = SANITIZE_REGEX.sub('', message)
    return cleaned
def parse_log(log: str) -> Optional[Postcard]:
    """Parse one tab-separated log line into a Postcard.

    Returns None when the line should not become a feed item: malformed
    lines, ignored users, messages without a URL, ignored extensions, or
    URLs that could not be fetched.
    """
    # Log format: "<timestamp>\t<username>\t<message>". maxsplit=2 keeps
    # any literal tabs inside the message; with maxsplit=3 a tab in the
    # message would produce four fields and crash the unpack.
    try:
        timestamp, username, message = log.split("\t", 2)
    except ValueError:
        # Malformed line with fewer than three fields; skip it.
        return None
    if username in IGNORE_USERNAMES:
        return None
    message = sanitize_message(message)
    match = URL_REGEX.search(message)
    # Ignore messages with no (valid) URL
    if not match:
        return None
    url_str = match.group()
    message = message \
        .replace(url_str, '') \
        .replace('#fridaypostcard', '') \
        .strip()
    try:
        url = urlparse(url_str)
    except Exception:
        # urlparse can raise ValueError on some malformed netlocs.
        return None
    extension = Path(url.path).suffix
    if extension in IGNORE_EXTENSIONS:
        return None
    # Force-replace https with http to ensure PSP compatibility.
    # Only rewrite the scheme; a blanket replace('https', 'http') would
    # also mangle URLs containing "https" later in the path.
    if url_str.startswith('https://'):
        url_str = 'http://' + url_str[len('https://'):]
    # Turn Imgur page links into direct image links
    if url.netloc == 'imgur.com':
        url_str = url_str.replace('http://imgur', 'http://i.imgur')
        if extension not in KNOWN_MIME_TYPES:
            url_str += '.jpg'
    mime_type, length = cache.get(url_str, ('', '0'))
    if not mime_type:
        try:
            with requests.get(
                    url_str,
                    allow_redirects=True,
                    stream=True,
                    timeout=5) as resp:
                resp.raise_for_status()
                length = resp.headers.get('Content-Length', '0')
                mime_type = resp.headers.get(
                    'Content-Type',
                    KNOWN_MIME_TYPES.get(extension, '')
                )
        except HTTPError as e:
            # Dirty hack to avoid repeating lots of requests
            # for images that are now broken: cache a placeholder for
            # client-error responses, then drop the postcard.
            if e.response.status_code >= 400 and e.response.status_code <= 500:
                mime_type = KNOWN_MIME_TYPES.get(extension, 'image/x-error')
                length = '0'
                cache[url_str] = (mime_type, length)
            return None
        except Exception:
            # Network errors, timeouts, etc.: skip without caching so the
            # URL gets retried on the next run.
            return None
    cache[url_str] = (mime_type, length)
    return Postcard(
        timestamp=int(timestamp),
        username=username,
        url=url_str,
        message=message,
        mime_type=mime_type,
        length=length,
    )
def to_item(postcard):
    """Convert a Postcard into an xmltodict-style RSS <item> mapping."""
    if postcard.message:
        title = postcard.message
        description = f'{postcard.message}<br />~{postcard.username}'
    else:
        title = f'Postcard from ~{postcard.username}'
        # xmltodict skips tags whose value is an empty list.
        description = []
    pub_date = datetime \
        .fromtimestamp(postcard.timestamp, timezone.utc) \
        .strftime(RSS_DATE_FORMAT)
    enclosure = {
        "@url": postcard.url,
        "@type": postcard.mime_type,
        "@length": postcard.length,
    }
    # Key order is preserved in the XML output, so it must not change.
    return {
        "title": title,
        "description": description,
        "link": postcard.url,
        "guid": postcard.url,
        "author": f"{postcard.username}@tilde.town ({postcard.username})",
        "pubDate": pub_date,
        "enclosure": enclosure,
    }
def main():
    """Build the #fridaypostcard RSS feed and write it to OUTPUT_PATH."""
    # The cache is module-level state shared with parse_log().
    global cache
    # Warm the MIME/length cache from the previous run, if any.
    if CACHE_FILE.is_file():
        cache = json.loads(CACHE_FILE.read_text())
    # xmltodict convention: '@name' keys become XML attributes and
    # nested dicts become child elements.
    output = {
        "rss": {
            "@version": "2.0",
            "@xmlns:atom": "http://www.w3.org/2005/Atom",
            "@xmlns:sy": "http://purl.org/rss/1.0/modules/syndication/",
            "channel": {
                "title": "#fridaypostcard",
                "description": "to contribute, share a link to an image on "
                               "irc with the text #fridaypostcard. updated "
                               "every friday",
                "link": "http://tilde.town/~jumblesale/fp.html",
                "atom:link": {
                    "@rel": "self",
                    "@type": "application/rss+xml",
                    "@href": "http://tilde.town/~lucidiot/fridaypostcard.xml"
                },
                "language": "en",
                "pubDate": datetime.now(timezone.utc)
                                   .strftime(RSS_DATE_FORMAT),
                "docs": "https://www.rssboard.org/rss-specification",
                "webMaster": "lucidiot@tilde.town (~lucidiot)",
                "generator": "fprss",
                # The feed is only updated on Fridays.
                "skipDays": {
                    "day": [
                        "Monday",
                        "Tuesday",
                        "Wednesday",
                        "Thursday",
                        "Saturday",
                        "Sunday"
                    ]
                },
                "sy:updatePeriod": "weekly",
                "sy:updateFrequency": "1",
                "sy:updateBase": "1971-01-01T18:00+00:00",
                "item": list({
                    # Unique by GUID
                    item["guid"]: item
                    for item in map(
                        to_item,
                        filter(
                            None,
                            map(parse_log, get_logs())
                        )
                    )
                }.values()),
            }
        }
    }
    output = xmltodict.unparse(
        output,
        pretty=True,
        short_empty_elements=True,
    )
    OUTPUT_PATH.write_text(output)
    # Persist the cache so future runs skip already-checked URLs.
    CACHE_FILE.write_text(json.dumps(cache))
# Entry point: regenerate the feed when executed as a script.
if __name__ == '__main__':
    main()