#!/usr/bin/env python3
import json
import re

from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Iterator, NamedTuple, Optional, Tuple
from urllib.parse import urlparse

import requests
import xmltodict  # type: ignore
from requests.exceptions import HTTPError
# RSS 2.0 (RFC 822) date format. `%T` is a glibc strftime extension and is
# not portable across Python platforms (e.g. it fails on Windows), so spell
# out the equivalent `%H:%M:%S` explicitly.
RSS_DATE_FORMAT = '%a, %d %b %Y %H:%M:%S %z'
# Strips IRC control/formatting codes (bold, reset, reverse, italics,
# underline, \003 color codes with optional fg,bg digits, and \004
# hex-color codes) so they do not leak into the feed text.
SANITIZE_REGEX = re.compile(
    r'(?:[\002\017\021\026\035\036\037]'
    r'|\003(?:[0-9]{1,2}(?:,[0-9]{1,2})?)?'
    r'|\004(?:[0-9A-F]{,6})?)',
    re.IGNORECASE
)
# Matches the first http(s) URL found in a message.
URL_REGEX = re.compile(
    r'https?://[A-Za-z0-9-._~:/?#[\]%@!$&\'()*+,;=]+',
    re.IGNORECASE
)
# IRC log to scan; lives under another user's home directory.
LOG_FILE = Path('~archangelic/irc/log').expanduser()
# JSON cache of per-URL (mime_type, length), kept next to this script.
CACHE_FILE = Path(__file__).absolute().parent / 'cache.json'
# Where the generated RSS feed is written.
OUTPUT_PATH = Path('~/public_html/fridaypostcard.xml').expanduser()
# Nicks whose messages should never become postcards (bots).
IGNORE_USERNAMES = {'quote_bot'}
# We cannot safely assume we will know all image extensions,
# but there are some obvious and common extensions that we can ignore.
IGNORE_EXTENSIONS = {'.html', '.htm', '.xml', '.json'}
# Fallback MIME types by extension, used when an HTTP response does not
# supply a usable Content-Type header.
KNOWN_MIME_TYPES = {
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.jp2': 'image/jp2',
    '.bmp': 'image/bmp',
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.svg': 'image/svg+xml',
    '.webp': 'image/webp',
}
# MIME type and length cache to avoid making hundreds of requests each time
# url -> (mime_type, content_length); loaded from and saved to
# CACHE_FILE as JSON by main().
cache: Dict[str, Tuple[str, str]] = {}
class Postcard(NamedTuple):
    """One #fridaypostcard submission parsed from the IRC log."""
    timestamp: int  # Unix timestamp (seconds) from the log line
    username: str   # IRC nick of the poster
    url: str        # image URL (forced to http, imgur rewritten to direct)
    message: str    # message text with the URL and hashtag stripped out
    mime_type: str  # MIME type for the RSS enclosure
    length: str     # Content-Length for the RSS enclosure, kept as a string
def get_logs() -> Iterator[str]:
    """Yield every raw IRC log line that mentions #fridaypostcard."""
    with LOG_FILE.open('r') as handle:
        yield from (entry for entry in handle if '#fridaypostcard' in entry)
def sanitize_message(message: str) -> str:
    """Return *message* with IRC control/formatting codes removed."""
    sanitized = SANITIZE_REGEX.sub('', message)
    return sanitized
def parse_log(log: str) -> Optional[Postcard]:
    """Parse one tab-separated IRC log line into a Postcard.

    Returns None for lines that should be skipped: ignored usernames,
    messages without a URL, URLs with a known non-image extension, and
    URLs that cannot be fetched.
    """
    # maxsplit=2, not 3: a message that itself contains a tab must stay in
    # one piece, otherwise the 3-name unpacking raises ValueError.
    timestamp, username, message = log.split("\t", 2)
    if username in IGNORE_USERNAMES:
        return None
    message = sanitize_message(message)
    match = URL_REGEX.search(message)
    # Ignore messages with invalid URLs
    if not match:
        return None
    url_str = match.group()
    message = message \
        .replace(url_str, '') \
        .replace('#fridaypostcard', '') \
        .strip()
    try:
        url = urlparse(url_str)
    except Exception:
        return None
    extension = Path(url.path).suffix
    if extension in IGNORE_EXTENSIONS:
        return None

    # Force-replace https with http to ensure PSP compatibility.
    # Only rewrite the scheme prefix; a blanket replace('https', 'http')
    # would also mangle any 'https' occurring later in the URL.
    if url_str.startswith('https://'):
        url_str = 'http://' + url_str[len('https://'):]
    # Turn Imgur links into direct links
    if url.netloc == 'imgur.com':
        url_str = url_str.replace('http://imgur', 'http://i.imgur')
        if extension not in KNOWN_MIME_TYPES:
            # Bare imgur page links serve the image directly with .jpg added.
            url_str += '.jpg'

    mime_type, length = cache.get(url_str, ('', '0'))
    if not mime_type:
        try:
            with requests.get(
                    url_str,
                    allow_redirects=True,
                    stream=True,
                    timeout=5) as resp:
                resp.raise_for_status()
                length = resp.headers.get('Content-Length', '0')
                mime_type = resp.headers.get(
                    'Content-Type',
                    KNOWN_MIME_TYPES.get(extension, '')
                )
        except HTTPError as e:
            # Dirty hack to avoid repeating lots of requests for images
            # that are now broken: permanently cache 4xx client errors.
            # 5xx server errors may be transient, so do not cache them.
            if 400 <= e.response.status_code < 500:
                mime_type = KNOWN_MIME_TYPES.get(extension, 'image/x-error')
                length = '0'
                cache[url_str] = (mime_type, length)
            return None
        except Exception:
            return None
        cache[url_str] = (mime_type, length)

    return Postcard(
        timestamp=int(timestamp),
        username=username,
        url=url_str,
        message=message,
        mime_type=mime_type,
        length=length,
    )
def to_item(postcard):
    """Build the xmltodict structure for one RSS <item> element."""
    author = postcard.username
    if postcard.message:
        title = postcard.message
        description = f'{postcard.message}<br />~{author}'
    else:
        title = f'Postcard from ~{author}'
        # Empty arrays causes the tag to be ignored by xmltodict
        description = []
    pub_date = datetime.fromtimestamp(
        postcard.timestamp, timezone.utc
    ).strftime(RSS_DATE_FORMAT)
    return {
        "title": title,
        "description": description,
        "link": postcard.url,
        "guid": postcard.url,
        "author": f"{author}@tilde.town ({author})",
        "pubDate": pub_date,
        "enclosure": {
            "@url": postcard.url,
            "@type": postcard.mime_type,
            "@length": postcard.length,
        },
    }
def main():
    """Generate the #fridaypostcard RSS feed and persist the URL cache."""
    global cache
    # Reuse previously fetched MIME types and lengths when available.
    if CACHE_FILE.is_file():
        cache = json.loads(CACHE_FILE.read_text())

    postcards = (p for p in map(parse_log, get_logs()) if p)
    # Deduplicate items by GUID; later occurrences win.
    items_by_guid = {}
    for item in map(to_item, postcards):
        items_by_guid[item["guid"]] = item

    channel = {
        "title": "#fridaypostcard",
        "description": "to contribute, share a link to an image on "
                       "irc with the text #fridaypostcard. updated "
                       "every friday",
        "link": "http://tilde.town/~jumblesale/fp.html",
        "atom:link": {
            "@rel": "self",
            "@type": "application/rss+xml",
            "@href": "http://tilde.town/~lucidiot/fridaypostcard.xml"
        },
        "language": "en",
        "pubDate": datetime.now(timezone.utc).strftime(RSS_DATE_FORMAT),
        "docs": "https://www.rssboard.org/rss-specification",
        "webMaster": "lucidiot@tilde.town (~lucidiot)",
        "generator": "fprss",
        "skipDays": {
            "day": [
                "Monday",
                "Tuesday",
                "Wednesday",
                "Thursday",
                "Saturday",
                "Sunday"
            ]
        },
        "sy:updatePeriod": "weekly",
        "sy:updateFrequency": "1",
        "sy:updateBase": "1971-01-01T18:00+00:00",
        "item": list(items_by_guid.values()),
    }
    document = {
        "rss": {
            "@version": "2.0",
            "@xmlns:atom": "http://www.w3.org/2005/Atom",
            "@xmlns:sy": "http://purl.org/rss/1.0/modules/syndication/",
            "channel": channel,
        }
    }
    feed_xml = xmltodict.unparse(
        document,
        pretty=True,
        short_empty_elements=True,
    )
    OUTPUT_PATH.write_text(feed_xml)
    CACHE_FILE.write_text(json.dumps(cache))
# Run the feed generator when executed as a script.
if __name__ == '__main__':
    main()