229 lines
6.8 KiB
Python
Executable File
229 lines
6.8 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import json
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, Iterator, NamedTuple, Optional, Tuple
|
|
from urllib.parse import urlparse
|
|
|
|
import requests
|
|
import xmltodict # type: ignore
|
|
from requests.exceptions import HTTPError
|
|
|
|
# strftime format used for every pubDate in the feed,
# e.g. "Thu, 01 Jan 1970 00:00:00 +0000".
# The time is spelled out as %H:%M:%S rather than the %T shorthand: %T is a
# C-library extension that is not among the format codes documented by Python,
# so it is not guaranteed to work on every platform.
RSS_DATE_FORMAT = '%a, %d %b %Y %H:%M:%S %z'
|
|
# Control sequences stripped from messages (presumably IRC formatting codes):
#   * the single characters \x02, \x0f, \x11, \x16, \x1d, \x1e and \x1f;
#   * \x03, optionally followed by one or two digits and an optional
#     ",<one or two digits>" pair;
#   * \x04, optionally followed by up to six hexadecimal digits.
SANITIZE_REGEX = re.compile(
    r'(?:[\002\017\021\026\035\036\037]'
    r'|\003(?:[0-9]{1,2}(?:,[0-9]{1,2})?)?'
    r'|\004(?:[0-9A-F]{,6})?)',
    flags=re.IGNORECASE,
)
|
|
# Finds an http:// or https:// URL (scheme matched case-insensitively)
# followed by one or more of the characters listed in the class.
URL_REGEX = re.compile(
    r'https?://[A-Za-z0-9-._~:/?#[\]%@!$&\'()*+,;=]+',
    flags=re.IGNORECASE,
)
|
|
# Log file that is scanned for '#fridaypostcard' lines; it lives in the home
# directory of the user "archangelic".
LOG_FILE = Path('~archangelic/irc/log').expanduser()
# JSON file holding the MIME type / length cache, kept next to this script.
CACHE_FILE = Path(__file__).absolute().parent / 'cache.json'
# Where the generated RSS feed is written, in the current user's public_html.
OUTPUT_PATH = Path('~/public_html/fridaypostcard.xml').expanduser()
|
|
|
|
# Log lines written by these users never become postcards.
IGNORE_USERNAMES = {'quote_bot'}

# We cannot safely assume that we know every image extension, but some
# obvious and common extensions are certainly not images and can be skipped.
IGNORE_EXTENSIONS = {'.html', '.htm', '.xml', '.json'}

# Fallback MIME types, keyed by file extension (leading dot included).
KNOWN_MIME_TYPES = {
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.jp2': 'image/jp2',
    '.bmp': 'image/bmp',
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.svg': 'image/svg+xml',
    '.webp': 'image/webp',
}
|
|
|
|
# Maps a URL to its (MIME type, length) pair so that hundreds of HTTP
# requests do not have to be repeated on every run.
cache: Dict[str, Tuple[str, str]] = {}
|
|
|
|
|
|
class Postcard(NamedTuple):
    """A single shared image, holding everything needed to build a feed item."""

    # Integer timestamp taken from the first field of the log line.
    timestamp: int
    # Name of the user who posted the link.
    username: str
    # URL of the image.
    url: str
    # Remaining message text; may be an empty string.
    message: str
    # MIME type reported for the URL.
    mime_type: str
    # Length reported for the URL, kept as a string ('0' when unknown).
    length: str
|
|
|
|
|
|
def get_logs() -> Iterator[str]:
    """Lazily yield each line of LOG_FILE that contains '#fridaypostcard'.

    Lines are yielded unmodified, so they keep their trailing newline.
    """
    with LOG_FILE.open('r') as log_file:
        yield from (entry for entry in log_file if '#fridaypostcard' in entry)
|
|
|
|
|
|
def sanitize_message(message: str) -> str:
    """Return ``message`` with every match of SANITIZE_REGEX removed."""
    cleaned = SANITIZE_REGEX.sub('', message)
    return cleaned
|
|
|
|
|
|
def parse_log(log: str) -> Optional[Postcard]:
    """Turn one log line into a Postcard, or return None to skip the line.

    The line is expected to be ``timestamp<TAB>username<TAB>message``.

    None is returned when:
      * the line does not have three fields or its timestamp is not an integer;
      * the user is listed in IGNORE_USERNAMES;
      * the message contains no URL, or the URL cannot be parsed;
      * the URL's extension is listed in IGNORE_EXTENSIONS;
      * fetching the URL fails.

    Side effects: unless the URL is already in the module-level ``cache``,
    an HTTP GET (5 second timeout) is made to find the MIME type and length,
    and the result is stored in ``cache``.
    """
    # Split at most twice: a tab inside the message must stay in the message
    # instead of producing a fourth field and breaking the unpacking below.
    fields = log.split('\t', 2)
    if len(fields) != 3:
        return None
    raw_timestamp, username, message = fields

    # Reject unusable timestamps early, before any network request is made.
    try:
        timestamp = int(raw_timestamp)
    except ValueError:
        return None

    if username in IGNORE_USERNAMES:
        return None

    message = sanitize_message(message)
    match = URL_REGEX.search(message)
    # Ignore messages without a URL
    if not match:
        return None
    url_str = match.group()

    # What remains once the link and the hashtag are removed is the caption.
    message = message \
        .replace(url_str, '') \
        .replace('#fridaypostcard', '') \
        .strip()

    try:
        url = urlparse(url_str)
    except ValueError:
        return None

    # Extensions are compared in lower case so that '.JPG' or '.HTML'
    # are recognised as well.
    extension = Path(url.path).suffix.lower()
    if extension in IGNORE_EXTENSIONS:
        return None

    # Force http instead of https to ensure PSP compatibility.
    # Only the scheme is rewritten: a blanket str.replace would also mangle
    # any 'https' appearing later in the URL. urlparse lower-cases the scheme
    # without changing its length, so its length tells how much to cut off.
    url_str = 'http' + url_str[len(url.scheme):]

    # Turn Imgur links into direct links
    if url.netloc == 'imgur.com':
        url_str = url_str.replace('http://imgur', 'http://i.imgur', 1)
        if extension not in KNOWN_MIME_TYPES:
            url_str += '.jpg'

    mime_type, length = cache.get(url_str, ('', '0'))

    # NOTE(review): a URL cached by the error branch below has a non-empty
    # MIME type, so on later runs it skips the request and yields a Postcard.
    if not mime_type:
        try:
            with requests.get(
                    url_str,
                    allow_redirects=True,
                    stream=True,
                    timeout=5) as resp:
                resp.raise_for_status()
                length = resp.headers.get('Content-Length', '0')
                mime_type = resp.headers.get(
                    'Content-Type',
                    KNOWN_MIME_TYPES.get(extension, '')
                )
        except HTTPError as e:
            # Dirty hack to avoid repeating lots of requests
            # for images that are now broken.
            if 400 <= e.response.status_code <= 500:
                mime_type = KNOWN_MIME_TYPES.get(extension, 'image/x-error')
                length = '0'
                cache[url_str] = (mime_type, length)
            return None
        except Exception:
            # Deliberately broad: any other failure while fetching
            # (connection error, timeout, ...) just skips this line.
            return None
        cache[url_str] = (mime_type, length)

    return Postcard(
        timestamp=timestamp,
        username=username,
        url=url_str,
        message=message,
        mime_type=mime_type,
        length=length,
    )
|
|
|
|
|
|
def to_item(postcard: 'Postcard') -> dict:
    """Build the dict that xmltodict turns into one RSS ``<item>``.

    The title is the postcard's message, or 'Postcard from ~<username>' when
    there is no message. The description is only emitted when there is a
    message; it is an HTML fragment, so the message and username are
    HTML-escaped before being embedded in it.
    """
    # Imported locally so that this function does not depend on a
    # module-level import being present.
    from html import escape

    if postcard.message:
        title = postcard.message
        # The message comes straight from chat users: escape it so that it
        # cannot inject markup into the HTML description.
        description = (
            f'{escape(postcard.message, quote=False)}'
            f'<br />~{escape(postcard.username, quote=False)}'
        )
    else:
        title = f'Postcard from ~{postcard.username}'
        # Empty arrays cause the tag to be ignored by xmltodict
        description = []

    published = datetime.fromtimestamp(postcard.timestamp, timezone.utc)

    return {
        "title": title,
        "description": description,
        "link": postcard.url,
        "guid": postcard.url,
        "author": f"{postcard.username}@tilde.town ({postcard.username})",
        "pubDate": published.strftime(RSS_DATE_FORMAT),
        "enclosure": {
            "@url": postcard.url,
            "@type": postcard.mime_type,
            "@length": postcard.length,
        },
    }
|
|
|
|
|
|
def main() -> None:
    """Generate the RSS feed from the log and write it to OUTPUT_PATH.

    Loads the MIME type / length cache from CACHE_FILE if that file exists,
    turns every matching log line into a feed item (one item per GUID),
    writes the feed to OUTPUT_PATH and saves the updated cache.
    """
    global cache
    if CACHE_FILE.is_file():
        cache = json.loads(CACHE_FILE.read_text(encoding='utf-8'))

    # Taken before the logs are parsed, as parsing may make HTTP requests.
    generated_at = datetime.now(timezone.utc).strftime(RSS_DATE_FORMAT)

    postcards = filter(None, map(parse_log, get_logs()))
    # Unique by GUID: a later item with the same GUID replaces the earlier
    # one while keeping the position of its first occurrence.
    items_by_guid = {
        item["guid"]: item
        for item in map(to_item, postcards)
    }

    feed = {
        "rss": {
            "@version": "2.0",
            "@xmlns:atom": "http://www.w3.org/2005/Atom",
            "@xmlns:sy": "http://purl.org/rss/1.0/modules/syndication/",
            "channel": {
                "title": "#fridaypostcard",
                "description": "to contribute, share a link to an image on "
                               "irc with the text #fridaypostcard. updated "
                               "every friday",
                "link": "http://tilde.town/~jumblesale/fp.html",
                "atom:link": {
                    "@rel": "self",
                    "@type": "application/rss+xml",
                    "@href": "http://tilde.town/~lucidiot/fridaypostcard.xml"
                },
                "language": "en",
                "pubDate": generated_at,
                "docs": "https://www.rssboard.org/rss-specification",
                "webMaster": "lucidiot@tilde.town (~lucidiot)",
                "generator": "fprss",
                "skipDays": {
                    "day": [
                        "Monday",
                        "Tuesday",
                        "Wednesday",
                        "Thursday",
                        "Saturday",
                        "Sunday"
                    ]
                },
                "sy:updatePeriod": "weekly",
                "sy:updateFrequency": "1",
                "sy:updateBase": "1971-01-01T18:00+00:00",
                "item": list(items_by_guid.values()),
            }
        }
    }

    xml = xmltodict.unparse(
        feed,
        pretty=True,
        short_empty_elements=True,
    )
    # xmltodict declares encoding="utf-8" in the XML prolog by default, so
    # the file must really be written as UTF-8 and not in the locale encoding.
    OUTPUT_PATH.write_text(xml, encoding='utf-8')
    CACHE_FILE.write_text(json.dumps(cache), encoding='utf-8')
|
|
|
|
|
|
# Generate the feed only when this file is run as a script, not on import.
if __name__ == '__main__':
    main()
|