Rewrite fprss.py
- Only adds .jpg for Imgur links, closes #2 - Performs HTTP requests to retrieve the MIME type and size of all images, falling back to a set of known images and zero length for failing requests, and adds caching to avoid flooding, closes #3 - Drops the use of `mensch` and instead reads straight from the IRC log without a week filter, closes #4 - Adds mod_syndication and skipDays, closes #5
This commit is contained in:
parent
655eb5aa1a
commit
0e8d5a4379
|
@ -1,3 +1,4 @@
|
|||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
cache.json
|
||||
|
|
|
@ -6,7 +6,14 @@ A simple script to make a `#fridaypostcard` RSS feed.
|
|||
|
||||
* Python 3.6+
|
||||
* [xmltodict](https://github.com/martinblech/xmltodict)
|
||||
* [requests](https://requests.readthedocs.io/en/master/)
|
||||
|
||||
## Usage
|
||||
|
||||
Run [`fprss.py`](fprss.py) to build the #fridaypostcard RSS feed. You can add this script to CRON to build the feeds regularly.
|
||||
Run [`fprss.py`](fprss.py) to build the #fridaypostcard RSS feed.
|
||||
|
||||
You can add this script to CRON to build the feeds regularly; here is what I use to generate every hour on Friday:
|
||||
|
||||
```
|
||||
0 * * * 5 nice -n 19 /home/lucidiot/dev/fprss/fprss.py
|
||||
```
|
||||
|
|
192
fprss.py
192
fprss.py
|
@ -1,102 +1,194 @@
|
|||
#!/usr/bin/env python3
|
||||
from collections import namedtuple
|
||||
from datetime import datetime, timezone
|
||||
import os.path
|
||||
from pathlib import Path
|
||||
from requests.exceptions import HTTPError
|
||||
from typing import Iterator, Optional
|
||||
from urllib.parse import urlparse
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import requests
|
||||
import xmltodict
|
||||
|
||||
URL_REGEX = re.compile(r'(?P<url>https?://[^\s]+)')
|
||||
Postcard = namedtuple('Postcard', ['timestamp', 'username', 'url', 'message'])
|
||||
RSS_DATE_FORMAT = '%a, %d %b %Y %T %z'
|
||||
SANITIZE_REGEX = re.compile(r'(?:[\002\017\021\026\035\036\037]|\003(?:[0-9]{1,2}(?:,[0-9]{1,2})?)?|\004(?:[0-9A-F]{,6})?)', re.IGNORECASE)
|
||||
URL_REGEX = re.compile(r'https?://[A-Za-z0-9-._~:/?#[\]%@!$&\'()*+,;=]+', re.IGNORECASE)
|
||||
LOG_FILE = Path('~archangelic/irc/log').expanduser()
|
||||
CACHE_FILE = Path(__file__).absolute().parent / 'cache.json'
|
||||
OUTPUT_PATH = Path('~/public_html/fridaypostcard.xml').expanduser()
|
||||
|
||||
IGNORE_USERNAMES = {'quote_bot'}
|
||||
# We cannot safely assume we will know all image extensions, but there are some obvious and common extensions that we can ignore.
|
||||
IGNORE_EXTENSIONS = {'.html', '.htm', '.xml', '.json'}
|
||||
KNOWN_MIME_TYPES = {
|
||||
'.jpg': 'image/jpeg',
|
||||
'.jpeg': 'image/jpeg',
|
||||
'.jp2': 'image/jp2',
|
||||
'.bmp': 'image/bmp',
|
||||
'.png': 'image/png',
|
||||
'.gif': 'image/gif',
|
||||
'.svg': 'image/svg+xml',
|
||||
'.webp': 'image/webp',
|
||||
}
|
||||
|
||||
Postcard = namedtuple('Postcard', ['timestamp', 'username', 'url', 'message', 'mime_type', 'length'])
|
||||
|
||||
# MIME type and length cache to avoid making hundreds of requests each time
|
||||
cache = {}
|
||||
|
||||
|
||||
def get_logs() -> Iterator[str]:
    """Yield every raw IRC log line that mentions the #fridaypostcard tag."""
    with LOG_FILE.open('r') as handle:
        yield from (line for line in handle if '#fridaypostcard' in line)
|
||||
|
||||
|
||||
def sanitize_message(message: str) -> str:
    """Strip mIRC colour/formatting control codes from an IRC message."""
    return re.sub(SANITIZE_REGEX, '', message)
|
||||
|
||||
|
||||
def parse_log(log: str) -> Optional[Postcard]:
    """Parse one IRC log line into a Postcard, or None when it is unusable.

    A line is dropped when it comes from an ignored bot, contains no URL,
    points at an obviously non-image resource, or its image metadata cannot
    be retrieved over HTTP.
    """
    # Log format is "timestamp<TAB>username<TAB>message". Limit the split to
    # two so tabs inside the message body cannot break the unpacking (the
    # previous maxsplit of 3 raised ValueError on such lines).
    timestamp, username, message = log.split("\t", 2)

    if username in IGNORE_USERNAMES:
        return

    message = sanitize_message(message)
    match = URL_REGEX.search(message)
    # Ignore messages with invalid URLs
    if not match:
        return
    url_str = match.group()

    message = message.replace(url_str, '').replace('#fridaypostcard', '').strip()

    try:
        url = urlparse(url_str)
    except ValueError:
        # urlparse only raises ValueError (e.g. invalid IPv6 netloc).
        return

    extension = Path(url.path).suffix
    if extension in IGNORE_EXTENSIONS:
        return

    # Force-replace https with http to ensure PSP compatibility. Only the
    # scheme prefix is rewritten: a blanket str.replace would also corrupt
    # any "https" that happens to appear later in the URL.
    if url_str.startswith('https://'):
        url_str = 'http://' + url_str[len('https://'):]

    # Turn Imgur links into direct links
    if url.netloc == 'imgur.com':
        url_str = url_str.replace('http://imgur', 'http://i.imgur')
        # Imgur serves an image for any extension, so .jpg is a safe default.
        if extension not in KNOWN_MIME_TYPES:
            url_str += '.jpg'

    mime_type, length = cache.get(url_str, ['', '0'])

    if not mime_type:
        try:
            with requests.get(url_str, allow_redirects=True, stream=True, timeout=5) as resp:
                resp.raise_for_status()
                length = resp.headers.get('Content-Length', '0')
                mime_type = resp.headers.get('Content-Type', KNOWN_MIME_TYPES.get(extension, ''))
        except HTTPError as e:
            # Dirty hack to avoid repeating lots of requests for images that are now broken.
            if 400 <= e.response.status_code <= 500:
                mime_type = KNOWN_MIME_TYPES.get(extension, 'image/x-error')
                length = '0'
                cache[url_str] = [mime_type, length]
            return
        except Exception:
            # Other failures (timeouts, DNS, connection resets) are treated
            # as transient: nothing is cached so the URL is retried next run.
            return
        cache[url_str] = [mime_type, length]

    return Postcard(
        timestamp=int(timestamp),
        username=username,
        url=url_str,
        message=message,
        mime_type=mime_type,
        length=length,
    )
|
||||
|
||||
|
||||
def is_current_week(postcard):
    """Return True when the postcard was posted less than 7 days ago.

    Both sides of the subtraction are timezone-aware UTC datetimes. The
    previous version subtracted naive local time (``fromtimestamp``) from
    naive UTC (``utcnow``), which skewed the cutoff by the host's UTC offset.
    """
    age = datetime.now(timezone.utc) - datetime.fromtimestamp(postcard.timestamp, timezone.utc)
    return age.days < 7
|
||||
|
||||
|
||||
def to_item(postcard):
    """Build the RSS <item> mapping (xmltodict layout) for one postcard."""
    author = f"{postcard.username}@tilde.town ({postcard.username})"
    pub_date = datetime.fromtimestamp(postcard.timestamp, timezone.utc) \
        .strftime(RSS_DATE_FORMAT)

    if postcard.message:
        title = postcard.message
        description = f'{postcard.message}<br />~{postcard.username}'
    else:
        title = f'Postcard from ~{postcard.username}'
        # Empty arrays causes the tag to be ignored by xmltodict
        description = []

    return {
        "title": title,
        "description": description,
        "link": postcard.url,
        "guid": postcard.url,
        "author": author,
        "pubDate": pub_date,
        "enclosure": {
            "@url": postcard.url,
            "@type": postcard.mime_type,
            "@length": postcard.length,
        },
    }
|
||||
|
||||
|
||||
def main():
    """Generate the #fridaypostcard RSS feed and persist the metadata cache."""
    global cache
    if CACHE_FILE.is_file():
        cache = json.loads(CACHE_FILE.read_text())

    # Parse every matching log line, drop the unusable ones, then keep a
    # single item per GUID so reposted URLs only appear once.
    postcards = filter(None, map(parse_log, get_logs()))
    items = list({item["guid"]: item for item in map(to_item, postcards)}.values())

    channel = {
        "title": "#fridaypostcard",
        "description": "to contribute, share a link to an image on "
                       "irc with the text #fridaypostcard. updated "
                       "every friday",
        "link": "http://tilde.town/~jumblesale/fp.html",
        "atom:link": {
            "@rel": "self",
            "@type": "application/rss+xml",
            "@href": "http://tilde.town/~lucidiot/fridaypostcard.xml"
        },
        "language": "en",
        "pubDate": datetime.now(timezone.utc).strftime(RSS_DATE_FORMAT),
        "docs": "https://www.rssboard.org/rss-specification",
        "webMaster": "lucidiot@tilde.town (~lucidiot)",
        "generator": "fprss",
        "skipDays": {
            "day": [
                "Monday",
                "Tuesday",
                "Wednesday",
                "Thursday",
                "Saturday",
                "Sunday"
            ]
        },
        "sy:updatePeriod": "weekly",
        "sy:updateFrequency": "1",
        "sy:updateBase": "1971-01-01T18:00+00:00",
        "item": items,
    }
    document = {
        "rss": {
            "@version": "2.0",
            "@xmlns:atom": "http://www.w3.org/2005/Atom",
            "@xmlns:sy": "http://purl.org/rss/1.0/modules/syndication/",
            "channel": channel,
        }
    }

    OUTPUT_PATH.write_text(xmltodict.unparse(
        document,
        pretty=True,
        short_empty_elements=True,
    ))
    CACHE_FILE.write_text(json.dumps(cache))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
Loading…
Reference in New Issue