#!/usr/bin/env python3
import json
import re

from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Iterator, NamedTuple, Optional, Tuple
from urllib.parse import urlparse

import requests
import xmltodict  # type: ignore
from requests.exceptions import HTTPError
# RSS 2.0 (RFC 822) date format. `%T` is a glibc strftime extension and is
# not portable across Python platforms (e.g. it fails on Windows), so spell
# out the equivalent `%H:%M:%S` explicitly.
RSS_DATE_FORMAT = '%a, %d %b %Y %H:%M:%S %z'
# Strips IRC control/formatting codes (bold, reset, reverse, italics,
# underline, \003 color codes with optional fg,bg digits, and \004
# hex-color codes) so they do not leak into the feed text.
SANITIZE_REGEX = re.compile(
    r'(?:[\002\017\021\026\035\036\037]'
    r'|\003(?:[0-9]{1,2}(?:,[0-9]{1,2})?)?'
    r'|\004(?:[0-9A-F]{,6})?)',
    re.IGNORECASE
)
# Matches the first http(s) URL found in a message.
URL_REGEX = re.compile(
    r'https?://[A-Za-z0-9-._~:/?#[\]%@!$&\'()*+,;=]+',
    re.IGNORECASE
)
# IRC log to scan; lives under another user's home directory.
LOG_FILE = Path('~archangelic/irc/log').expanduser()
# JSON cache of per-URL (mime_type, length), kept next to this script.
CACHE_FILE = Path(__file__).absolute().parent / 'cache.json'
# Where the generated RSS feed is written.
OUTPUT_PATH = Path('~/public_html/fridaypostcard.xml').expanduser()
# Nicks whose messages should never become postcards (bots).
IGNORE_USERNAMES = {'quote_bot'}
# We cannot safely assume we will know all image extensions,
# but there are some obvious and common extensions that we can ignore.
IGNORE_EXTENSIONS = {'.html', '.htm', '.xml', '.json'}
# Fallback MIME types by extension, used when an HTTP response does not
# supply a usable Content-Type header.
KNOWN_MIME_TYPES = {
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.jp2': 'image/jp2',
    '.bmp': 'image/bmp',
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.svg': 'image/svg+xml',
    '.webp': 'image/webp',
}
# MIME type and length cache to avoid making hundreds of requests each time
# url -> (mime_type, content_length); loaded from and saved to
# CACHE_FILE as JSON by main().
cache: Dict[str, Tuple[str, str]] = {}
class Postcard(NamedTuple):
    """One #fridaypostcard submission parsed from the IRC log."""
    timestamp: int  # Unix timestamp (seconds) from the log line
    username: str   # IRC nick of the poster
    url: str        # image URL (forced to http, imgur rewritten to direct)
    message: str    # message text with the URL and hashtag stripped out
    mime_type: str  # MIME type for the RSS enclosure
    length: str     # Content-Length for the RSS enclosure, kept as a string
def get_logs() -> Iterator[str]:
    """Yield every raw IRC log line that mentions #fridaypostcard."""
    with LOG_FILE.open('r') as handle:
        yield from (entry for entry in handle if '#fridaypostcard' in entry)
def sanitize_message(message: str) -> str:
    """Return *message* with IRC control/formatting codes removed."""
    sanitized = SANITIZE_REGEX.sub('', message)
    return sanitized
def parse_log(log: str) -> Optional[Postcard]:
    """Parse one tab-separated IRC log line into a Postcard.

    Returns None for lines that should be skipped: ignored usernames,
    messages without a URL, URLs with a known non-image extension, and
    URLs that cannot be fetched.
    """
    # maxsplit=2, not 3: a message that itself contains a tab must stay in
    # one piece, otherwise the 3-name unpacking raises ValueError.
    timestamp, username, message = log.split("\t", 2)
    if username in IGNORE_USERNAMES:
        return None
    message = sanitize_message(message)
    match = URL_REGEX.search(message)
    # Ignore messages with invalid URLs
    if not match:
        return None
    url_str = match.group()
    message = message \
        .replace(url_str, '') \
        .replace('#fridaypostcard', '') \
        .strip()
    try:
        url = urlparse(url_str)
    except Exception:
        return None
    extension = Path(url.path).suffix
    if extension in IGNORE_EXTENSIONS:
        return None

    # Force-replace https with http to ensure PSP compatibility.
    # Only rewrite the scheme prefix; a blanket replace('https', 'http')
    # would also mangle any 'https' occurring later in the URL.
    if url_str.startswith('https://'):
        url_str = 'http://' + url_str[len('https://'):]
    # Turn Imgur links into direct links
    if url.netloc == 'imgur.com':
        url_str = url_str.replace('http://imgur', 'http://i.imgur')
        if extension not in KNOWN_MIME_TYPES:
            # Bare imgur page links serve the image directly with .jpg added.
            url_str += '.jpg'

    mime_type, length = cache.get(url_str, ('', '0'))
    if not mime_type:
        try:
            with requests.get(
                    url_str,
                    allow_redirects=True,
                    stream=True,
                    timeout=5) as resp:
                resp.raise_for_status()
                length = resp.headers.get('Content-Length', '0')
                mime_type = resp.headers.get(
                    'Content-Type',
                    KNOWN_MIME_TYPES.get(extension, '')
                )
        except HTTPError as e:
            # Dirty hack to avoid repeating lots of requests for images
            # that are now broken: permanently cache 4xx client errors.
            # 5xx server errors may be transient, so do not cache them.
            if 400 <= e.response.status_code < 500:
                mime_type = KNOWN_MIME_TYPES.get(extension, 'image/x-error')
                length = '0'
                cache[url_str] = (mime_type, length)
            return None
        except Exception:
            return None
        cache[url_str] = (mime_type, length)

    return Postcard(
        timestamp=int(timestamp),
        username=username,
        url=url_str,
        message=message,
        mime_type=mime_type,
        length=length,
    )
def to_item(postcard):
    """Build the xmltodict structure for one RSS <item> element."""
    author = postcard.username
    if postcard.message:
        title = postcard.message
        description = f'{postcard.message}<br />~{author}'
    else:
        title = f'Postcard from ~{author}'
        # Empty arrays causes the tag to be ignored by xmltodict
        description = []
    pub_date = datetime.fromtimestamp(
        postcard.timestamp, timezone.utc
    ).strftime(RSS_DATE_FORMAT)
    return {
        "title": title,
        "description": description,
        "link": postcard.url,
        "guid": postcard.url,
        "author": f"{author}@tilde.town ({author})",
        "pubDate": pub_date,
        "enclosure": {
            "@url": postcard.url,
            "@type": postcard.mime_type,
            "@length": postcard.length,
        },
    }
def main():
    """Generate the #fridaypostcard RSS feed and persist the URL cache."""
    global cache
    # Reuse previously fetched MIME types and lengths when available.
    if CACHE_FILE.is_file():
        cache = json.loads(CACHE_FILE.read_text())

    postcards = (p for p in map(parse_log, get_logs()) if p)
    # Deduplicate items by GUID; later occurrences win.
    items_by_guid = {}
    for item in map(to_item, postcards):
        items_by_guid[item["guid"]] = item

    channel = {
        "title": "#fridaypostcard",
        "description": "to contribute, share a link to an image on "
                       "irc with the text #fridaypostcard. updated "
                       "every friday",
        "link": "http://tilde.town/~jumblesale/fp.html",
        "atom:link": {
            "@rel": "self",
            "@type": "application/rss+xml",
            "@href": "http://tilde.town/~lucidiot/fridaypostcard.xml"
        },
        "language": "en",
        "pubDate": datetime.now(timezone.utc).strftime(RSS_DATE_FORMAT),
        "docs": "https://www.rssboard.org/rss-specification",
        "webMaster": "lucidiot@tilde.town (~lucidiot)",
        "generator": "fprss",
        "skipDays": {
            "day": [
                "Monday",
                "Tuesday",
                "Wednesday",
                "Thursday",
                "Saturday",
                "Sunday"
            ]
        },
        "sy:updatePeriod": "weekly",
        "sy:updateFrequency": "1",
        "sy:updateBase": "1971-01-01T18:00+00:00",
        "item": list(items_by_guid.values()),
    }
    document = {
        "rss": {
            "@version": "2.0",
            "@xmlns:atom": "http://www.w3.org/2005/Atom",
            "@xmlns:sy": "http://purl.org/rss/1.0/modules/syndication/",
            "channel": channel,
        }
    }
    feed_xml = xmltodict.unparse(
        document,
        pretty=True,
        short_empty_elements=True,
    )
    OUTPUT_PATH.write_text(feed_xml)
    CACHE_FILE.write_text(json.dumps(cache))
# Run the feed generator when executed as a script.
if __name__ == '__main__':
    main()