itsb/jq/wmsc.jq

# WMSC feed generator
# Expects pup JSON output holding <li> tags, outputs xmltodict-compatible JSON
# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.

# Format a Unix timestamp (seconds since the epoch) as an RSS pubDate string.
def rss_date: strftime("%a, %d %b %Y %T %z");

# Input: the text of one bullet. Output: the Unix timestamp (midnight) of the first date found in it.
# Produces no output at all when the text holds no recognisable date.
def bullet_timestamp:
    # Dates are usually in the middle of the bullet contents, sometimes with their parts separated with punctuation, sometimes not:
    # 2019-02-31, 20190231, 2019_0231, 2019 02-31 etc.
    # The year must either start the text or follow a non-alphanumeric character, so that we do not match inside a longer token.
    # `capture` directly returns a { group_name: matched_text } object for the first match.
    capture("(?:^|[^[:alnum:]])(?<year>[0-9]{4})[^[:alnum:]]?(?<month>(?:0[1-9]|1[0-2]))[^[:alnum:]]?(?<day>(?:[0-2][0-9]|3[01]))")
    # Turn those matched parts back into parseable stuff
    | "\(.year)-\(.month)-\(.day)"
    # Get an actual timestamp
    | strptime("%Y-%m-%d")
    | mktime;

{
    "rss": {
        "@version": "2.0",
        "channel": {
            "title": "WMSC",
            "description": "Washington Metrorail Safety Commission safety reports",
            "link": "https://wmsc.gov/oversight/reports/",
            "language": "en-us",
            "pubDate": (now | rss_date),
            "docs": "https://www.rssboard.org/rss-specification",
            "ttl": 1440,
            "generator": "ITSB",
            "item": (
                # The WMSC parsing is unusually complex due to the lack of structured data.
                # We need at least a title, a URL, and a date, the date being the harder part.
                # The original data is not sorted by the only date we have (there is an implicit unspecified publication date),
                # so we first compute a timestamp that we can use to sort items with. This helps with bad RSS reader implementations.
                # Bullets without any date make the assignment produce nothing, so they are left out of the feed.
                # A missing text is treated as an empty string so that it is skipped the same way instead of raising an error.
                [.[] | .timestamp = ((.text // "") | bullet_timestamp)]
                # Sort by timestamp in descending order
                | sort_by(.timestamp)
                | reverse
                # Get the actual RSS item
                | map({
                    "title": .text,
                    "pubDate": (.timestamp | rss_date),
                    # A bullet may have no children at all; fall back to an empty list so the link is null
                    # (same result as a bullet with children but no <a>) instead of failing on a null iteration.
                    "link": ((.children // []) | map(select(.tag == "a")) | first | .href)
                })
            )
        }
    }
}