itsb/jq/wmsc.jq

49 lines
2.4 KiB
Plaintext

# WMSC feed generator
# Expects pup JSON output holding <li> tags, outputs xmltodict-compatible JSON.
# Input: an array of objects. For each one this filter reads "text" and, when present,
# "children" (looking at each child's "tag" and "href").
# Output: one {"rss": {...}} object. Bullets in which no date can be found are left out of the feed.
# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.

# Format a Unix timestamp using the date layout shared by the channel and item pubDate fields.
def rss_date: strftime("%a, %d %b %Y %T %z");

# Turn a bullet's text into a Unix timestamp, using the first date found in it.
# Produces no output at all when the text holds no date, which drops the bullet from the feed.
def bullet_timestamp:
  # Dates are in the middle of the bullet contents, sometimes with their parts separated with punctuation, sometimes not:
  # 2019-02-31, 20190231, 2019_0231, 2019 02-31 etc.
  # The year must sit at the start of the text or after a non-alphanumeric character so that we do not
  # match in the middle of a longer number or word. The day range excludes "00", which strptime rejects
  # with an error that would abort the whole feed.
  # capture returns the named groups as a { group_name: matched_text } object.
  capture("(?:^|[^[:alnum:]])(?<year>[0-9]{4})[^[:alnum:]]?(?<month>0[1-9]|1[0-2])[^[:alnum:]]?(?<day>0[1-9]|[12][0-9]|3[01])")
  # Turn those matched parts back into parseable stuff
  | "\(.year)-\(.month)-\(.day)"
  # Get an actual timestamp
  | strptime("%Y-%m-%d")
  | mktime;

{
  "rss": {
    "@version": "2.0",
    "channel": {
      "title": "WMSC",
      "description": "Washington Metrorail Safety Commission safety reports",
      "link": "https://wmsc.gov/oversight/reports/",
      "language": "en-us",
      "pubDate": (now | rss_date),
      "docs": "https://www.rssboard.org/rss-specification",
      "ttl": 1440,
      "generator": "ITSB",
      "item": (
        # The WMSC parsing is unusually complex due to the lack of structured data.
        # We need at least a title, a URL, and a date, the date being the harder part.
        # The original data is not sorted by the only date we have (there is an implicit unspecified publication date),
        # so we first compute a timestamp that we can use to sort items with. This helps with bad RSS reader implementations.
        [
          .[]
          # A bullet without a "text" key is treated as having empty text, so it is skipped
          # like any other undated bullet instead of raising an error.
          | .timestamp = ((.text // "") | bullet_timestamp)
        ]
        # Sort by timestamp in descending order
        | sort_by(.timestamp)
        | reverse
        # Get the actual RSS item
        | map({
            "title": .text,
            "pubDate": (.timestamp | rss_date),
            # First <a> child's href; null when the bullet has no children or no <a> child.
            "link": ((.children // []) | map(select(.tag == "a")) | first | .href)
          })
      )
    }
  }
}