From 32d9ad4af4e22a63a68ce8a2cab5aaae992e9b5d Mon Sep 17 00:00:00 2001 From: Lucidiot Date: Thu, 14 May 2020 05:28:57 +0000 Subject: [PATCH] Add WMSC custom feed --- feedgen.sh | 7 +++++++ index.html | 24 ++++++++++++++++++++++++ jq/wmsc.jq | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+) create mode 100644 jq/wmsc.jq diff --git a/feedgen.sh b/feedgen.sh index 5eb927a..3de57aa 100755 --- a/feedgen.sh +++ b/feedgen.sh @@ -150,3 +150,10 @@ curl -s 'http://www.uzpln.cz/zpravy-ln' \ --arg link 'http://www.uzpln.cz/zpravy-ln' \ | json2xml > $DIR/feeds/uzpln/cz.xml.new \ && mv $DIR/feeds/uzpln/cz.xml.new $DIR/feeds/uzpln/cz.xml + +log Building WMSC feed to $DIR/feeds/wmsc.xml +curl -s 'https://wmsc.gov/oversight/reports/' \ + | pup '.post-content li json{}' \ + | jq -f $DIR/jq/wmsc.jq \ + | json2xml > $DIR/feeds/wmsc.xml.new \ + && mv $DIR/feeds/wmsc.xml.new $DIR/feeds/wmsc.xml diff --git a/index.html b/index.html index 8b559ef..ba57530 100644 --- a/index.html +++ b/index.html @@ -532,6 +532,30 @@ + +

Washington Metrorail Safety Commission

+ + + + + + + + + + + + + + + + + + + + + +
CountryUnited States
LanguageEnglish
TypeRail
Frequency20 reports/year
FeedRSS
diff --git a/jq/wmsc.jq b/jq/wmsc.jq new file mode 100644 index 0000000..accf53f --- /dev/null +++ b/jq/wmsc.jq @@ -0,0 +1,48 @@ +# WMSC feed generator +# Expects pup JSON output holding
  # <li> tags, outputs xmltodict-compatible JSON
# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
#
# Input:  an array with one object per bullet. This program reads each bullet's
#         `.text`, and the `.tag` / `.href` of every entry in its `.children`.
# Output: a single { "rss": ... } object holding the channel and its items.

# Format a Unix timestamp the way the pubDate fields of this feed are written.
def rss_date: strftime("%a, %d %b %Y %T %z");

# Extract the date embedded in a bullet's text and return it as a Unix timestamp.
# Produces no output at all when the bullet has no text or no recognizable date,
# which makes the caller leave that bullet out of the feed instead of failing.
def report_timestamp:
  # A bullet without direct text would make the regex builtins raise an error
  # and abort the whole feed; an empty string simply never matches.
  (.text // "")
  # Dates are in the middle of the bullet contents, sometimes with their parts
  # separated with punctuation, sometimes not:
  # 2019-12-31, 20191231, 2019_1231, 2019 12-31 etc.
  # The day is limited to 01-31: strptime is expected to reject "00", and that
  # error would abort the whole feed.
  # `capture` returns the named groups directly as { year, month, day }.
  | capture("[^[:alnum:]](?<year>[0-9]{4})[^[:alnum:]]?(?<month>0[1-9]|1[0-2])[^[:alnum:]]?(?<day>0[1-9]|[12][0-9]|3[01])")
  # Turn the matched parts back into something strptime can parse
  | "\(.year)-\(.month)-\(.day)"
  | strptime("%Y-%m-%d")
  | mktime;

{
  "rss": {
    "@version": "2.0",
    "channel": {
      "title": "WMSC",
      "description": "Washington Metrorail Safety Commission safety reports",
      "link": "https://wmsc.gov/oversight/reports/",
      "language": "en-us",
      "pubDate": (now | rss_date),
      "docs": "https://cyber.harvard.edu/rss/rss.html",
      # RSS 2.0 expresses ttl in minutes: 1440 minutes is one day
      # (86400 would be that same duration counted in seconds).
      "ttl": 1440,
      "generator": "ITSB",
      "item": (
        # The WMSC parsing is unusually complex due to the lack of structured data.
        # We need at least a title, a URL, and a date, the date being the harder part.
        # The original data is not sorted by the only date we have (there is an
        # implicit unspecified publication date), so we first compute a timestamp
        # that we can use to sort items with. This helps with bad RSS reader
        # implementations.
        [ .[] | .timestamp = report_timestamp ]
        # Sort by timestamp in descending order
        | sort_by(.timestamp)
        | reverse
        # Build the actual RSS items
        | map({
            "title": .text,
            "pubDate": (.timestamp | rss_date),
            # First <a> child of the bullet. A bullet without child elements has
            # no .children key at all, hence the fallback to an empty list; the
            # link is then null rather than an error.
            "link": (.children // [] | map(select(.tag == "a")) | first | .href)
          })
      )
    }
  }
}