From 32d9ad4af4e22a63a68ce8a2cab5aaae992e9b5d Mon Sep 17 00:00:00 2001
From: Lucidiot <lucidiot@protonmail.com>
Date: Thu, 14 May 2020 05:28:57 +0000
Subject: [PATCH] Add WMSC custom feed

---
 feedgen.sh |  7 +++++++
 index.html | 24 ++++++++++++++++++++++++
 jq/wmsc.jq | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+)
 create mode 100644 jq/wmsc.jq
diff --git a/feedgen.sh b/feedgen.sh
index 5eb927a..3de57aa 100755
--- a/feedgen.sh
+++ b/feedgen.sh
@@ -150,3 +150,10 @@ curl -s 'http://www.uzpln.cz/zpravy-ln' \
 		--arg link 'http://www.uzpln.cz/zpravy-ln' \
 	| json2xml > $DIR/feeds/uzpln/cz.xml.new \
 	&& mv $DIR/feeds/uzpln/cz.xml.new $DIR/feeds/uzpln/cz.xml
+
+log Building WMSC feed to "$DIR/feeds/wmsc.xml"
+curl -s 'https://wmsc.gov/oversight/reports/' \
+	| pup '.post-content li json{}' \
+	| jq -f "$DIR/jq/wmsc.jq" \
+	| json2xml > "$DIR/feeds/wmsc.xml.new" \
+	&& mv "$DIR/feeds/wmsc.xml.new" "$DIR/feeds/wmsc.xml"  # written to .new first: the published feed is only replaced when the pipeline exits successfully
diff --git a/index.html b/index.html
index 8b559ef..ba57530 100644
--- a/index.html
+++ b/index.html
@@ -532,6 +532,30 @@
           </tr>
         </tbody>
       </table>
+
+      <h3>Washington Metrorail Safety Commission</h3> <!-- WMSC entry; the linked feeds/wmsc.xml is built by feedgen.sh using jq/wmsc.jq -->
+      <table>
+        <tr>
+          <td><strong>Country</strong></td>
+          <td>United States</td>
+        </tr>
+        <tr>
+          <td><strong>Language</strong></td>
+          <td>English</td>
+        </tr>
+        <tr>
+          <td><strong>Type</strong></td>
+          <td>Rail</td>
+        </tr>
+        <tr>
+          <td><strong>Frequency</strong></td>
+          <td>20 reports/year</td>
+        </tr>
+        <tr>
+          <td><strong>Feed</strong></td>
+          <td><a href="feeds/wmsc.xml" target="_blank"><img src="img/rss.gif" alt="RSS" /></a></td>
+        </tr>
+      </table>
     </div>
   </body>
 </html>
diff --git a/jq/wmsc.jq b/jq/wmsc.jq
new file mode 100644
index 0000000..accf53f
--- /dev/null
+++ b/jq/wmsc.jq
@@ -0,0 +1,48 @@
+# WMSC feed generator
+# Expects pup JSON output holding <li> tags (an array of objects with .text and .children), outputs xmltodict-compatible JSON
+# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
+
+{
+    "rss": {
+        "@version": "2.0",
+        "channel": {
+            "title": "WMSC",
+            "description": "Washington Metrorail Safety Commission safety reports",
+            "link": "https://wmsc.gov/oversight/reports/",
+            "language": "en-us",
+            "pubDate": (now | strftime("%a, %d %b %Y %T %z")),
+            "docs": "https://cyber.harvard.edu/rss/rss.html",
+            "ttl": 1440,  # RSS 2.0 <ttl> is expressed in minutes: 1440 minutes = one day
+            "generator": "ITSB",
+            "item": (
+                # The WMSC parsing is unusually complex due to the lack of structured data.
+                # We need at least a title, a URL, and a date, the date being the harder part.
+                # The original data is not sorted by the only date we have (there is an implicit unspecified publication date),
+                # so we first compute a timestamp that we can use to sort items with. This helps with bad RSS reader implementations.
+                [.[] | .timestamp = (
+                    .text
+                    # Dates are in the middle of the bullet contents, sometimes with their parts separated with punctuation, sometimes not:
+                    # 2019-02-31, 20190231, 2019_0231, 2019 02-31 etc. Only the first date found in the text is used.
+                    | match("[^[:alnum:]](?<year>[0-9]{4})[^[:alnum:]]?(?<month>(?:0[1-9]|1[0-2]))[^[:alnum:]]?(?<day>(?:[0-2][0-9]|3[01]))").captures
+                    # Turn the captured groups into a { group_name: matched_text } object
+                    | [ .[] | { "key": .name, "value": .string } ]
+                    | from_entries
+                    # Rebuild a normalized YYYY-MM-DD string from the named groups
+                    | (.year + "-" + .month + "-" + .day)
+                    # Parse it and convert the broken-down time to seconds since the epoch
+                    | strptime("%Y-%m-%d")
+                    | mktime
+                )]
+                # Sort by timestamp in descending order (newest first)
+                | sort_by(.timestamp)
+                | reverse
+                # Build the RSS items; the link is the href of the first direct <a> child, or null when the bullet has none
+                | [.[] | {
+                    "title": .text,
+                    "pubDate": (.timestamp | strftime("%a, %d %b %Y %T %z")),
+                    "link": ((.children // []) | map(select(.tag == "a")) | first | .href)
+                }]
+            )
+        }
+    }
+}

Country	United States
Language	English
Type	Rail
Frequency	20 reports/year
Feed