URL management helpers, closes #5

2020-07-19 14:46:47 +02:00 · 2020-07-19 14:46:47 +02:00 · 8b06e0eb6f
parent 1c7fdf93b4
commit 8b06e0eb6f
8 changed files with 74 additions and 13 deletions
--- a/feedgen.sh
+++ b/feedgen.sh
@ -29,14 +29,14 @@ fi
 log Building ATSB feed to $DIR/feeds/atsb.xml
 curl -s 'https://www.atsb.gov.au/publications/safety-investigation-reports/?s=1&sort=OccurrenceReleaseDate&sortAscending=descending&investigationStatus=Completed,Discontinued&printAll=true' \
 	| pup 'table.selectable_grid tr:not(.header) json{}' \
-	| jq -f $DIR/jq/atsb.jq \
+	| jq -L $DIR/jq -f $DIR/jq/atsb.jq \
 	| json2xml > $DIR/feeds/atsb.xml.new \
 	&& mv $DIR/feeds/atsb.xml.new $DIR/feeds/atsb.xml

 log Building TAIC feed to $DIR/feeds/taic.xml
 curl -s 'https://www.taic.org.nz/inquiries?order=field_publication_date&sort=desc' \
 	| pup '#view-table-wrapper tbody tr json{}' \
-	| jq -f $DIR/jq/taic.jq \
+	| jq -L $DIR/jq -f $DIR/jq/taic.jq \
 	| json2xml > $DIR/feeds/taic.xml.new \
 	&& mv $DIR/feeds/taic.xml.new $DIR/feeds/taic.xml

@ -44,21 +44,21 @@ log Building JTSB Aviation English feed to $DIR/feeds/jtsb/en/air.xml
 mkdir -p $DIR/feeds/jtsb/en
 curl -s 'https://www.mlit.go.jp/jtsb/airrep.html' \
 	| pup 'table.kankokuiken-en tr:not(:first-child) json{}' \
-	| jq -f $DIR/jq/jtsb/en/air.jq \
+	| jq -L $DIR/jq -f $DIR/jq/jtsb/en/air.jq \
 	| json2xml > $DIR/feeds/jtsb/en/air.xml.new \
 	&& mv $DIR/feeds/jtsb/en/air.xml.new $DIR/feeds/jtsb/en/air.xml

 log Building JTSB Rail English feed to $DIR/feeds/jtsb/en/rail.xml
 curl -s 'https://www.mlit.go.jp/jtsb/railrep.html' \
 	| pup 'table.kankokuiken-en tr:not(:first-child) json{}' \
-	| jq -f $DIR/jq/jtsb/en/rail.jq \
+	| jq -L $DIR/jq -f $DIR/jq/jtsb/en/rail.jq \
 	| json2xml > $DIR/feeds/jtsb/en/rail.xml.new \
 	&& mv $DIR/feeds/jtsb/en/rail.xml.new $DIR/feeds/jtsb/en/rail.xml

 log Building JTSB Marine English feed to $DIR/feeds/jtsb/en/marine.xml
 curl -s 'https://www.mlit.go.jp/jtsb/marrep.html' \
 	| pup 'table.kankokuiken-en tr:not(:first-child) json{}' \
-	| jq -f $DIR/jq/jtsb/en/marine.jq \
+	| jq -L $DIR/jq -f $DIR/jq/jtsb/en/marine.jq \
 	| json2xml > $DIR/feeds/jtsb/en/marine.xml.new \
 	&& mv $DIR/feeds/jtsb/en/marine.xml.new $DIR/feeds/jtsb/en/marine.xml

@ -134,7 +134,8 @@ log Building UZPLN English feed to $DIR/feeds/uzpln/en.xml
 mkdir -p $DIR/feeds/uzpln
 curl -s 'https://www.uzpln.cz/en/reports' \
 	| pup 'table.table tbody tr:not(:first-child) json{}' \
-	| jq -f $DIR/jq/uzpln.jq \
+	| jq -L $DIR/jq \
+		-f $DIR/jq/uzpln.jq \
 		--arg language 'en' \
 		--arg description 'Air Accidents Investigation Institute' \
 		--arg link 'https://www.uzpln.cz/en/reports' \
@ -144,7 +145,8 @@ curl -s 'https://www.uzpln.cz/en/reports' \
 log Building UZPLN Czech feed to $DIR/feeds/uzpln/cz.xml
 curl -s 'https://www.uzpln.cz/zpravy-ln' \
 	| pup 'table.table tbody tr:not(:first-child) json{}' \
-	| jq -f $DIR/jq/uzpln.jq \
+	| jq -L $DIR/jq \
+		-f $DIR/jq/uzpln.jq \
 		--arg language 'cz' \
 		--arg description 'Ústav pro odborné Zjišťování Příčin Leteckých Nehod' \
 		--arg link 'https://www.uzpln.cz/zpravy-ln' \
--- a/jq/atsb.jq
+++ b/jq/atsb.jq
@ -1,6 +1,7 @@
 # ATSB feed generator
 # Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
 # WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
+import "./helpers" as helpers;

 {
    "rss": {
@ -17,7 +18,7 @@
            "item": [.[] | {
                "title": .children[0].children[0].text,
                "description": .children[1].text,
-                "link": ("https://www.atsb.gov.au" + .children[0].children[0].href),
+                "link": (.children[0].children[0].href|helpers::urlresolve("https://www.atsb.gov.au")),
                "pubDate": (try (.children[4].text | strptime("%d %b %Y") | mktime | strftime("%a, %d %b %Y %T %z")))
            }]
        }
--- a/jq/helpers.jq
+++ b/jq/helpers.jq
@ -0,0 +1,53 @@
+# Run `regex` against the input string and gather its named capturing groups
+# into a single {group name: matched text} object. Groups without a name are
+# skipped.
+def regex_capture(regex):
+    match(regex)
+    | reduce (.captures[] | select(.name)) as $group
+        ({}; .[$group.name] = $group.string);
+
+# Break the input URL string into components, in the manner of Python's
+# urllib.parse.urlparse.
+# Output: an object keyed by the regex's named groups: scheme, netloc, path,
+# params, query and fragment. `params` is whatever follows the first ";" of
+# the last path segment.
+def urlparse:
+    regex_capture(
+        "^(?:(?<scheme>[^:/?#]+):)?(?://(?<netloc>[^/?#]*))?(?:(?<path>(?:[^?#]+/)?[^?#;]*)(?:;(?<params>[^?#/]*))?)?(?:\\?(?<query>[^#]*))?(?:#(?<fragment>.*))?$"
+    );
+
+# Break the input URL string into components, in the manner of Python's
+# urllib.parse.urlsplit.
+# Output: an object keyed by the regex's named groups: scheme, netloc, path,
+# query and fragment. Unlike urlparse, ";"-introduced path parameters are left
+# inside `path`.
+def urlsplit:
+    regex_capture(
+        "^(?:(?<scheme>[^:/?#]+):)?(?://(?<netloc>[^/?#]*))?(?<path>(?:[^?#]+/)?[^?#]*)?(?:\\?(?<query>[^#]*))?(?:#(?<fragment>.*))?$"
+    );
+
+# Reassemble a URL string from an object shaped like the output of urlparse
+# or urlsplit (reverse operation of either).
+# Input: an object whose scheme, netloc, path, params, query and fragment keys
+# each hold a string or are null/absent. A null or absent component is left
+# out together with its delimiter; an empty-string component keeps its
+# delimiter (so "file:///etc/hosts" and "page?" survive unchanged).
+# Output: the URL as a string.
+def urlunparse:
+    # The scheme is terminated by ":" alone; "//" belongs to the network
+    # location. Keeping the two apart lets scheme-only URLs such as
+    # "mailto:user@host" and scheme-relative URLs such as "//host/path"
+    # round-trip instead of gaining or losing the double slash.
+    (if .scheme then .scheme + ":" else "" end)
+    + (if .netloc then "//" + .netloc else "" end)
+    + (.path // "")
+    + (if .params then ";" + .params else "" end)
+    + (if .query then "?" + .query else "" end)
+    + (if .fragment then "#" + .fragment else "" end);
+
+# Resolve a possibly relative URL against a base URL.
+# Input: the URL to resolve, either as a string or as an already parsed
+#        urlsplit/urlparse object.
+# base:  filter yielding the base URL, as a string or as a parsed object.
+# Output: an input that already carries a scheme is emitted unchanged;
+#         anything else is emitted as the resolved URL string.
+def urlresolve(base):
+    (if type == "string" then urlsplit else . end) as $parsed
+    # There is a scheme: this is an absolute URL
+    | if $parsed.scheme then . else (
+        (base | if type == "string" then urlsplit else . end) as $parsedbase
+        | $parsed
+        # No scheme but a domain: use the base's scheme
+        | if .netloc then (
+            .scheme = $parsedbase.scheme
+        # Empty path with a query and/or fragment ("?page=2", "#top"): keep the
+        # base's path as is. This has to be tested before the path branch,
+        # because urlsplit reports a missing path as "" and "" is truthy in jq.
+        ) elif ((.path // "") == "") and ((.query // .fragment) != null) then (
+            .scheme = $parsedbase.scheme
+            | .netloc = $parsedbase.netloc
+            | .path = $parsedbase.path
+            # A fragment-only reference keeps the base's query
+            | .query = (.query // $parsedbase.query)
+        # No scheme and no domain: resolve the relative path
+        ) elif .path then (
+            .scheme = $parsedbase.scheme
+            | .netloc = $parsedbase.netloc
+            # When the path does not start with a slash, make it relative to the base's path
+            # Note that this assumes the base URL always points to a folder, even if it does not end with a /
+            | if .path|startswith("/")|not then (
+                .path = (($parsedbase.path|rtrimstr("/")) + "/" + $parsed.path)
+            ) else . end
+        ) else . end
+        | urlunparse
+    ) end;
--- a/jq/jtsb/en/air.jq
+++ b/jq/jtsb/en/air.jq
@ -1,6 +1,7 @@
 # JTSB aviation feed generator (English version) from http://www.mlit.go.jp/jtsb/airrep.html
 # Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
 # WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
+import "./helpers" as helpers;

 {
    "rss": {
@ -31,7 +32,7 @@
                    + " - "
                    + .children[6].children[0].text
                ),
-                "link": ("http://www.mlit.go.jp/jtsb/" + .children[-1].children[0].href),
+                "link": (.children[-1].children[0].href|helpers::urlresolve("http://www.mlit.go.jp/jtsb/")),
                "pubDate": (try (.children[1].children[0].text | strptime("%Y.%m.%d") | mktime | strftime("%a, %d %b %Y %T %z")))
            }]
        }
--- a/jq/jtsb/en/marine.jq
+++ b/jq/jtsb/en/marine.jq
@ -1,6 +1,7 @@
 # JTSB marine feed generator (English version) from http://www.mlit.go.jp/jtsb/marrep.html
 # Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
 # WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
+import "./helpers" as helpers;

 {
    "rss": {
@ -25,7 +26,7 @@
                    + " - "
                    + .children[5].children[0].text
                ),
-                "link": ("http://www.mlit.go.jp/jtsb/" + .children[-1].children[0].href),
+                "link": (.children[-1].children[0].href|helpers::urlresolve("http://www.mlit.go.jp/jtsb/")),
                "pubDate": (try (.children[1].children[0].text | strptime("%Y.%m.%d") | mktime | strftime("%a, %d %b %Y %T %z")))
            }]
        }
--- a/jq/jtsb/en/rail.jq
+++ b/jq/jtsb/en/rail.jq
@ -1,6 +1,7 @@
 # JTSB rail feed generator (English version) from http://www.mlit.go.jp/jtsb/railrep.html
 # Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
 # WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
+import "./helpers" as helpers;

 {
    "rss": {
@ -25,7 +26,7 @@
                    + " - "
                    + .children[5].children[0].text
                ),
-                "link": ("http://www.mlit.go.jp/jtsb/" + .children[-1].children[0].href),
+                "link": (.children[-1].children[0].href|helpers::urlresolve("http://www.mlit.go.jp/jtsb/")),
                "pubDate": (try (.children[1].children[0].text | strptime("%Y.%m.%d") | mktime | strftime("%a, %d %b %Y %T %z")))
            }]
        }
--- a/jq/taic.jq
+++ b/jq/taic.jq
@ -1,6 +1,7 @@
 # TAIC feed generator from https://www.taic.org.nz/inquiries?order=field_publication_date&sort=desc
 # Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
 # WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
+import "./helpers" as helpers;

 {
    "rss": {
@ -17,7 +18,7 @@
            "item": [.[] | {
                "title": .children[0].children[1].text,
                "description": .children[0].children[2].text,
-                "link": ("https://www.taic.org.nz" + .children[0].children[1].href),
+                "link": (.children[0].children[1].href|helpers::urlresolve("https://www.taic.org.nz")),
                "pubDate": (.children[3].children[0].datetime | fromdateiso8601 | strftime("%a, %d %b %Y %T %z"))
            }]
        }
--- a/jq/uzpln.jq
+++ b/jq/uzpln.jq
@ -5,6 +5,7 @@
 #   $link: Feed link
 # Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
 # WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
+import "./helpers" as helpers;

 {
    "rss": {
@ -20,7 +21,7 @@
            "generator": "ITSB",
            "item": [.[] | {
                "title": (.children[4].text + " - " + .children[2].text),
-                "link": ("http://www.uzpln.cz" + .children[5].children[0].href),
+                "link": (.children[5].children[0].href|helpers::urlresolve("http://www.uzpln.cz")),
                "pubDate": (.children[0].text | strptime("%Y-%m-%d") | mktime | strftime("%a, %d %b %Y %T %z"))
            }]
        }