URL management helpers, closes #5

This commit is contained in:
Lucidiot 2020-07-19 14:46:47 +02:00
parent 1c7fdf93b4
commit 8b06e0eb6f
Signed by: lucidiot
GPG Key ID: 3358C1CA6906FB8D
8 changed files with 74 additions and 13 deletions

View File

@ -29,14 +29,14 @@ fi
log Building ATSB feed to $DIR/feeds/atsb.xml
curl -s 'https://www.atsb.gov.au/publications/safety-investigation-reports/?s=1&sort=OccurrenceReleaseDate&sortAscending=descending&investigationStatus=Completed,Discontinued&printAll=true' \
| pup 'table.selectable_grid tr:not(.header) json{}' \
| jq -f $DIR/jq/atsb.jq \
| jq -L $DIR/jq -f $DIR/jq/atsb.jq \
| json2xml > $DIR/feeds/atsb.xml.new \
&& mv $DIR/feeds/atsb.xml.new $DIR/feeds/atsb.xml
log Building TAIC feed to $DIR/feeds/taic.xml
curl -s 'https://www.taic.org.nz/inquiries?order=field_publication_date&sort=desc' \
| pup '#view-table-wrapper tbody tr json{}' \
| jq -f $DIR/jq/taic.jq \
| jq -L $DIR/jq -f $DIR/jq/taic.jq \
| json2xml > $DIR/feeds/taic.xml.new \
&& mv $DIR/feeds/taic.xml.new $DIR/feeds/taic.xml
@ -44,21 +44,21 @@ log Building JTSB Aviation English feed to $DIR/feeds/jtsb/en/air.xml
mkdir -p $DIR/feeds/jtsb/en
curl -s 'https://www.mlit.go.jp/jtsb/airrep.html' \
| pup 'table.kankokuiken-en tr:not(:first-child) json{}' \
| jq -f $DIR/jq/jtsb/en/air.jq \
| jq -L $DIR/jq -f $DIR/jq/jtsb/en/air.jq \
| json2xml > $DIR/feeds/jtsb/en/air.xml.new \
&& mv $DIR/feeds/jtsb/en/air.xml.new $DIR/feeds/jtsb/en/air.xml
log Building JTSB Rail English feed to $DIR/feeds/jtsb/en/rail.xml
curl -s 'https://www.mlit.go.jp/jtsb/railrep.html' \
| pup 'table.kankokuiken-en tr:not(:first-child) json{}' \
| jq -f $DIR/jq/jtsb/en/rail.jq \
| jq -L $DIR/jq -f $DIR/jq/jtsb/en/rail.jq \
| json2xml > $DIR/feeds/jtsb/en/rail.xml.new \
&& mv $DIR/feeds/jtsb/en/rail.xml.new $DIR/feeds/jtsb/en/rail.xml
log Building JTSB Marine English feed to $DIR/feeds/jtsb/en/marine.xml
curl -s 'https://www.mlit.go.jp/jtsb/marrep.html' \
| pup 'table.kankokuiken-en tr:not(:first-child) json{}' \
| jq -f $DIR/jq/jtsb/en/marine.jq \
| jq -L $DIR/jq -f $DIR/jq/jtsb/en/marine.jq \
| json2xml > $DIR/feeds/jtsb/en/marine.xml.new \
&& mv $DIR/feeds/jtsb/en/marine.xml.new $DIR/feeds/jtsb/en/marine.xml
@ -134,7 +134,8 @@ log Building UZPLN English feed to $DIR/feeds/uzpln/en.xml
mkdir -p $DIR/feeds/uzpln
curl -s 'https://www.uzpln.cz/en/reports' \
| pup 'table.table tbody tr:not(:first-child) json{}' \
| jq -f $DIR/jq/uzpln.jq \
| jq -L $DIR/jq \
-f $DIR/jq/uzpln.jq \
--arg language 'en' \
--arg description 'Air Accidents Investigation Institute' \
--arg link 'https://www.uzpln.cz/en/reports' \
@ -144,7 +145,8 @@ curl -s 'https://www.uzpln.cz/en/reports' \
log Building UZPLN Czech feed to $DIR/feeds/uzpln/cz.xml
curl -s 'https://www.uzpln.cz/zpravy-ln' \
| pup 'table.table tbody tr:not(:first-child) json{}' \
| jq -f $DIR/jq/uzpln.jq \
| jq -L $DIR/jq \
-f $DIR/jq/uzpln.jq \
--arg language 'cz' \
--arg description 'Ústav pro odborné Zjišťování Příčin Leteckých Nehod' \
--arg link 'https://www.uzpln.cz/zpravy-ln' \

View File

@ -1,6 +1,7 @@
# ATSB feed generator
# Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
import "./helpers" as helpers;
{
"rss": {
@ -17,7 +18,7 @@
"item": [.[] | {
"title": .children[0].children[0].text,
"description": .children[1].text,
"link": ("https://www.atsb.gov.au" + .children[0].children[0].href),
"link": (.children[0].children[0].href|helpers::urlresolve("https://www.atsb.gov.au")),
"pubDate": (try (.children[4].text | strptime("%d %b %Y") | mktime | strftime("%a, %d %b %Y %T %z")))
}]
}

53
jq/helpers.jq Normal file
View File

@ -0,0 +1,53 @@
# Run `regex` against the input string and gather its named capturing groups
# into one object of the form {group name: matched text}.
# Groups without a name are left out of the result.
def regex_capture(regex):
  match(regex)
  | reduce (.captures[] | select(.name)) as $group (
      {};
      .[$group.name] = $group.string
    );
# Parse a URL string into an object keyed by the named groups of the pattern
# below: {scheme, netloc, path, params, query, fragment}.
# Similar to Python's urllib.parse.urlparse.
# A trailing ";params" segment after the last path segment is captured
# separately from the path itself.
# NOTE(review): components missing from the input presumably come back as null,
# but how empty matches are reported depends on jq's `match` — confirm for the
# jq version in use.
def urlparse: regex_capture("^(?:(?<scheme>[^:/?#]+):)?(?://(?<netloc>[^/?#]*))?(?:(?<path>(?:[^?#]+/)?[^?#;]*)(?:;(?<params>[^?#/]*))?)?(?:\\?(?<query>[^#]*))?(?:#(?<fragment>.*))?$");
# Parse a URL string into an object keyed by the named groups of the pattern
# below: {scheme, netloc, path, query, fragment}.
# Path parameters are not parsed: any ";params" segment stays inside the path.
# Similar to Python's urllib.parse.urlsplit.
# NOTE(review): components missing from the input presumably come back as null,
# but how empty matches are reported depends on jq's `match` — confirm for the
# jq version in use.
def urlsplit: regex_capture("^(?:(?<scheme>[^:/?#]+):)?(?://(?<netloc>[^/?#]*))?(?<path>(?:[^?#]+/)?[^?#]*)?(?:\\?(?<query>[^#]*))?(?:#(?<fragment>.*))?$");
# Reverse operation of either urlparse or urlsplit.
#
# Input:  an object with any of {scheme, netloc, path, params, query, fragment};
#         missing or null components are simply omitted from the result.
# Output: the URL as a string.
#
# The "//" authority marker is tied to the netloc rather than to the scheme, so
# that "mailto:user@host" is not turned into "mailto://user@host" and a
# scheme-relative "//host/path" keeps its leading slashes.
def urlunparse:
  (.path // "") as $path
  | (if .scheme then .scheme + ":" else "" end)
  + (
      if .netloc then "//" + .netloc
      # A scheme followed by an absolute path and no authority is written as
      # "scheme:///path" (e.g. file URLs), matching what the parser consumed.
      elif .scheme and ($path | startswith("/")) then "//"
      else "" end
    )
  + $path
  + (if .params then ";" + .params else "" end)
  + (if .query then "?" + .query else "" end)
  + (if .fragment then "#" + .fragment else "" end);
# Resolve a possibly relative URI into an absolute URI string.
#
# Input: a URL string, or an object shaped like the output of urlsplit.
# base:  the URL to resolve against, as a string or a urlsplit-style object.
#
# Empty-string components are treated like missing (null) ones when choosing
# how to resolve, so the result does not depend on whether the regex engine
# reports an empty match as "" or as null.
def urlresolve(base):
  # True when a URL component is actually set (neither null nor "").
  def present: . != null and . != "";
  (if type == "string" then urlsplit else . end) as $parsed
  # There is a scheme: this is an absolute URL. Strings are returned untouched;
  # already-parsed objects are serialized so the output is always a string.
  | if ($parsed.scheme | present) then
      (if type == "string" then . else ($parsed | urlunparse) end)
    else (
      (base | if type == "string" then urlsplit else . end) as $parsedbase
      | $parsed
      | if (.netloc | present) then (
          # No scheme but a domain: use the base's scheme
          .scheme = $parsedbase.scheme
        ) elif (.path | present) then (
          # No scheme and no domain: resolve the relative URI
          .scheme = $parsedbase.scheme
          | .netloc = $parsedbase.netloc
          # When the path does not start with a slash, make it relative to the base's path
          # Note that this assumes the base URL always points to a folder, even if it does not end with a /
          | if (.path | startswith("/")) then . else (
              .path = ((($parsedbase.path // "") | rtrimstr("/")) + "/" + .path)
            ) end
        ) elif (.query != null or .fragment != null) then (
          # Only a query and/or a fragment: keep the base's location as is
          .scheme = $parsedbase.scheme
          | .netloc = $parsedbase.netloc
          | .path = $parsedbase.path
          # A fragment-only reference keeps the base's query string (RFC 3986, section 5.2.2)
          | if .query == null then .query = $parsedbase.query else . end
        ) else . end
      | urlunparse
    ) end;

View File

@ -1,6 +1,7 @@
# JTSB aviation feed generator (English version) from http://www.mlit.go.jp/jtsb/airrep.html
# Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
import "./helpers" as helpers;
{
"rss": {
@ -31,7 +32,7 @@
+ " - "
+ .children[6].children[0].text
),
"link": ("http://www.mlit.go.jp/jtsb/" + .children[-1].children[0].href),
"link": (.children[-1].children[0].href|helpers::urlresolve("http://www.mlit.go.jp/jtsb/")),
"pubDate": (try (.children[1].children[0].text | strptime("%Y.%m.%d") | mktime | strftime("%a, %d %b %Y %T %z")))
}]
}

View File

@ -1,6 +1,7 @@
# JTSB marine feed generator (English version) from http://www.mlit.go.jp/jtsb/marrep.html
# Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
import "./helpers" as helpers;
{
"rss": {
@ -25,7 +26,7 @@
+ " - "
+ .children[5].children[0].text
),
"link": ("http://www.mlit.go.jp/jtsb/" + .children[-1].children[0].href),
"link": (.children[-1].children[0].href|helpers::urlresolve("http://www.mlit.go.jp/jtsb/")),
"pubDate": (try (.children[1].children[0].text | strptime("%Y.%m.%d") | mktime | strftime("%a, %d %b %Y %T %z")))
}]
}

View File

@ -1,6 +1,7 @@
# JTSB rail feed generator (English version) from http://www.mlit.go.jp/jtsb/railrep.html
# Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
import "./helpers" as helpers;
{
"rss": {
@ -25,7 +26,7 @@
+ " - "
+ .children[5].children[0].text
),
"link": ("http://www.mlit.go.jp/jtsb/" + .children[-1].children[0].href),
"link": (.children[-1].children[0].href|helpers::urlresolve("http://www.mlit.go.jp/jtsb/")),
"pubDate": (try (.children[1].children[0].text | strptime("%Y.%m.%d") | mktime | strftime("%a, %d %b %Y %T %z")))
}]
}

View File

@ -1,6 +1,7 @@
# TAIC feed generator from https://www.taic.org.nz/inquiries?order=field_publication_date&sort=desc
# Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
import "./helpers" as helpers;
{
"rss": {
@ -17,7 +18,7 @@
"item": [.[] | {
"title": .children[0].children[1].text,
"description": .children[0].children[2].text,
"link": ("https://www.taic.org.nz" + .children[0].children[1].href),
"link": (.children[0].children[1].href|helpers::urlresolve("https://www.taic.org.nz")),
"pubDate": (.children[3].children[0].datetime | fromdateiso8601 | strftime("%a, %d %b %Y %T %z"))
}]
}

View File

@ -5,6 +5,7 @@
# $link: Feed link
# Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
import "./helpers" as helpers;
{
"rss": {
@ -20,7 +21,7 @@
"generator": "ITSB",
"item": [.[] | {
"title": (.children[4].text + " - " + .children[2].text),
"link": ("http://www.uzpln.cz" + .children[5].children[0].href),
"link": (.children[5].children[0].href|helpers::urlresolve("http://www.uzpln.cz")),
"pubDate": (.children[0].text | strptime("%Y-%m-%d") | mktime | strftime("%a, %d %b %Y %T %z"))
}]
}