URL management helpers, closes #5
This commit is contained in:
parent
1c7fdf93b4
commit
8b06e0eb6f
16
feedgen.sh
16
feedgen.sh
|
@ -29,14 +29,14 @@ fi
|
|||
log Building ATSB feed to $DIR/feeds/atsb.xml
|
||||
curl -s 'https://www.atsb.gov.au/publications/safety-investigation-reports/?s=1&sort=OccurrenceReleaseDate&sortAscending=descending&investigationStatus=Completed,Discontinued&printAll=true' \
|
||||
| pup 'table.selectable_grid tr:not(.header) json{}' \
|
||||
| jq -f $DIR/jq/atsb.jq \
|
||||
| jq -L $DIR/jq -f $DIR/jq/atsb.jq \
|
||||
| json2xml > $DIR/feeds/atsb.xml.new \
|
||||
&& mv $DIR/feeds/atsb.xml.new $DIR/feeds/atsb.xml
|
||||
|
||||
log Building TAIC feed to $DIR/feeds/taic.xml
|
||||
curl -s 'https://www.taic.org.nz/inquiries?order=field_publication_date&sort=desc' \
|
||||
| pup '#view-table-wrapper tbody tr json{}' \
|
||||
| jq -f $DIR/jq/taic.jq \
|
||||
| jq -L $DIR/jq -f $DIR/jq/taic.jq \
|
||||
| json2xml > $DIR/feeds/taic.xml.new \
|
||||
&& mv $DIR/feeds/taic.xml.new $DIR/feeds/taic.xml
|
||||
|
||||
|
@ -44,21 +44,21 @@ log Building JTSB Aviation English feed to $DIR/feeds/jtsb/en/air.xml
|
|||
mkdir -p $DIR/feeds/jtsb/en
|
||||
curl -s 'https://www.mlit.go.jp/jtsb/airrep.html' \
|
||||
| pup 'table.kankokuiken-en tr:not(:first-child) json{}' \
|
||||
| jq -f $DIR/jq/jtsb/en/air.jq \
|
||||
| jq -L $DIR/jq -f $DIR/jq/jtsb/en/air.jq \
|
||||
| json2xml > $DIR/feeds/jtsb/en/air.xml.new \
|
||||
&& mv $DIR/feeds/jtsb/en/air.xml.new $DIR/feeds/jtsb/en/air.xml
|
||||
|
||||
log Building JTSB Rail English feed to $DIR/feeds/jtsb/en/rail.xml
|
||||
curl -s 'https://www.mlit.go.jp/jtsb/railrep.html' \
|
||||
| pup 'table.kankokuiken-en tr:not(:first-child) json{}' \
|
||||
| jq -f $DIR/jq/jtsb/en/rail.jq \
|
||||
| jq -L $DIR/jq -f $DIR/jq/jtsb/en/rail.jq \
|
||||
| json2xml > $DIR/feeds/jtsb/en/rail.xml.new \
|
||||
&& mv $DIR/feeds/jtsb/en/rail.xml.new $DIR/feeds/jtsb/en/rail.xml
|
||||
|
||||
log Building JTSB Marine English feed to $DIR/feeds/jtsb/en/marine.xml
|
||||
curl -s 'https://www.mlit.go.jp/jtsb/marrep.html' \
|
||||
| pup 'table.kankokuiken-en tr:not(:first-child) json{}' \
|
||||
| jq -f $DIR/jq/jtsb/en/marine.jq \
|
||||
| jq -L $DIR/jq -f $DIR/jq/jtsb/en/marine.jq \
|
||||
| json2xml > $DIR/feeds/jtsb/en/marine.xml.new \
|
||||
&& mv $DIR/feeds/jtsb/en/marine.xml.new $DIR/feeds/jtsb/en/marine.xml
|
||||
|
||||
|
@ -134,7 +134,8 @@ log Building UZPLN English feed to $DIR/feeds/uzpln/en.xml
|
|||
mkdir -p $DIR/feeds/uzpln
|
||||
curl -s 'https://www.uzpln.cz/en/reports' \
|
||||
| pup 'table.table tbody tr:not(:first-child) json{}' \
|
||||
| jq -f $DIR/jq/uzpln.jq \
|
||||
| jq -L $DIR/jq \
|
||||
-f $DIR/jq/uzpln.jq \
|
||||
--arg language 'en' \
|
||||
--arg description 'Air Accidents Investigation Institute' \
|
||||
--arg link 'https://www.uzpln.cz/en/reports' \
|
||||
|
@ -144,7 +145,8 @@ curl -s 'https://www.uzpln.cz/en/reports' \
|
|||
log Building UZPLN Czech feed to $DIR/feeds/uzpln/cz.xml
|
||||
curl -s 'https://www.uzpln.cz/zpravy-ln' \
|
||||
| pup 'table.table tbody tr:not(:first-child) json{}' \
|
||||
| jq -f $DIR/jq/uzpln.jq \
|
||||
| jq -L $DIR/jq \
|
||||
-f $DIR/jq/uzpln.jq \
|
||||
--arg language 'cz' \
|
||||
--arg description 'Ústav pro odborné Zjišťování Příčin Leteckých Nehod' \
|
||||
--arg link 'https://www.uzpln.cz/zpravy-ln' \
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
# ATSB feed generator
|
||||
# Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
|
||||
# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
|
||||
import "./helpers" as helpers;
|
||||
|
||||
{
|
||||
"rss": {
|
||||
|
@ -17,7 +18,7 @@
|
|||
"item": [.[] | {
|
||||
"title": .children[0].children[0].text,
|
||||
"description": .children[1].text,
|
||||
"link": ("https://www.atsb.gov.au" + .children[0].children[0].href),
|
||||
"link": (.children[0].children[0].href|helpers::urlresolve("https://www.atsb.gov.au")),
|
||||
"pubDate": (try (.children[4].text | strptime("%d %b %Y") | mktime | strftime("%a, %d %b %Y %T %z")))
|
||||
}]
|
||||
}
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
# Run `regex` against the input string and gather its named capture groups
# into one object of the form {group name: matched text}.
# Captures that carry no name are left out of the result.
def regex_capture(regex):
  match(regex)
  | .captures
  | map(select(.name) | { key: .name, value: .string })
  | from_entries;
|
||||
|
||||
# Break a URL string down into an object with the keys
# {scheme, netloc, path, params, query, fragment}.
# Path parameters (the ";params" suffix of the path) get their own key.
# Modelled on Python's urllib.parse.urlparse.
def urlparse:
  "^(?:(?<scheme>[^:/?#]+):)?(?://(?<netloc>[^/?#]*))?(?:(?<path>(?:[^?#]+/)?[^?#;]*)(?:;(?<params>[^?#/]*))?)?(?:\\?(?<query>[^#]*))?(?:#(?<fragment>.*))?$" as $url_pattern
  | regex_capture($url_pattern);
|
||||
|
||||
# Break a URL string down into an object with the keys
# {scheme, netloc, path, query, fragment}.
# Unlike urlparse, path parameters stay inside the path.
# Modelled on Python's urllib.parse.urlsplit.
def urlsplit:
  "^(?:(?<scheme>[^:/?#]+):)?(?://(?<netloc>[^/?#]*))?(?<path>(?:[^?#]+/)?[^?#]*)?(?:\\?(?<query>[^#]*))?(?:#(?<fragment>.*))?$" as $url_pattern
  | regex_capture($url_pattern);
|
||||
|
||||
# Reassemble a URL string from an object shaped like the output of urlparse
# or urlsplit.
# Input: an object with optional scheme, netloc, path, params, query and
#        fragment keys; a component that is missing or null is left out.
# Output: the recomposed URL string.
def urlunparse:
  # The scheme is terminated by ":" alone; the "//" marker belongs to the
  # network location. Attaching "//" to the scheme would turn
  # "mailto:someone" into "mailto://someone" and "//host/path" into
  # "host/path", so the result would no longer parse back to the same parts.
  # NOTE(review): if the jq release in use reports an empty capture as null,
  # "scheme:///path" comes back as "scheme:/path" -- confirm this is acceptable.
  (if .scheme then .scheme + ":" else "" end)
  + (if .netloc then "//" + .netloc else "" end)
  + (.path // "")
  + (if .params then ";" + .params else "" end)
  + (if .query then "?" + .query else "" end)
  + (if .fragment then "#" + .fragment else "" end);
|
||||
|
||||
# Resolve a possibly relative URL into an absolute one.
# Input: the URL to resolve, either a string or an object already split into
#        components (as produced by urlsplit).
# base:  the URL to resolve against, as a string or an already split object.
# Output: the input itself, untouched, when it already carries a scheme;
#         otherwise the resolved URL as a string.
def urlresolve(base):
  (if type == "string" then urlsplit else . end) as $ref
  | if $ref.scheme then
      # A scheme is present: this is already an absolute URL.
      .
    else (
      base | (if type == "string" then urlsplit else . end) as $root
      | $ref
      | if .netloc then
          # No scheme but a domain ("//host/path"): borrow the base's scheme.
          .scheme = $root.scheme
        elif (.path | type == "string" and length > 0) then
          # No scheme and no domain: resolve the relative path.
          # The path must be non-empty. NOTE(review): depending on the jq
          # release, a group that matched the empty string is reported as ""
          # or as null; a bare truthiness test would send "?query" and
          # "#fragment" references here and append a spurious "/".
          .scheme = $root.scheme
          | .netloc = $root.netloc
          # A path without a leading slash is made relative to the base's path.
          # This assumes the base URL always points to a folder, even if it
          # does not end with a "/".
          | if .path | startswith("/") then
              .
            else
              .path = (($root.path | rtrimstr("/")) + "/" + ($ref.path | ltrimstr("/")))
            end
        elif (.query // .fragment) then
          # Only a query and/or fragment: keep the base's location and path.
          .scheme = $root.scheme
          | .netloc = $root.netloc
          | .path = $root.path
        else
          .
        end
      | urlunparse
    ) end;
|
|
@ -1,6 +1,7 @@
|
|||
# JTSB aviation feed generator (English version) from http://www.mlit.go.jp/jtsb/airrep.html
|
||||
# Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
|
||||
# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
|
||||
import "./helpers" as helpers;
|
||||
|
||||
{
|
||||
"rss": {
|
||||
|
@ -31,7 +32,7 @@
|
|||
+ " - "
|
||||
+ .children[6].children[0].text
|
||||
),
|
||||
"link": ("http://www.mlit.go.jp/jtsb/" + .children[-1].children[0].href),
|
||||
"link": (.children[-1].children[0].href|helpers::urlresolve("http://www.mlit.go.jp/jtsb/")),
|
||||
"pubDate": (try (.children[1].children[0].text | strptime("%Y.%m.%d") | mktime | strftime("%a, %d %b %Y %T %z")))
|
||||
}]
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
# JTSB marine feed generator (English version) from http://www.mlit.go.jp/jtsb/marrep.html
|
||||
# Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
|
||||
# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
|
||||
import "./helpers" as helpers;
|
||||
|
||||
{
|
||||
"rss": {
|
||||
|
@ -25,7 +26,7 @@
|
|||
+ " - "
|
||||
+ .children[5].children[0].text
|
||||
),
|
||||
"link": ("http://www.mlit.go.jp/jtsb/" + .children[-1].children[0].href),
|
||||
"link": (.children[-1].children[0].href|helpers::urlresolve("http://www.mlit.go.jp/jtsb/")),
|
||||
"pubDate": (try (.children[1].children[0].text | strptime("%Y.%m.%d") | mktime | strftime("%a, %d %b %Y %T %z")))
|
||||
}]
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
# JTSB rail feed generator (English version) from http://www.mlit.go.jp/jtsb/railrep.html
|
||||
# Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
|
||||
# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
|
||||
import "./helpers" as helpers;
|
||||
|
||||
{
|
||||
"rss": {
|
||||
|
@ -25,7 +26,7 @@
|
|||
+ " - "
|
||||
+ .children[5].children[0].text
|
||||
),
|
||||
"link": ("http://www.mlit.go.jp/jtsb/" + .children[-1].children[0].href),
|
||||
"link": (.children[-1].children[0].href|helpers::urlresolve("http://www.mlit.go.jp/jtsb/")),
|
||||
"pubDate": (try (.children[1].children[0].text | strptime("%Y.%m.%d") | mktime | strftime("%a, %d %b %Y %T %z")))
|
||||
}]
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
# TAIC feed generator from https://www.taic.org.nz/inquiries?order=field_publication_date&sort=desc
|
||||
# Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
|
||||
# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
|
||||
import "./helpers" as helpers;
|
||||
|
||||
{
|
||||
"rss": {
|
||||
|
@ -17,7 +18,7 @@
|
|||
"item": [.[] | {
|
||||
"title": .children[0].children[1].text,
|
||||
"description": .children[0].children[2].text,
|
||||
"link": ("https://www.taic.org.nz" + .children[0].children[1].href),
|
||||
"link": (.children[0].children[1].href|helpers::urlresolve("https://www.taic.org.nz")),
|
||||
"pubDate": (.children[3].children[0].datetime | fromdateiso8601 | strftime("%a, %d %b %Y %T %z"))
|
||||
}]
|
||||
}
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
# $link: Feed link
|
||||
# Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
|
||||
# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
|
||||
import "./helpers" as helpers;
|
||||
|
||||
{
|
||||
"rss": {
|
||||
|
@ -20,7 +21,7 @@
|
|||
"generator": "ITSB",
|
||||
"item": [.[] | {
|
||||
"title": (.children[4].text + " - " + .children[2].text),
|
||||
"link": ("http://www.uzpln.cz" + .children[5].children[0].href),
|
||||
"link": (.children[5].children[0].href|helpers::urlresolve("http://www.uzpln.cz")),
|
||||
"pubDate": (.children[0].text | strptime("%Y-%m-%d") | mktime | strftime("%a, %d %b %Y %T %z"))
|
||||
}]
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue