Add AAIA custom feeds, close #37

This commit is contained in:
Lucidiot 2021-03-20 00:34:25 +01:00
parent 0685bdac52
commit 8f47f99818
Signed by: lucidiot
GPG Key ID: 3358C1CA6906FB8D
3 changed files with 131 additions and 0 deletions

View File

@ -1127,5 +1127,48 @@
<output>oeaif/de.xml</output>
</feed>
</source>
<!--
Air Accident Investigation Authority (Hong Kong) custom feeds.
Declares three feeds, one per language version of the same investigation
reports page (the eng/, sc/ and tc/ URLs below). All three share the same
pup selector and the same aaia.jq script; they differ only in the page URL,
the "lang" and "link" arguments handed to aaia.jq, and the output path.
NOTE(review): the child elements of each feed presumably run in document
order (curl, pup, jq, json2xml) - confirm against the feed generator.
-->
<source id="aaia">
<name>Air Accident Investigation Authority</name>
<region>Hong Kong</region>
<type>Aviation</type>
<frequency>5-10 reports/year</frequency>
<!-- English feed: "lang" is "en", which makes aaia.jq parse dates with strptime instead of the Chinese date helper. -->
<feed type="aviation" lang="English" format="rss" id="aaia-en">
<curl>
<url>https://www.thb.gov.hk/aaia/eng/investigation_reports/index.htm</url>
</curl>
<!-- Selects every row but the first of the tables nested in cells that have no colspan attribute. -->
<pup>td:not([colspan]) table tr:not(:first-child)</pup>
<jq path="aaia.jq">
<arg name="lang">en</arg>
<!-- Same URL as the one fetched above; aaia.jq uses it as the channel link and as the base for resolving report links. -->
<arg name="link">https://www.thb.gov.hk/aaia/eng/investigation_reports/index.htm</arg>
</jq>
<json2xml />
<output>aaia/en.xml</output>
</feed>
<!-- Simplified Chinese feed: same steps as the English feed, against the sc/ page. -->
<feed type="aviation" lang="Simplified Chinese" format="rss" id="aaia-zh-hans">
<curl>
<url>https://www.thb.gov.hk/aaia/sc/investigation_reports/index.htm</url>
</curl>
<pup>td:not([colspan]) table tr:not(:first-child)</pup>
<jq path="aaia.jq">
<arg name="lang">zh-Hans</arg>
<arg name="link">https://www.thb.gov.hk/aaia/sc/investigation_reports/index.htm</arg>
</jq>
<json2xml />
<output>aaia/zh-hans.xml</output>
</feed>
<!-- Traditional Chinese feed: same steps as the English feed, against the tc/ page. -->
<feed type="aviation" lang="Traditional Chinese" format="rss" id="aaia-zh-hant">
<curl>
<url>https://www.thb.gov.hk/aaia/tc/investigation_reports/index.htm</url>
</curl>
<pup>td:not([colspan]) table tr:not(:first-child)</pup>
<jq path="aaia.jq">
<arg name="lang">zh-Hant</arg>
<arg name="link">https://www.thb.gov.hk/aaia/tc/investigation_reports/index.htm</arg>
</jq>
<json2xml />
<output>aaia/zh-hant.xml</output>
</feed>
</source>
</section>
</itsb>

50
jq/aaia.jq Normal file
View File

@ -0,0 +1,50 @@
# AAIA feed generator
# Expects pup JSON output holding <tr> tags, outputs xmltodict-compatible JSON
# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
# Expected variables:
# $lang: Language code, without the country code (en/zh-Hans/zh-Hant).
# $link: Link to the investigation list.
import "./helpers" as helpers;
# Main filter: turns the array of table rows produced by pup into a single
# RSS 2.0 document, shaped as an object that xmltodict can serialise.
# Within each row, cell 0 is read as the publication date, cell 1 as the
# title, and every cell from index 2 onwards is searched for report links.
{
  "rss": {
    "@version": "2.0",
    "channel": {
      "title": "AAIA (\($lang))",
      "description": "Air Accident Investigation Authority reports",
      "link": $link,
      "language": "\($lang)-HK",
      "pubDate": (now | strftime("%a, %d %b %Y %T %z")),
      "docs": "https://www.rssboard.org/rss-specification",
      "ttl": 1440,
      "generator": "ITSB",
      "item": [
        # One output per row: the list of its cells. Rows whose children are
        # null are skipped by `//`; if every row is like that, `[]` is emitted.
        .[].children // []
        # Ignore empty rows. The comparison must be explicit: jq treats every
        # value except false and null as true, so a bare `select(length)`
        # would let empty arrays through and the date parsing below would
        # then fail on null.
        | select(length > 0)
        | . as $data
        # Grab all the report links, as we will use the last one as the item
        # link and put all of them in the description.
        | [
            .[2:][].children
            | ..
            | select(.tag? == "a")
            # Links may be relative; resolve them against the list page URL.
            | .href |= helpers::urlresolve($link)
          ]
        | {
            # The text is either directly on the cell or on its first child.
            "title": ($data[1].text // $data[1].children[0].text),
            "link": .[-1].href,
            "description": (
              [.[] | "<li><a href=\"\(.href)\" target=\"_blank\">\(.text)</a></li>"]
              | join("")
              | "<ul>\(.)</ul>"
            ),
            "pubDate": (
              ($data[0].text // $data[0].children[0].text)
              # English pages use "day month-name year"; both Chinese variants
              # go through the dedicated helper. Either way we end up with a
              # Unix timestamp that is then formatted as an RFC 822 date.
              | if $lang == "en" then
                  strptime("%d %B %Y") | mktime
                else
                  helpers::parse_chinese_date
                end
              | strftime("%a, %d %b %Y %T %z")
            )
          }
      ]
    }
  }
}

View File

@ -41,3 +41,41 @@ def urlresolve(base):
) else . end
| urlunparse
) end;
# Basic Chinese number parsing meant for Chinese date parsing.
# Input: a string made of Chinese numerals (零/〇, 一 to 九, 十) and/or ASCII digits.
# Output: the corresponding number.
# Only what dates need is supported: digit-by-digit numbers (二零二一 is 2021)
# and tens up to 九十九 (十 is 10, 十二 is 12, 二十 is 20, 二十八 is 28).
# Characters that are not in the table are passed through unchanged, so any
# other character (such as 百 or 千) makes `tonumber` raise an error.
def parse_chinese_number:
  . as $input
  | {
      "零": "0",
      # This key used to be the empty string, which can never match because
      # splitting on "" yields no empty elements; 〇 is the other common
      # way of writing zero in dates.
      "〇": "0",
      "一": "1",
      "二": "2",
      "三": "3",
      "四": "4",
      "五": "5",
      "六": "6",
      "七": "7",
      "八": "8",
      "九": "9",
      # 10 is ignored here as we parse digit by digit; its effect is
      # reinstated below depending on where it appears in the input.
      "十": ""
    } as $charmap
  # A leading 十 stands for "one ten" (十二 is 12), so it contributes the
  # digit 1. Without this, 十一 and 十二 were parsed as 1 and 2.
  | (if $input | startswith("十") then "1" else "" end) as $tens
  | $tens + ($input / "" | map($charmap[.] // .) | join(""))
  # Kept for backward compatibility: an empty input yields 1.
  | if . == "" then 1 else . end
  | tonumber
  # A trailing 十 has no digit after it to occupy the units position
  # (二十 became "2" and 十 became "1"), so shift the result by one decimal place.
  | if $input | endswith("十") then . * 10 else . end;
# Parse a Traditional or Simplified Chinese date into a Unix timestamp.
# Input: a string containing a date written as <year>年<month>月<day>日, where
# each part uses ASCII digits and/or Chinese numerals.
# Output: seconds since the Unix epoch for midnight UTC on that day.
# Produces no output at all when the input does not contain such a date.
def parse_chinese_date:
  # Left-pad a number to two digits so that the assembled string is strict
  # ISO 8601 instead of relying on strptime tolerating "2021-3-5".
  def pad2: tostring | if length < 2 then "0" + . else . end;
  capture("(?<year>[0123456789零一二三四五六七八九十]+)年(?<month>[0123456789零一二三四五六七八九十]+)月(?<day>[0123456789零一二三四五六七八九十]+)日")
  | map_values(parse_chinese_number)
  # Years below 1900 are assumed to be counted from 1911 (presumably the
  # Minguo calendar) rather than being Gregorian years, since we never expect
  # reports dated before 1900. The heuristic stops working once such a year
  # count reaches 1900, i.e. in Gregorian year 3811.
  | if .year < 1900 then .year += 1911 else . end
  | "\(.year)-\(.month | pad2)-\(.day | pad2)T00:00:00Z"
  | fromdateiso8601;