diff --git a/itsb.xml b/itsb.xml
index 1f78e74..5dfa8dd 100644
--- a/itsb.xml
+++ b/itsb.xml
@@ -1127,5 +1127,48 @@
 oeaif/de.xml
+
+
+ Air Accident Investigation Authority
+ Hong Kong
+ Aviation
+ 5-10 reports/year
+
+
+ https://www.thb.gov.hk/aaia/eng/investigation_reports/index.htm
+
+ td:not([colspan]) table tr:not(:first-child)
+
+ en
+ https://www.thb.gov.hk/aaia/eng/investigation_reports/index.htm
+
+
+ aaia/en.xml
+
+
+
+ https://www.thb.gov.hk/aaia/sc/investigation_reports/index.htm
+
+ td:not([colspan]) table tr:not(:first-child)
+
+ zh-Hans
+ https://www.thb.gov.hk/aaia/sc/investigation_reports/index.htm
+
+
+ aaia/zh-hans.xml
+
+
+
+ https://www.thb.gov.hk/aaia/tc/investigation_reports/index.htm
+
+ td:not([colspan]) table tr:not(:first-child)
+
+ zh-Hant
+ https://www.thb.gov.hk/aaia/tc/investigation_reports/index.htm
+
+
+ aaia/zh-hant.xml
+
+
diff --git a/jq/aaia.jq b/jq/aaia.jq
new file mode 100644
index 0000000..5598b84
--- /dev/null
+++ b/jq/aaia.jq
@@ -0,0 +1,53 @@
+# AAIA feed generator
+# Expects pup JSON output holding table row (tr) tags, outputs xmltodict-compatible JSON
+# WARNING: Dates are locale-sensitive; the RSS feed might not generate correctly with another locale.
+# Expected variables:
+# $lang: Language code, without the country code (en/zh-Hans/zh-Hant).
+# $link: Link to the investigation list.
+import "./helpers" as helpers;
+
+{
+  "rss": {
+    "@version": "2.0",
+    "channel": {
+      "title": "AAIA (\($lang))",
+      "description": "Air Accident Investigation Authority reports",
+      "link": $link,
+      "language": "\($lang)-HK",
+      "pubDate": (now | strftime("%a, %d %b %Y %T %z")),
+      "docs": "https://www.rssboard.org/rss-specification",
+      "ttl": 1440,
+      "generator": "ITSB",
+      "item": [
+        .[].children // []
+        # Ignore empty lines. `select(length)` would keep them, because 0 is truthy in jq
+        # (only null and false are falsy), so compare the length explicitly.
+        | select(length > 0)
+        | . as $data
+        # Grab all the report links, as we will use the last one as the link and put all of them in the description
+        | [
+          .[2:][].children
+          | ..
+          | select(.tag? == "a")
+          | .href |= helpers::urlresolve($link)
+        ]
+        # $data[0] is used for the date and $data[1] for the title; either may hold its text directly or in its first child.
+        # The description is an HTML list of all links; @html escapes the interpolated href and text.
+        | {
+          "title": ($data[1].text // $data[1].children[0].text),
+          "link": .[-1].href,
+          "description": (
+            map(@html "<li><a href=\"\(.href)\">\(.text)</a></li>")
+            | join("")
+            | "<ul>\(.)</ul>"
+          ),
+          "pubDate": (
+            ($data[0].text // $data[0].children[0].text)
+            | if $lang == "en" then strptime("%d %B %Y") | mktime else helpers::parse_chinese_date end
+            | strftime("%a, %d %b %Y %T %z")
+          )
+        }
+      ]
+    }
+  }
+}
diff --git a/jq/helpers.jq b/jq/helpers.jq
index 1633ec6..c864a67 100644
--- a/jq/helpers.jq
+++ b/jq/helpers.jq
@@ -41,3 +41,49 @@ def urlresolve(base):
     ) else . end
     | urlunparse
   ) end;
+
+# Basic Chinese number parsing meant for Chinese date parsing.
+# Input: a string made of Arabic digits and/or the Chinese numerals 零〇一二三四五六七八九十.
+# Output: the corresponding number. 十 is only understood as the tens marker of a two-digit number (十, 十一, 二十, 二十八, ...).
+def parse_chinese_number:
+  . as $input
+  | {
+    "零": "0",
+    "〇": "0",
+    "一": "1",
+    "二": "2",
+    "三": "3",
+    "四": "4",
+    "五": "5",
+    "六": "6",
+    "七": "7",
+    "八": "8",
+    "九": "9",
+    # 10 is ignored here as we will parse number by number.
+    "十": ""
+  } as $charmap
+  | $input / ""
+  | map($charmap[.] // .)
+  | join("")
+  # A leading 十 carries an implicit 1 in the tens place: 十一 is 11, not 1.
+  # This also covers 十 alone, which becomes 1 here and is multiplied by 10 below.
+  | if ($input | startswith("十")) then "1" + . else . end
+  | tonumber
+  # Parsing number by number, ignoring 10, will work as long as there is a digit after 10:
+  # 二十八 works because we parse it as 二八 (2 and 8), but 二十 would yield 2 only,
+  # so we multiply manually by 10 when the number ends with 10.
+  | if ($input | endswith("十")) then . * 10 else . end;
+
+# Parse a Traditional or Simplified Chinese date into a Unix timestamp.
+# Input: a string containing a date such as 2021年3月5日 or 二〇二一年三月五日.
+# Output: the timestamp returned by fromdateiso8601 for midnight (00:00:00Z) of that day.
+def parse_chinese_date:
+  "[0123456789零〇一二三四五六七八九十]+" as $number
+  # The named groups feed the .year, .month and .day lookups below.
+  | capture("(?<year>\($number))年(?<month>\($number))月(?<day>\($number))日")
+  | map_values(parse_chinese_number)
+  # Handle the Chinese calendar by assuming we never would get dates before 1900.
+  # This will no longer work once that calendar reaches its own year 1900, i.e. in the year 3811.
+  | if .year < 1900 then .year += 1911 else . end
+  | "\(.year)-\(.month)-\(.day)T00:00:00Z"
+  | fromdateiso8601;