Compare commits

...

3 Commits

Author SHA1 Message Date
~lucidiot 5ef1805fd8
Rewrite ATSB feed generator, closes #119
continuous-integration/drone/push Build is passing Details
2024-02-24 19:47:38 +00:00
~lucidiot d4317056f2
XQuery support via Xidel 2024-02-24 19:46:25 +00:00
~lucidiot f447ee9fc1
Fix DAAI feed 2024-02-20 22:33:05 +00:00
5 changed files with 198 additions and 16 deletions

View File

@ -298,12 +298,7 @@
<frequency>100 reports/year</frequency>
<url>https://www.atsb.gov.au/</url>
<feed format="rss" lang="English" id="atsb-rss">
<curl>
<url>https://www.atsb.gov.au/publications/safety-investigation-reports/?s=1&amp;sort=OccurrenceReleaseDate&amp;sortAscending=descending&amp;investigationStatus=Completed,Discontinued&amp;printAll=true</url>
</curl>
<pup>table.selectable_grid tr:not(.header)</pup>
<jq path="atsb.jq" />
<json2xml />
<xquery path="atsb.xqy" user-agent="a/1" />
<output>atsb.xml</output>
</feed>
</source>
@ -908,18 +903,13 @@
<region>Namibia</region>
<type>Aviation</type>
<frequency>3-5 reports/year</frequency>
<url>https://mwt.gov.na/directorate-of-aircraft-accident-and-incident-investigations</url>
<url>https://mwt.gov.na/web/mwt/aviation</url>
<feed format="rss" lang="English" id="daai-rss">
<curl>
<!-- The SSL verification fails on tilde.town for some reason, but it works properly in web browsers. -->
<url verify-ssl="false">https://mwt.gov.na/published-daai-report</url>
<url verify-ssl="false">https://mwt.gov.na/web/mwt/completed-investigations</url>
</curl>
<!--
We would need both :not(:first-child) and :not(:nth-child(2)) to
properly remove all the header rows since there are two of them,
but pup does not allow that so the jq script will ignore the first row it gets.
-->
<pup>#our-content tr:not(:first-child)</pup>
<pup>.journal-content-article tr</pup>
<jq path="daai.jq" />
<json2xml />
<output>daai.xml</output>

View File

@ -216,6 +216,87 @@
</xs:simpleContent>
</xs:complexType>
<!--
  Closed list of values allowed for the `output-format` attribute of the
  XQueryAction type. Only these five strings validate: xml, html, adhoc,
  xml-wrapped and json-wrapped. The base type is xs:string, so values are
  compared literally and case-sensitively.
-->
<xs:simpleType name="XidelOutputFormat">
<xs:annotation>
<xs:documentation>
Output format of a `xidel` command.
</xs:documentation>
</xs:annotation>
<xs:restriction base="xs:string">
<xs:enumeration value="xml">
<xs:annotation>
<xs:documentation>
XML document.
</xs:documentation>
</xs:annotation>
</xs:enumeration>
<xs:enumeration value="html">
<xs:annotation>
<xs:documentation>
XHTML document.
</xs:documentation>
</xs:annotation>
</xs:enumeration>
<xs:enumeration value="adhoc">
<xs:annotation>
<xs:documentation>
Human-readable representation. This will output the `text()` content of XML nodes, and JSON structures are output as they are, with indentation.
</xs:documentation>
</xs:annotation>
</xs:enumeration>
<xs:enumeration value="xml-wrapped">
<xs:annotation>
<xs:documentation>
Human-readable representation, embedded within an XML structure.
</xs:documentation>
</xs:annotation>
</xs:enumeration>
<xs:enumeration value="json-wrapped">
<xs:annotation>
<xs:documentation>
Human-readable representation, embedded within a JSON structure.
</xs:documentation>
</xs:annotation>
</xs:enumeration>
</xs:restriction>
</xs:simpleType>
<!--
  Type of the `xquery` command element. It has no child content, only
  attributes: `path` is mandatory, while `timeout`, `user-agent` and
  `output-format` are optional and carry schema defaults.
  NOTE(review): the generator stylesheet's itsb:xquery template hard-codes the
  same three fallback values (60, the itsb user agent string, xml) for when an
  attribute is absent; keep both places in sync when changing a default.
-->
<xs:complexType name="XQueryAction">
<xs:annotation>
<xs:documentation>
Run an XQuery script.
</xs:documentation>
</xs:annotation>
<xs:attribute name="path" type="xs:string" use="required">
<xs:annotation>
<xs:documentation>
Path to the XQuery script relative to the project's xquery/ directory.
</xs:documentation>
</xs:annotation>
</xs:attribute>
<!-- Non-negative integer: negative or fractional timeouts fail validation. -->
<xs:attribute name="timeout" type="xs:nonNegativeInteger" use="optional" default="60">
<xs:annotation>
<xs:documentation>
Maximum execution time for the script, in seconds. Set to 0 to disable.
</xs:documentation>
</xs:annotation>
</xs:attribute>
<xs:attribute name="user-agent" type="xs:string" use="optional" default="itsb/1.0 (+https://tilde.town/~lucidiot/itsb/)">
<xs:annotation>
<xs:documentation>
User-Agent header to send along with any HTTP requests.
</xs:documentation>
</xs:annotation>
</xs:attribute>
<!-- Restricted to the values enumerated by the XidelOutputFormat type. -->
<xs:attribute name="output-format" type="XidelOutputFormat" use="optional" default="xml">
<xs:annotation>
<xs:documentation>
Format to use to output the script's results.
</xs:documentation>
</xs:annotation>
</xs:attribute>
</xs:complexType>
<xs:group name="Command">
<xs:choice>
<xs:element name="curl" type="CurlCommand" />
@ -244,6 +325,7 @@
</xs:unique>
</xs:element>
<xs:element name="shell" type="ShellCommand" />
<xs:element name="xquery" type="XQueryAction" />
</xs:choice>
</xs:group>

View File

@ -16,7 +16,9 @@ import "./helpers" as helpers;
"ttl": 1440,
"generator": "ITSB",
"item": [
.[1:][].children
.[].children
# Only include rows that contain hyperlinks, to skip blank rows or headers
| select(try .[].children[].tag == "a")
| {
"title": "\(.[1].children[0].text) - \(.[2].children[0].text) \(.[3].children[0].children[0].text) - \(.[-1].children[-1].text)",
"link": (.[3].children[0].children[0].href | helpers::urlresolve("https://mwt.gov.na/published-daai-report")),

60
xquery/atsb.xqy Normal file
View File

@ -0,0 +1,60 @@
(:
  ATSB feed generator.

  Builds an RSS 2.0 document listing Australian Transport Safety Bureau
  investigation reports. The report listing page is requested once per
  investigation status id (163 and 168), the HTML response is parsed, and
  every body row of the table whose class contains "views-table" becomes one
  <item>:
    - title       text of the link in the 2nd cell
    - link, guid  resolved href of the link in the 2nd cell
    - description text of the link in the 1st cell
    - pubDate     first <time datetime="..."> found in the row
    - category    text of the 4th cell

  Layout note: the xsi:schemaLocation attribute value and the "headers" string
  literal span several lines on purpose; leading whitespace on their
  continuation lines would become part of the value, so do not indent them.
:)
<rss
version="2.0"
xmlns:admin="http://webns.net/mvcb/"
xmlns:atom="http://www.w3.org/2005/Atom"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
xmlns:webfeeds="http://webfeeds.org/rss/1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="xsd/rss.xsd"
xsi:schemaLocation="
http://purl.org/rss/1.0/modules/syndication/ xsd/syndication.xsd
http://webfeeds.org/rss/1.0 xsd/webfeeds.xsd
"
>
<channel>
<title>ATSB</title>
<description>Australian Transport Safety Bureau accident reports</description>
<link>https://www.atsb.gov.au/marine-investigation-reports?field_mode_of_transport_target_id=All</link>
<language>en-au</language>
<pubDate>{format-dateTime(adjust-dateTime-to-timezone(current-dateTime(), xs:dayTimeDuration("PT0S")), "[FNn,*-3], [D01] [MNn,*-3] [Y0001] [H01]:[m01]:[s01] GMT")}</pubDate>
<webMaster>lucidiot@envs.net (lucidiot)</webMaster>
<docs>http://www.rssboard.org/rss-specification</docs>
<ttl>1440</ttl>
<admin:errorReportsTo rdf:resource="mailto:lucidiot@envs.net" />
<admin:generatorAgent rdf:resource="https://tildegit.org/lucidiot/itsb/" />
<atom:link href="https://tilde.town/~lucidiot/itsb/feeds/atsb.xml" rel="self" type="application/rss+xml" />
<dc:format>application/rss+xml</dc:format>
<sy:updatePeriod>daily</sy:updatePeriod>
<sy:updateFrequency>1</sy:updateFrequency>
<sy:updateBase>1990-01-01T15:00+00:00</sy:updateBase>
<webfeeds:partial>true</webfeeds:partial>
<webfeeds:deprecated>false</webfeeds:deprecated>
{
(: One request per investigation status id; the id is spliced into the query string. :)
(: NOTE(review): the request header values below look deliberately bogus, presumably a workaround for the remote site. Confirm before "fixing" them. :)
(
163, (: Completed :)
168 (: Discontinued :)
) ! x:parse-html(x:request({
"url": concat("https://www.atsb.gov.au/marine-investigation-reports?field_investigation_status_target_id=", ., "&amp;field_mode_of_transport_target_id=All"),
"headers": "Accept-Encoding: lol
Accept-Language: zz,
Cache-Control: lol"
})/raw)//table[contains(@class, "views-table")]/tbody/tr
! <item>
<title>{./td[2]/a/text()}</title>
<link>{fn:resolve-uri(./td[2]/a/@href)}</link>
<guid>{fn:resolve-uri(./td[2]/a/@href)}</guid>
<description>{./td[1]/a/text()}</description>
<pubDate>{
(:
  The parentheses matter: `.//time/@datetime[1]` would select the first
  datetime attribute of EVERY time element, so a row with two time elements
  would hand a sequence to xs:dateTime() and abort the whole feed.
  `(.//time/@datetime)[1]` keeps only the first one in document order.
  The value is also shifted to UTC before formatting, because the picture
  string ends with a literal "GMT"; a value without a timezone is treated as
  already being in UTC.
:)
format-dateTime(adjust-dateTime-to-timezone(xs:dateTime((.//time/@datetime)[1]), xs:dayTimeDuration("PT0S")), "[FNn,*-3], [D01] [MNn,*-3] [Y0001] [H01]:[m01]:[s01] GMT")
}</pubDate>
<category domain="https://www.atsb.gov.au/">{./td[4]/text()}</category>
</item>
}
</channel>
</rss>

View File

@ -95,6 +95,15 @@ if ! command -v json2xml >/dev/null 2>&1; then
fi
]]></xsl:text>
<!--
  Dependency check for xidel, written into the generated shell script only
  when the input document contains at least one itsb:xquery element anywhere.
  The emitted shell code prints two messages to stderr and exits with status 1
  if the `xidel` command cannot be found.
-->
<xsl:if test="//itsb:xquery">
<xsl:text><![CDATA[
if ! command -v xidel >/dev/null 2>&1; then
echo "xidel is not installed or available in \$PATH." >&2
echo "See <https://www.videlibri.de/xidel.html#downloads> for installation instructions." >&2
exit 1
fi
]]></xsl:text>
</xsl:if>
<xsl:apply-templates
select="//itsb:link[@verify-ssl = 'false' or @verify-ssl = '0'][generate-id() = generate-id(key('ssl-hosts', substring-before(substring-after(text(), 'https://'), '/'))[1])]"
mode="check"
@ -128,7 +137,7 @@ rm "$DIR/.itsb-feedgen"]]></xsl:text>
</xsl:text>
</xsl:if>
<xsl:for-each select="./itsb:curl|./itsb:jq|./itsb:pup|./itsb:json2xml|./itsb:xml2json|./itsb:shell|./itsb:output">
<xsl:for-each select="./itsb:curl|./itsb:jq|./itsb:pup|./itsb:json2xml|./itsb:xml2json|./itsb:shell|./itsb:xquery|./itsb:output">
<xsl:apply-templates select="." />
<xsl:if test="not(position()=last())">
<xsl:text> \
@ -263,6 +272,45 @@ rm "$DIR/.itsb-feedgen"]]></xsl:text>
<xsl:value-of select="text()" />
</xsl:template>
<!--
  Renders one itsb:xquery step as a shell command: xidel, wrapped in
  `timeout`, running the script found under xquery/ at the given path.
  Each optional attribute (timeout, user-agent, output-format) is written
  as-is when present; otherwise a hard-coded fallback is written instead.
  The user agent and the script path go through the `escape` named template
  because they are placed inside single-quoted shell words.
-->
<xsl:template match="itsb:xquery">
  <!-- Time limit in seconds: attribute value, or 60 when the attribute is absent. -->
  <xsl:text>timeout </xsl:text>
  <xsl:value-of select="@timeout" />
  <xsl:if test="not(@timeout)">
    <xsl:text>60</xsl:text>
  </xsl:if>
  <xsl:text> xidel --silent --trace-stack --wait=1 --user-agent='</xsl:text>
  <xsl:call-template name="escape">
    <xsl:with-param name="text">
      <!-- Attribute value, or the project's default user agent when absent. -->
      <xsl:value-of select="@user-agent" />
      <xsl:if test="not(@user-agent)">
        <xsl:text>itsb/1.0 (+https://tilde.town/~lucidiot/itsb/)</xsl:text>
      </xsl:if>
    </xsl:with-param>
  </xsl:call-template>
  <xsl:text>' --output-format=</xsl:text>
  <!-- Output format: attribute value, or xml when the attribute is absent. -->
  <xsl:value-of select="@output-format" />
  <xsl:if test="not(@output-format)">
    <xsl:text>xml</xsl:text>
  </xsl:if>
  <xsl:text> --extract-kind=xquery3 --extract-file='xquery/</xsl:text>
  <xsl:call-template name="escape">
    <xsl:with-param name="text" select="@path" />
  </xsl:call-template>
  <xsl:text>'</xsl:text>
</xsl:template>
<xsl:template match="itsb:output">
<xsl:text>> $DIR/feeds/</xsl:text>
<xsl:value-of select="text()" />