Rewrite NHC OPML builder with XQuery

This commit is contained in:
~lucidiot 2023-10-13 02:02:49 +02:00
parent 001ae0d3f1
commit 852326c558
2 changed files with 35 additions and 34 deletions

View File

@ -1,35 +1,5 @@
#!/bin/sh -e
# Scrape the NHC RSS index page and print its feeds as an OPML 2.0 document.
# node writes the document to stdout as JSON; oq then converts that JSON to XML.
node -e '
const baseUrl = "https://www.nhc.noaa.gov/aboutrss.shtml";

// Each feed is listed as an icon link, then the feed name as plain text,
// then a second link to the same feed.
const feedPattern = /<a[^>]*href="[^"]+\.xml"[^>]*>\s*<img[^>]*src="[^"]*gifs\/xml.gif"[^>]*>\s*<\/a>\s*(?<name>[^<]+)<a[^>]*href="(?<url>[^"]+\.xml)"[^>]*>/gi;

const namedEntities = { amp: "&", gt: ">", lt: "<", apos: "'"'"'", quot: "\"" };

// Decode every decimal and predefined entity. The g flags matter: without
// them only the first entity of each kind in a feed name would be decoded.
const decodeEntities = text => text
  .replace(/&#(\d+);/g, (_, code) => String.fromCharCode(Number.parseInt(code, 10)))
  .replace(/&(amp|gt|lt|apos|quot);/g, (_, entity) => namedEntities[entity]);

// Turn one regex match into the JSON shape of an OPML <outline> element.
const toOutline = ({ groups: { name, url } }) => {
  // Drop the trailing colon and whitespace that separate the name from the link.
  const text = decodeEntities(name.replace(/[:\s]*$/g, ""));
  return {
    "@type": "rss",
    "@text": text,
    "@xmlUrl": new URL(url, baseUrl).href,
    // The language is guessed from Spanish keywords in the feed name.
    "@language": ["Español", "Atlantico", "Cartera"].some(word => text.includes(word)) ? "es" : "en-us",
  };
};

require("node:https").get(baseUrl, res => {
  // Refuse to build an empty feed list out of an error page.
  if (res.statusCode !== 200) {
    console.error(`Unexpected HTTP status ${res.statusCode} for ${baseUrl}`);
    process.exit(1);
  }
  const chunks = [];
  res.on("data", chunk => chunks.push(chunk));
  res.on("end", () => {
    // The body is decoded as latin1 rather than UTF-8.
    const page = Buffer.concat(chunks).toString("latin1");
    process.stdout.write(JSON.stringify({
      opml: {
        "@version": "2.0",
        head: {
          title: "National Hurricane Center and Central Pacific Hurricane Center RSS feeds",
          dateModified: new Date().toUTCString(),
          ownerName: "lucidiot",
          ownerEmail: "lucidiot@envs.net",
          ownerId: "https://tilde.town/~lucidiot/contact.html",
          docs: "http://dev.opml.org/spec2.html",
        },
        body: {
          outline: Array.from(page.matchAll(feedPattern), toOutline),
        },
      },
    }), "utf-8");
  });
});
' | oq -i json -o xml .
# Locate the XQuery file relative to the parent of this script's directory.
# Doing this in plain assignments (rather than inside an argument) lets
# `sh -e` abort the script if the directory cannot be resolved.
script_dir=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd -P)
query_file="$(dirname "$script_dir")/xquery/nhc_opml.xqy"

# The page is encoded as latin1 but xidel does not convert it by itself.
# --show-error keeps curl's error message visible despite --silent.
# NOTE: the exit status of a pipeline is that of its last command, so `-e`
# alone does not stop on a curl failure.
curl --silent --show-error --fail -H 'User-Agent: RSRSSS/1.0 (+https://envs.net/~lucidiot/rsrsss/feed.xml)' https://www.nhc.noaa.gov/aboutrss.shtml |
iconv -f ISO-8859-1 -t UTF-8 |
xidel --silent - --extract-kind=xquery3 --extract-file="$query_file" --output-format=xml

31
xquery/nhc_opml.xqy Normal file
View File

@ -0,0 +1,31 @@
xquery version "3.0" encoding "utf-8";
(:
  Builds an OPML 2.0 subscription list from the NHC RSS index page.
  The HTML page is expected to be the context document; every feed link
  found in it becomes one outline element.
  dateModified is the current time shifted to UTC and formatted in the
  form "Fri, 13 Oct 2023 00:02:49 GMT".
:)
<opml version="2.0">
  <head>
    <title>National Hurricane Center and Central Pacific Hurricane Center RSS feeds</title>
    <dateModified>{format-dateTime(adjust-dateTime-to-timezone(current-dateTime(), xs:dayTimeDuration("PT0S")), "[FNn,*-3], [D01] [MNn,*-3] [Y0001] [H01]:[m01]:[s01] GMT")}</dateModified>
    <ownerName>lucidiot</ownerName>
    <ownerEmail>lucidiot@envs.net</ownerEmail>
    <ownerId>https://tilde.town/~lucidiot/contact.html</ownerId>
    <docs>http://dev.opml.org/spec2.html</docs>
  </head>
  <body>
  {
    (:
      Each feed is listed as a link with an RSS icon inside it,
      followed by a text node representing the feed name,
      followed by another link with the URL as its text.
      Start by picking the link with the image in it…
    :)
    for $feed in //a[ends-with(@href, ".xml")][./img]
    (:
      The text node that follows the icon link holds the feed name.
      Whitespace is normalized so that line breaks from the HTML source
      do not end up in the attribute value.
    :)
    let $label := normalize-space($feed/following-sibling::text()[1])
    (:
      The name is everything before the first colon. A label without any
      colon is kept whole, because substring-before would return an empty
      string for it.
    :)
    let $text :=
      if (contains($label, ":"))
      then normalize-space(substring-before($label, ":"))
      else $label
    (:
      The only way to determine the language of a feed without fetching
      the feed itself is to look for Spanish keywords in its name.
    :)
    return <outline
      type="rss"
      text="{$text}"
      xmlUrl="{resolve-uri($feed/@href, 'https://www.nhc.noaa.gov/aboutrss.shtml')}"
      language="{if (some $word in ('Español', 'Atlantico', 'Cartera') satisfies contains($text, $word)) then 'es' else 'en-us'}"
    />
  }
  </body>
</opml>