Rewrite NHC OPML builder with XQuery
This commit is contained in:
parent
001ae0d3f1
commit
852326c558
|
@ -1,35 +1,5 @@
|
|||
#!/bin/sh -e
|
||||
# Scrape the NHC RSS index page with Node, build the OPML document as a JSON
# structure, then let oq convert that JSON to XML.
# The whole Node program sits inside one single-quoted shell string, so it must
# never contain a bare single quote (this includes the JavaScript comments).
node -e '
const baseUrl = "https://www.nhc.noaa.gov/aboutrss.shtml";

// Named entities that may appear in a feed name. The unusual quoting on the
// apos entry closes the shell string, emits one quote character, and reopens it.
const entities = { amp: "&", gt: ">", lt: "<", apos: "'"'"'", quot: "\"" };

// Decode every numeric and named entity in a single global pass.
// A single pass means already-decoded text is never decoded a second time,
// and the g flag means entities after the first one are no longer left as-is.
const decodeEntities = value =>
  value.replace(/&(?:#(\d+)|(amp|gt|lt|apos|quot));/g, (_, code, entity) =>
    code !== undefined ? String.fromCharCode(parseInt(code, 10)) : entities[entity]);

require("node:https").get(baseUrl, res => {
  const chunks = [];
  res.on("data", chunk => chunks.push(chunk));
  res.on("end", () => {
    // The body is decoded as latin1 before being searched.
    const page = Buffer.concat(chunks).toString("latin1");
    // Each feed is an icon link, then the feed name as plain text, then a
    // second link pointing at the feed URL.
    const feeds = page.matchAll(/<a[^>]*href="[^"]+\.xml"[^>]*>\s*<img[^>]*src="[^"]*gifs\/xml.gif"[^>]*>\s*<\/a>\s*(?<name>[^<]+)<a[^>]*href="(?<url>[^"]+\.xml)"[^>]*>/gi);

    process.stdout.write(JSON.stringify({
      opml: {
        "@version": "2.0",
        head: {
          title: "National Hurricane Center and Central Pacific Hurricane Center RSS feeds",
          dateModified: new Date().toUTCString(),
          ownerName: "lucidiot",
          ownerEmail: "lucidiot@envs.net",
          ownerId: "https://tilde.town/~lucidiot/contact.html",
          docs: "http://dev.opml.org/spec2.html"
        },
        body: {
          outline: Array.from(feeds, ({ groups: { name, url } }) => {
            // Drop the trailing colon and whitespace, then decode entities.
            const text = decodeEntities(name.replace(/[:\s]*$/g, ""));
            return {
              "@type": "rss",
              "@text": text,
              "@xmlUrl": new URL(url, baseUrl).href,
              // The language is guessed from Spanish keywords in the feed name.
              "@language": ["Español", "Atlantico", "Cartera"].some(word => text.includes(word)) ? "es" : "en-us"
            };
          })
        }
      }
    }), "utf-8");
  });
});
' | oq -i json -o xml .
|
||||
# The page is encoded as latin1 but xidel does not convert it by itself,
# so iconv re-encodes it to UTF-8 before xidel parses it.
#
# The page is downloaded to a temporary file instead of being piped straight
# from curl: a pipeline only reports the exit status of its last command, so a
# failed download (network error, or an HTTP error caught by --fail) would
# otherwise go unnoticed and xidel would run on empty input.
page="$(mktemp)" || exit
trap 'rm -f "$page"' EXIT
curl --silent --fail -H 'User-Agent: RSRSSS/1.0 (+https://envs.net/~lucidiot/rsrsss/feed.xml)' --output "$page" https://www.nhc.noaa.gov/aboutrss.shtml || exit

# The query file is looked up in the xquery/ directory that sits next to the
# directory containing this script.
iconv -f ISO-8859-1 -t UTF-8 "$page" |
    xidel --silent - --extract-kind=xquery3 --extract-file="$(dirname "$(CDPATH= cd -- "$(dirname -- "$0")" && pwd -P)")/xquery/nhc_opml.xqy" --output-format=xml
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
xquery version "3.0" encoding "utf-8";
(:
  Builds an OPML 2.0 subscription list from the NHC RSS index page.
  The page is the context document the query runs against, and relative feed
  URLs are resolved against its address.
  dateModified holds the current time shifted to UTC, laid out as
  "Day, DD Mon YYYY HH:MM:SS GMT".
:)
<opml version="2.0">
  <head>
    <title>National Hurricane Center and Central Pacific Hurricane Center RSS feeds</title>
    <dateModified>{format-dateTime(adjust-dateTime-to-timezone(current-dateTime(), xs:dayTimeDuration("PT0S")), "[FNn,*-3], [D01] [MNn,*-3] [Y0001] [H01]:[m01]:[s01] GMT")}</dateModified>
    <ownerName>lucidiot</ownerName>
    <ownerEmail>lucidiot@envs.net</ownerEmail>
    <ownerId>https://tilde.town/~lucidiot/contact.html</ownerId>
    <docs>http://dev.opml.org/spec2.html</docs>
  </head>
  <body>
    {
      (:
        Each feed is listed as a link with an RSS icon inside it,
        followed by a text node representing the feed name,
        followed by another link with the URL as its text.
        Start by picking the link with the image in it…
      :)
      for $feed in //a[ends-with(@href, ".xml")][./img]
      (:
        The feed name is the first text node after the icon link.
        Whitespace is collapsed so that the indentation and line breaks of the
        HTML source do not leak into the attribute value.
      :)
      let $label := normalize-space($feed/following-sibling::text()[1])
      (:
        Keep only what precedes the colon separating the name from the URL link.
        When there is no colon, keep the whole label instead of an empty string.
      :)
      let $text :=
        if (contains($label, ":"))
        then normalize-space(substring-before($label, ":"))
        else $label
      (: The only way to determine the feed's language without fetching the feeds themselves is to look for Spanish keywords. :)
      return <outline
        type="rss"
        text="{$text}"
        xmlUrl="{resolve-uri($feed/@href, 'https://www.nhc.noaa.gov/aboutrss.shtml')}"
        language="{if (some $word in ('Español', 'Atlantico', 'Cartera') satisfies contains($text, $word)) then 'es' else 'en-us'}"
      />
    }
  </body>
</opml>
|
Loading…
Reference in New Issue