// This file is part of Snownews - A lightweight console RSS newsreader // // Copyright (c) 2003-2004 Rene Puls // Copyright (c) 2003-2004 Oliver Feiler // Copyright (c) 2021 Mike Sharov // // Snownews is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License version 3 // as published by the Free Software Foundation. // // Snownews is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. // See the GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with Snownews. If not, see http://www.gnu.org/licenses/. #include "parse.h" #include "feedio.h" #include "conv.h" #include //{{{ Local variables -------------------------------------------------- static bool saverestore = false; static struct newsitem* copy = NULL; static struct newsitem* firstcopy = NULL; static const char dcNs[] = "http://purl.org/dc/elements/1.1/"; static const char snowNs[] = "http://snownews.kcore.de/ns/1.0/"; static const char contentNs[] = "http://purl.org/rss/1.0/modules/content/"; //}}}------------------------------------------------------------------- //{{{ free_feed static void free_feed (struct feed* feed) { free (feed->title); free (feed->link); free (feed->description); if (feed->items) { while (feed->items->next) { feed->items = feed->items->next; free (feed->items->prev->data->title); free (feed->items->prev->data->link); free (feed->items->prev->data->description); free (feed->items->prev->data->hash); free (feed->items->prev->data); free (feed->items->prev); } free (feed->items->data->title); free (feed->items->data->link); free (feed->items->data->description); free (feed->items->data->hash); free (feed->items->data); free (feed->items); } feed->items = NULL; feed->title = NULL; feed->link = NULL; feed->description = NULL; } //}}}------------------------------------------------------------------- //{{{ libXML accessors // // libXML made the strange choice of making xmlChar an unsigned char. // Since all the text passed to it is char, a sea of casts results. static bool node_name_is (xmlNodePtr pn, const char* name) { return 0 == xmlStrcmp (pn->name, (const xmlChar*) name); } static bool node_ns_name_is (xmlNodePtr pn, const char* ns, const char* name) { return pn->ns && 0 == xmlStrcmp (pn->ns->href, (const xmlChar*) ns) && node_name_is (pn, name); } static void copy_node_text_to (xmlDocPtr doc, xmlNodePtr pn, char** pd, bool fullclean) { char* nt = (char*) xmlNodeListGetString (doc, pn->children, 1); if (nt) { if (!nt[0]) return free (nt); char* cnt = text_from_html (nt); if (!cnt) return free (nt); if (cnt[0]) { free (nt); nt = cnt; } else free (cnt); CleanupString (nt, fullclean); if (*pd) free (*pd); *pd = nt; } } static long number_from_node_text (xmlDocPtr doc, xmlNodePtr pn) { char* s = NULL; copy_node_text_to (doc, pn, &s, false); long v = atol(s); free (s); return v; } static time_t pubDate_from_node_text (xmlDocPtr doc, xmlNodePtr pn) { char* s = NULL; copy_node_text_to (doc, pn, &s, false); time_t t = pubDateToUnix (s); free (s); return t; } static time_t ISODate_from_node_text (xmlDocPtr doc, xmlNodePtr pn) { char* s = NULL; copy_node_text_to (doc, pn, &s, false); time_t t = ISODateToUnix (s); free (s); return t; } static void copy_node_prop_to (xmlNodePtr pn, const char* name, char** pd, bool fullclean) { if (*pd) xmlFree (*pd); *pd = (char*) xmlGetProp (pn, (const xmlChar*) name); CleanupString (*pd, fullclean); } //}}}------------------------------------------------------------------- //{{{ RSS 1 parsing // This function is called every time we hit an . As parameter it // needs the current newsfeed (struct newsfeed*), as well as the current // XML Document handle and the current element, both come directly from // the libxml. static void parse_rdf10_item (struct feed* feed, xmlDocPtr doc, xmlNodePtr node) { // Reserve memory for a new news item struct newsitem* item = calloc (1, sizeof (struct newsitem)); item->data = calloc (1, sizeof (struct newsdata)); item->data->parent = feed; char* guid = NULL; // Go through all the tags in the tag and extract the information. // same procedure as in the parse_channel() function for (xmlNodePtr cur = node; cur != NULL; cur = cur->next) { if (cur->type != XML_ELEMENT_NODE) continue; // Basic RSS if (node_name_is (cur, "title")) copy_node_text_to (doc, cur, &item->data->title, true); else if (node_name_is (cur, "link")) copy_node_text_to (doc, cur, &item->data->link, false); else if (node_name_is (cur, "description")) copy_node_text_to (doc, cur, &item->data->description, false); // Userland extensions (No namespace!) else if (node_name_is (cur, "guid")) copy_node_text_to (doc, cur, &guid, true); else if (node_name_is (cur, "pubDate")) item->data->date = pubDate_from_node_text (doc, cur); else if (node_name_is (cur, "readstatus")) item->data->readstatus = number_from_node_text (doc, cur); // content:encoded else if (node_ns_name_is (cur, contentNs, "encoded")) copy_node_text_to (doc, cur, &item->data->description, false); // Dublin Core dc:date else if (node_ns_name_is (cur, dcNs, "date")) item->data->date = ISODate_from_node_text (doc, cur); // Using snow namespace else if (node_ns_name_is (cur, snowNs, "hash")) copy_node_text_to (doc, cur, &item->data->hash, true); else if (node_ns_name_is (cur, snowNs, "date")) item->data->date = number_from_node_text (doc, cur); } // If we have loaded the hash from disk cache, don't regenerate it. // is not saved in the cache, thus we would generate a different // hash than the one from the live feed. if (!item->data->hash) { const char* hashitems[] = { item->data->title, item->data->link, guid, NULL }; item->data->hash = genItemHash (hashitems, 3); } if (!item->data->title) item->data->title = strdup (_("No title")); if (guid) { xmlFree (guid); guid = NULL; } // If saverestore == true, restore readstatus. if (saverestore) { for (struct newsitem* i = firstcopy; i; i = i->next) { if (strcmp (item->data->hash, i->data->hash) == 0) { item->data->readstatus = i->data->readstatus; break; } } } if (!feed->items) feed->items = item; else { item->prev = feed->items; while (item->prev->next) item->prev = item->prev->next; item->prev->next = item; } } // Called during parsing, if we look for a element // The function returns a new struct for the newsfeed. static void parse_rdf10_channel (struct feed* feed, xmlDocPtr doc, xmlNodePtr node) { // Free everything before we write to it again. free_feed (feed); // Go through all the tags in the tag and extract the information for (xmlNodePtr cur = node; cur; cur = cur->next) { if (cur->type != XML_ELEMENT_NODE) continue; if (node_name_is (cur, "title")) copy_node_text_to (doc, cur, &feed->title, true); else if (node_name_is (cur, "link")) copy_node_text_to (doc, cur, &feed->link, false); else if (node_name_is (cur, "description")) copy_node_text_to (doc, cur, &feed->description, false); } } //}}}------------------------------------------------------------------- //{{{ RSS 2 parsing static void parse_rdf20_channel (struct feed* feed, xmlDocPtr doc, xmlNodePtr node) { // Free everything before we write to it again. free_feed (feed); // Go through all the tags in the tag and extract the information for (xmlNodePtr cur = node; cur; cur = cur->next) { if (cur->type != XML_ELEMENT_NODE) continue; if (node_name_is (cur, "title")) copy_node_text_to (doc, cur, &feed->title, true); else if (node_name_is (cur, "link")) copy_node_text_to (doc, cur, &feed->link, false); else if (node_name_is (cur, "description")) copy_node_text_to (doc, cur, &feed->description, false); else if (node_name_is (cur, "item")) parse_rdf10_item (feed, doc, cur->children); } } //}}}------------------------------------------------------------------- //{{{ Atom parsing static void parse_atom_entry (struct feed* feed, xmlDocPtr doc, xmlNodePtr node) { // Reserve memory for a new news item struct newsitem* item = calloc (1, sizeof (struct newsitem)); item->data = calloc (1, sizeof (struct newsdata)); item->data->parent = feed; char* guid = NULL; // Go through all the tags in the tag and extract the information. // same procedure as in the parse_channel() function for (xmlNodePtr cur = node; cur != NULL; cur = cur->next) { if (cur->type != XML_ELEMENT_NODE) continue; if (node_name_is (cur, "title")) copy_node_text_to (doc, cur, &item->data->title, true); else if (node_name_is (cur, "link")) { char* rel = NULL; copy_node_prop_to (cur, "rel", &rel, false ); if (!rel || 0 == strcmp (rel, "alternate")) copy_node_prop_to (cur, "href", &item->data->link, false); free (rel); } else if (node_name_is (cur, "summary") && !item->data->description) copy_node_text_to (doc, cur, &item->data->description, false); else if (node_name_is (cur, "content")) copy_node_text_to (doc, cur, &item->data->description, false); else if (node_name_is (cur, "id")) copy_node_text_to (doc, cur, &guid, false); else if (node_name_is (cur, "updated")) item->data->date = ISODate_from_node_text (doc, cur); } // If we have loaded the hash from disk cache, don't regenerate it. // is not saved in the cache, thus we would generate a different // hash than the one from the live feed. if (!item->data->hash) { const char* hashitems[] = { item->data->title, item->data->link, guid, NULL }; item->data->hash = genItemHash (hashitems, 3); } if (!item->data->title) item->data->title = strdup (_("No title")); if (guid) { free (guid); guid = NULL; } // If saverestore == true, restore readstatus. if (saverestore) { for (const struct newsitem* i = firstcopy; i; i = i->next) { if (strcmp (item->data->hash, i->data->hash) == 0) { item->data->readstatus = i->data->readstatus; break; } } } if (!feed->items) feed->items = item; else { item->prev = feed->items; while (item->prev->next) item->prev = item->prev->next; item->prev->next = item; } } static void parse_atom_channel (struct feed* feed, xmlDocPtr doc, xmlNodePtr node) { // Free everything before we write to it again. free_feed (feed); // Go through all the tags in the tag and extract the information for (xmlNodePtr cur = node; cur; cur = cur->next) { if (cur->type != XML_ELEMENT_NODE) continue; if (node_name_is (cur, "title")) copy_node_text_to (doc, cur, &feed->title, true); else if (node_name_is (cur, "link")) copy_node_prop_to (cur, "href", &feed->link, false); else if (node_name_is (cur, "entry")) parse_atom_entry (feed, doc, cur->children); } } //}}}------------------------------------------------------------------- int DeXML (struct feed* cur_ptr) { if (!cur_ptr->xmltext) return -1; saverestore = false; // If cur_ptr-> items! = NULL then we can cache item->readstatus if (cur_ptr->items != NULL) { saverestore = true; firstcopy = NULL; // Copy current newsitem struct. */ for (struct newsitem * cur_item = cur_ptr->items; cur_item != NULL; cur_item = cur_item->next) { copy = calloc (1, sizeof (struct newsitem)); copy->data = calloc (1, sizeof (struct newsdata)); copy->data->readstatus = cur_item->data->readstatus; if (cur_item->data->hash) copy->data->hash = strdup (cur_item->data->hash); copy->next = NULL; if (!firstcopy) { copy->prev = NULL; firstcopy = copy; } else { copy->prev = firstcopy; while (copy->prev->next) copy->prev = copy->prev->next; copy->prev->next = copy; } } } // xmlRecoverMemory: // parse an XML in-memory document and build a tree. // In case the document is not Well Formed, a tree is built anyway. xmlDocPtr doc = xmlRecoverMemory (cur_ptr->xmltext, strlen (cur_ptr->xmltext)); if (!doc) return 2; // Find the root element (in our case, it should read ""). // The RDF: prefix is ignored for now until the Jaguar // Find out how to read that exactly (jau). xmlNodePtr cur = xmlDocGetRootElement (doc); if (!cur) { xmlFreeDoc (doc); return 2; } // Check if the element really is called if (node_name_is (cur, "RDF")) { // Now we go through all the elements in the document. This loop however, // only the highest level elements work (HTML would only be HEAD and // BODY), so do not wander entire structure down through. The functions // are responsible for this, which we then call in the loop itself. for (xmlNodePtr c = cur->children; c; c = c->next) { if (c->type != XML_ELEMENT_NODE) continue; if (node_name_is (c, "channel")) parse_rdf10_channel (cur_ptr, doc, c->children); if (node_name_is (c, "item")) parse_rdf10_item (cur_ptr, doc, c->children); // Last-Modified is only used when reading from internal feeds (disk cache). if (node_ns_name_is (c, snowNs, "lastmodified")) cur_ptr->lastmodified = number_from_node_text (doc, c); } } else if (node_name_is (cur, "rss")) { for (xmlNodePtr c = cur->children; c; c = c->next) { if (c->type != XML_ELEMENT_NODE) continue; if (node_name_is (c, "channel")) parse_rdf20_channel (cur_ptr, doc, c->children); } } else if (node_name_is (cur, "feed")) { parse_atom_channel (cur_ptr, doc, cur->children); } else { xmlFreeDoc (doc); return 3; } xmlFreeDoc (doc); if (saverestore) { // free struct newsitem *copy. while (firstcopy->next) { firstcopy = firstcopy->next; free (firstcopy->prev->data->hash); free (firstcopy->prev->data); free (firstcopy->prev); } free (firstcopy->data->hash); free (firstcopy->data); free (firstcopy); firstcopy = NULL; } if (cur_ptr->custom_title) { free (cur_ptr->title); cur_ptr->title = strdup (cur_ptr->custom_title); } else if (!cur_ptr->title) cur_ptr->title = strdup (_("No title")); if (cur_ptr->original) free (cur_ptr->original); cur_ptr->original = strdup (cur_ptr->title); return 0; } unsigned ParseOPMLFile (const char* flbuf) { unsigned nfeeds = 0; xmlDocPtr doc = xmlRecoverMemory (flbuf, strlen (flbuf)); if (!doc) return nfeeds; xmlNodePtr rootnode = xmlDocGetRootElement (doc); if (!rootnode) { xmlFreeDoc (doc); return nfeeds; } if (xmlStrcmp (rootnode->name, (const xmlChar*) "opml") == 0) { for (xmlNodePtr body = rootnode->children; body; body = body->next) { if (body->type != XML_ELEMENT_NODE || !node_name_is (body, "body")) continue; for (xmlNodePtr outline = body->children; outline; outline = outline->next) { if (outline->type != XML_ELEMENT_NODE || !node_name_is (outline, "outline")) continue; char *text = NULL, *xmlUrl = NULL, *categories = NULL, *filter = NULL; copy_node_prop_to (outline, "text", &text, false); copy_node_prop_to (outline, "xmlUrl", &xmlUrl, false); copy_node_prop_to (outline, "category", &categories, false); copy_node_prop_to (outline, "filter", &filter, false); if (xmlUrl && text) { AddFeed (xmlUrl, text, categories, filter); ++nfeeds; } if (text) free (text); if (xmlUrl) free (xmlUrl); if (categories) free (categories); if (filter) free (filter); } } } xmlFreeDoc (doc); return nfeeds; }