sfeed/sfeed.c

1069 lines
29 KiB
C

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include "util.h"
#include "xml.h"
/* true while inside the data of a content field: `iscontent` is set and
 * `iscontenttag` is not */
#define ISINCONTENT(ctx) ((ctx).iscontent && !((ctx).iscontenttag))
/* true while on the opening tag of a content field (before its data):
 * `iscontenttag` is set and `iscontent` is not */
#define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag)
/* these feed fields support multiple separated values */
#define ISFEEDFIELDMULTI(t) ((t) == FeedFieldCategory)
/* expands to TWO comma-separated items: the string and its byte length
 * without the terminating NUL. Only meaningful for string literals or char
 * arrays, because the length is computed with sizeof. */
#define STRP(s) s,sizeof(s)-1
/* Feed format of the item being parsed. FeedTypeNone is used while no
 * <item> (RSS) or <entry> (Atom) is open: see xmltagstart() and xmltagend(). */
enum FeedType {
FeedTypeNone = 0,
FeedTypeRSS = 1,
FeedTypeAtom = 2
};
/* Content-type of the content field of the current item. */
enum ContentType {
ContentTypeNone = 0,
ContentTypePlain = 1,
ContentTypeHTML = 2
};
/* text printed for each ContentType, indexed by the enum value
 * (see printfields()) */
static const char *contenttypes[] = { "", "plain", "html" };
/* String data / memory pool.
 * `data` is kept NUL-terminated by string_append() and string_clear(); the
 * buffer is reused between items and only ever reallocated by
 * string_buffer_realloc(). */
typedef struct string {
char *data; /* data */
size_t len; /* string length */
size_t bufsiz; /* allocated size */
} String;
/* NOTE: the order of these tags (content, date, author) indicates the
 * priority in which they are used, from lowest to highest: for a
 * single-value field a tag only replaces the stored value when its id is
 * greater than the id of the tag that set the field before (see
 * xmltagstartparsed()). */
enum TagId {
TagUnknown = 0,
/* RSS */
RSSTagDcdate, RSSTagPubdate, /* creation date has higher priority */
RSSTagTitle,
RSSTagMediaDescription, RSSTagDescription, RSSTagContentEncoded,
RSSTagGuid,
RSSTagGuidPermalinkFalse,
RSSTagGuidPermalinkTrue,
/* must be defined after GUID, because it can be a link (isPermaLink) */
RSSTagLink,
RSSTagEnclosure,
RSSTagAuthor, RSSTagDccreator,
RSSTagCategory,
/* Atom */
/* creation date has higher priority */
AtomTagModified, AtomTagUpdated, AtomTagIssued, AtomTagPublished,
AtomTagTitle,
AtomTagMediaDescription, AtomTagSummary, AtomTagContent,
AtomTagId,
AtomTagLink,
AtomTagLinkAlternate,
AtomTagLinkEnclosure,
AtomTagAuthor, AtomTagAuthorName,
AtomTagCategory,
/* number of tag ids; used as the size of fieldmap[] */
TagLast
};
/* A known tag name and the id it maps to. The tables below initialize
 * `name` and `len` together with the STRP() macro. */
typedef struct feedtag {
char *name; /* name of tag to match */
size_t len; /* len of `name` */
enum TagId id; /* unique ID */
} FeedTag;
/* Value of one output field of the current item. */
typedef struct field {
String str; /* collected field data */
enum TagId tagid; /* tagid set previously, used for tag priority */
} FeedField;
/* Indices into FeedContext.fields; FeedFieldLast is the number of fields. */
enum {
FeedFieldTime = 0, FeedFieldTitle, FeedFieldLink, FeedFieldContent,
FeedFieldId, FeedFieldAuthor, FeedFieldEnclosure, FeedFieldCategory,
FeedFieldLast
};
/* Parser state shared by the XML callbacks. */
typedef struct feedcontext {
String *field; /* current FeedItem field String, NULL when no field is open */
FeedField fields[FeedFieldLast]; /* data for current item */
FeedTag tag; /* unique current parsed tag */
int iscontent; /* in content data */
int iscontenttag; /* in content tag */
enum ContentType contenttype; /* content-type for item */
enum FeedType feedtype; /* FeedTypeNone while outside an item / entry */
int attrcount; /* count item HTML element attributes */
} FeedContext;
/* Forward declarations of static functions defined further down.
 * The xml* functions are the callbacks that main() assigns to the
 * XMLParser. */
static long long datetounix(long long, int, int, int, int, int);
static FeedTag * gettag(enum FeedType, const char *, size_t);
static long gettzoffset(const char *);
static int isattr(const char *, size_t, const char *, size_t);
static int istag(const char *, size_t, const char *, size_t);
static int parsetime(const char *, long long *);
static void printfields(void);
static void string_append(String *, const char *, size_t);
static void string_buffer_realloc(String *, size_t);
static void string_clear(String *);
static void string_print_encoded(String *);
static void string_print_timestamp(String *);
static void string_print_trimmed(String *);
static void string_print_trimmed_multi(String *);
static void string_print_uri(String *);
static void xmlattr(XMLParser *, const char *, size_t, const char *, size_t,
const char *, size_t);
static void xmlattrentity(XMLParser *, const char *, size_t, const char *,
size_t, const char *, size_t);
static void xmlattrend(XMLParser *, const char *, size_t, const char *,
size_t);
static void xmlattrstart(XMLParser *, const char *, size_t, const char *,
size_t);
static void xmldata(XMLParser *, const char *, size_t);
static void xmldataentity(XMLParser *, const char *, size_t);
static void xmltagend(XMLParser *, const char *, size_t, int);
static void xmltagstart(XMLParser *, const char *, size_t);
static void xmltagstartparsed(XMLParser *, const char *, size_t, int);
/* map tag name to TagId type */
/* RSS: the entries must stay in alphabetical order, because gettag() looks
 * names up with bsearch() using the case-insensitive tagcmp() */
static const FeedTag rsstags[] = {
{ STRP("author"), RSSTagAuthor },
{ STRP("category"), RSSTagCategory },
{ STRP("content:encoded"), RSSTagContentEncoded },
{ STRP("dc:creator"), RSSTagDccreator },
{ STRP("dc:date"), RSSTagDcdate },
{ STRP("description"), RSSTagDescription },
/* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> */
{ STRP("enclosure"), RSSTagEnclosure },
{ STRP("guid"), RSSTagGuid },
{ STRP("link"), RSSTagLink },
{ STRP("media:description"), RSSTagMediaDescription },
{ STRP("pubdate"), RSSTagPubdate },
{ STRP("title"), RSSTagTitle }
};
/* Atom: the entries must stay in alphabetical order, because gettag() looks
 * names up with bsearch() using the case-insensitive tagcmp() */
static const FeedTag atomtags[] = {
{ STRP("author"), AtomTagAuthor },
{ STRP("category"), AtomTagCategory },
{ STRP("content"), AtomTagContent },
{ STRP("id"), AtomTagId },
{ STRP("issued"), AtomTagIssued }, /* Atom 0.3 */
/* Atom: <link href="" />, RSS has <link></link> */
{ STRP("link"), AtomTagLink },
{ STRP("media:description"), AtomTagMediaDescription },
{ STRP("modified"), AtomTagModified }, /* Atom 0.3 */
{ STRP("published"), AtomTagPublished },
{ STRP("summary"), AtomTagSummary },
{ STRP("title"), AtomTagTitle },
{ STRP("updated"), AtomTagUpdated }
};
/* special case: nested <author><name>. xmltagstart() switches the current
 * tag to atomtagauthorname when <name> opens inside <author>, and xmltagend()
 * switches back to atomtagauthor when it closes. */
static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor };
static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName };
/* reference to no / unknown tag: empty name, id TagUnknown */
static const FeedTag notag = { STRP(""), TagUnknown };
/* map TagId type to RSS/Atom field, all tags must be defined.
 * A value of -1 means the tag has no field of its own: data of such a tag is
 * not stored (see the fieldmap[tagid] == -1 check in xmltagstartparsed()). */
static const int fieldmap[TagLast] = {
[TagUnknown] = -1,
/* RSS */
[RSSTagDcdate] = FeedFieldTime,
[RSSTagPubdate] = FeedFieldTime,
[RSSTagTitle] = FeedFieldTitle,
[RSSTagMediaDescription] = FeedFieldContent,
[RSSTagDescription] = FeedFieldContent,
[RSSTagContentEncoded] = FeedFieldContent,
/* resolved to one of the two Permalink ids in xmltagstartparsed() */
[RSSTagGuid] = -1,
[RSSTagGuidPermalinkFalse] = FeedFieldId,
[RSSTagGuidPermalinkTrue] = FeedFieldId, /* special-case: both a link and an id */
[RSSTagLink] = FeedFieldLink,
[RSSTagEnclosure] = FeedFieldEnclosure,
[RSSTagAuthor] = FeedFieldAuthor,
[RSSTagDccreator] = FeedFieldAuthor,
[RSSTagCategory] = FeedFieldCategory,
/* Atom */
[AtomTagModified] = FeedFieldTime,
[AtomTagUpdated] = FeedFieldTime,
[AtomTagIssued] = FeedFieldTime,
[AtomTagPublished] = FeedFieldTime,
[AtomTagTitle] = FeedFieldTitle,
[AtomTagMediaDescription] = FeedFieldContent,
[AtomTagSummary] = FeedFieldContent,
[AtomTagContent] = FeedFieldContent,
[AtomTagId] = FeedFieldId,
/* resolved by its rel attribute in xmltagstartparsed(); stays -1 for other
 * rel values */
[AtomTagLink] = -1,
[AtomTagLinkAlternate] = FeedFieldLink,
[AtomTagLinkEnclosure] = FeedFieldEnclosure,
/* only the nested <name> carries the author value */
[AtomTagAuthor] = -1,
[AtomTagAuthorName] = FeedFieldAuthor,
[AtomTagCategory] = FeedFieldCategory
};
/* separator printed between the fields of one output line */
static const int FieldSeparator = '\t';
/* separator for multiple values in a field, separator should be 1 byte */
static const char FieldMultiSeparator[] = "|";
/* parsed form of `baseurl`, filled in by main() from argv[1] */
static struct uri baseuri;
/* base URL used to make relative links absolute; NULL when not given */
static const char *baseurl;
static FeedContext ctx;
static XMLParser parser; /* XML parser state */
/* attribute values of the current tag (isPermaLink, rel, type) and a
 * temporary string for values that need more context before being stored */
static String attrispermalink, attrrel, attrtype, tmpstr;
static int
tagcmp(const void *v1, const void *v2)
{
return strcasecmp(((FeedTag *)v1)->name, ((FeedTag *)v2)->name);
}
/* Look up the tag called `name` in the tag table that belongs to `feedtype`.
 * Returns the matching table entry, or NULL when the name is not in the table
 * or the feed type has no table. `namelen` is not used: the lookup compares
 * NUL-terminated names. */
static FeedTag *
gettag(enum FeedType feedtype, const char *name, size_t namelen)
{
	const FeedTag *table;
	FeedTag key;
	size_t count;

	if (feedtype == FeedTypeRSS) {
		table = rsstags;
		count = sizeof(rsstags) / sizeof(rsstags[0]);
	} else if (feedtype == FeedTypeAtom) {
		table = atomtags;
		count = sizeof(atomtags) / sizeof(atomtags[0]);
	} else {
		return NULL;
	}

	/* tagcmp() only reads the name member of the search key */
	key.name = (char *)name;

	return bsearch(&key, table, count, sizeof(*table), tagcmp);
}
/* Return a pointer to the first character of `s` that is not whitespace
 * according to ISSPACE(). */
static char *
ltrim(const char *s)
{
	while (ISSPACE((unsigned char)*s))
		s++;

	return (char *)s;
}
/* Return a pointer just past the last non-whitespace character of `s`: the
 * position where the string would end if trailing whitespace were removed.
 * The string itself is not modified. */
static char *
rtrim(const char *s)
{
	const char *end = s + strlen(s);

	while (end > s && ISSPACE((unsigned char)end[-1]))
		end--;

	return (char *)end;
}
/* Make the string empty but keep its buffer, so the allocation can be reused
 * for the next value. */
static void
string_clear(String *s)
{
	s->len = 0;
	if (s->data != NULL)
		*s->data = '\0';
}
/* Resize the buffer of `s` so it can hold `newlen` bytes. The allocated size
 * is 64 doubled until it is larger than `newlen`, or SIZE_MAX when `newlen`
 * is above SIZE_MAX / 2. Calls err() when realloc() fails. */
static void
string_buffer_realloc(String *s, size_t newlen)
{
	size_t size = SIZE_MAX;

	if (newlen <= SIZE_MAX / 2) {
		size = 64;
		while (size <= newlen)
			size *= 2;
	}

	s->data = realloc(s->data, size);
	if (s->data == NULL)
		err(1, "realloc");
	s->bufsiz = size;
}
/* Append `len` bytes of `data` to `s` and keep it NUL-terminated.
 * `data` must not overlap with s->data. Appending zero bytes does nothing.
 * Calls err() with EOVERFLOW when the new length would not fit in a size_t. */
static void
string_append(String *s, const char *data, size_t len)
{
	size_t newlen;

	if (len == 0)
		return;

	/* s->len + len + 1 (for the NUL byte) must not wrap around */
	if (s->len >= SIZE_MAX - len) {
		errno = EOVERFLOW;
		err(1, "realloc");
	}
	newlen = s->len + len;

	/* grow only when needed: the buffer is never shrunk */
	if (newlen >= s->bufsiz)
		string_buffer_realloc(s, newlen + 1);

	memcpy(s->data + s->len, data, len);
	s->data[newlen] = '\0';
	s->len = newlen;
}
/* Print the string without leading and trailing whitespace. Newline, TAB and
 * backslash are written as the two-character escapes \n, \t and \\; other
 * control characters are dropped. Prints nothing for an empty string. */
static void
string_print_encoded(String *s)
{
	const char *cur, *end;

	if (s->data == NULL || s->len == 0)
		return;

	cur = ltrim(s->data);
	end = rtrim(cur);

	while (*cur != '\0' && cur != end) {
		if (*cur == '\n' || *cur == '\t' || *cur == '\\') {
			putchar('\\');
			putchar(*cur == '\n' ? 'n' : *cur == '\t' ? 't' : '\\');
		} else if (!ISCNTRL((unsigned char)*cur)) {
			/* ignore control chars */
			putchar(*cur);
		}
		cur++;
	}
}
/* Print `s` without leading and trailing whitespace. Every remaining
 * whitespace character is printed as a single space and other control
 * characters are skipped. */
static void
printtrimmed(const char *s)
{
	const char *cur = ltrim(s);
	const char *end = rtrim(cur);

	for (; *cur != '\0' && cur != end; cur++) {
		if (ISSPACE((unsigned char)*cur)) {
			putchar(' '); /* any whitespace to space */
			continue;
		}
		if (ISCNTRL((unsigned char)*cur))
			continue; /* ignore other control chars */
		putchar(*cur);
	}
}
/* Print the string with printtrimmed(): whitespace such as TABs and carriage
 * returns becomes ' ', other control chars are removed and leading and
 * trailing whitespace is stripped. Prints nothing for an empty string. */
static void
string_print_trimmed(String *s)
{
	if (s->data != NULL && s->len != 0)
		printtrimmed(s->data);
}
/* Print a multi-value field: each value is printed with printtrimmed() and
 * the values are separated by FieldMultiSeparator ('|'). The string is
 * temporarily modified while printing and restored afterwards. */
static void
string_print_trimmed_multi(String *s)
{
	char *start, *sep;
	char saved;

	if (s->data == NULL || s->len == 0)
		return;

	start = s->data;
	while ((sep = strstr(start, FieldMultiSeparator)) != NULL) {
		/* terminate the value at the separator for printing only */
		saved = *sep;
		*sep = '\0';
		printtrimmed(start);
		*sep = saved; /* put the original character back */
		fputs(FieldMultiSeparator, stdout);
		start = sep + 1; /* the separator is one byte */
	}
	/* last (or only) value */
	printtrimmed(start);
}
/* Print URL, if it's a relative URL then it uses the global `baseurl`. */
static void
printuri(char *s)
{
char link[4096], *p, *e;
struct uri newuri, olduri;
int c, r = -1;
p = ltrim(s);
e = rtrim(p);
c = *e;
*e = '\0';
if (baseurl && !uri_hasscheme(p) &&
uri_parse(p, &olduri) != -1 && !olduri.proto[0] &&
uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0])
r = uri_format(link, sizeof(link), &newuri);
if (r >= 0 && (size_t)r < sizeof(link))
printtrimmed(link);
else
printtrimmed(p);
*e = c; /* restore NUL byte to original character */
}
/* Print the string as a URL with printuri(), which resolves relative URLs
 * against the global `baseurl`. Prints nothing for an empty string. */
static void
string_print_uri(String *s)
{
	if (s->data != NULL && s->len != 0)
		printuri(s->data);
}
/* Parse the string as a time and print it as a UNIX timestamp. Prints nothing
 * when the string is empty or parsetime() rejects it. */
static void
string_print_timestamp(String *s)
{
	long long t;

	if (s->data == NULL || s->len == 0)
		return;
	if (parsetime(s->data, &t) == -1)
		return;

	printf("%lld", t);
}
/* Convert time fields to a UNIX timestamp.
 *
 * year: years since 1900 (parsetime() passes the calendar year minus 1900)
 * mon:  month, 0 (January) to 11 (December); used unchecked as an index into
 *       the month table, so the caller must validate it
 * day:  day of the month, starting at 1
 * hour, min, sec: time of day
 *
 * No range checking is done here. All arithmetic on second counts uses long or
 * long long, so it does not depend on `int` being wider than 16 bits. */
static long long
datetounix(long long year, int mon, int day, int hour, int min, int sec)
{
	/* seconds from the start of a non-leap year to the start of each month */
	static const long secs_through_month[] = {
		0, 31 * 86400L, 59 * 86400L, 90 * 86400L,
		120 * 86400L, 151 * 86400L, 181 * 86400L, 212 * 86400L,
		243 * 86400L, 273 * 86400L, 304 * 86400L, 334 * 86400L };
	int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
	long long t;

	if (year - 2ULL <= 136) {
		/* years 1902 up to and including 2038: every 4th year is a
		 * leap year in this range */
		leaps = (year - 68) >> 2;
		if (!((year - 68) & 3)) {
			/* the leap day of `year` itself is added per month below */
			leaps--;
			is_leap = 1;
		}
		t = 31536000LL * (year - 70) + 86400LL * leaps;
	} else {
		/* general case: count 400-year cycles, centuries and 4-year
		 * groups relative to the year 2000 */
		cycles = (year - 100) / 400;
		rem = (year - 100) % 400;
		if (rem < 0) {
			cycles--;
			rem += 400;
		}
		if (!rem) {
			is_leap = 1;
		} else {
			if (rem >= 300)
				centuries = 3, rem -= 300;
			else if (rem >= 200)
				centuries = 2, rem -= 200;
			else if (rem >= 100)
				centuries = 1, rem -= 100;
			if (rem) {
				leaps = rem / 4U;
				rem %= 4U;
				is_leap = !rem;
			}
		}
		leaps += 97 * cycles + 24 * centuries - is_leap;
		/* 946684800 is 2000-01-01 00:00:00 UTC; one extra day is added */
		t = (year - 100) * 31536000LL + leaps * 86400LL +
		    946684800LL + 86400LL;
	}

	t += secs_through_month[mon];
	if (is_leap && mon >= 2)
		t += 86400L; /* leap day for months after February */
	t += 86400LL * (day - 1);
	t += 3600LL * hour;
	t += 60LL * min;
	t += sec;

	return t;
}
/* Get timezone from string, return time offset in seconds from UTC.
 * Leading whitespace is skipped. Accepted forms are a numeric offset
 * ('+' or '-', up to 2 hour digits, an optional ':' and up to 2 minute
 * digits) or one of the 3-letter zone names in the table below. Anything
 * else gives 0.
 * NOTE: only parses timezones in RFC-822, many other timezone names are
 * ambiguous anyway.
 * ANSI and military zones are defined wrong in RFC822 and are unsupported,
 * see note on RFC2822 4.3 page 32. */
static long
gettzoffset(const char *s)
{
	static const struct {
		const char *name;
		int offset; /* seconds relative to UTC */
	} zones[] = {
		{ "CDT", -5 * 3600 },
		{ "CST", -6 * 3600 },
		{ "EDT", -4 * 3600 },
		{ "EST", -5 * 3600 },
		{ "MDT", -6 * 3600 },
		{ "MST", -7 * 3600 },
		{ "PDT", -7 * 3600 },
		{ "PST", -8 * 3600 },
	};
	const char *p;
	long hours = 0, mins = 0;
	size_t n, z;

	while (ISSPACE((unsigned char)*s))
		s++;

	if (*s == '-' || *s == '+') {
		/* numeric offset */
		p = s + 1;
		for (n = 0; n < 2 && ISDIGIT((unsigned char)*p); n++, p++)
			hours = (hours * 10) + (*p - '0');
		if (*p == ':')
			p++;
		for (n = 0; n < 2 && ISDIGIT((unsigned char)*p); n++, p++)
			mins = (mins * 10) + (*p - '0');

		return ((hours * 3600) + (mins * 60)) * (*s == '-' ? -1 : 1);
	}

	/* timezone name: must be exactly 3 letters */
	for (n = 0; ISALPHA((unsigned char)s[n]); n++)
		;
	if (n != 3)
		return 0;

	/* compare timezone and adjust offset relative to UTC */
	for (z = 0; z < sizeof(zones) / sizeof(zones[0]); z++) {
		if (memcmp(s, zones[z].name, 3) == 0)
			return zones[z].offset;
	}

	return 0;
}
/* Parse time string `s` into the UNIX timestamp `tp`.
 * Two layouts are recognized: one that starts with a 4-digit year
 * ("%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S") and one of the
 * form "[%a, ]%d %b %Y %H:%M:%S". Whatever follows the time is passed to
 * gettzoffset() and the resulting offset is subtracted.
 * Returns 0 on success or -1 on failure; `*tp` is only written on success. */
static int
parsetime(const char *s, long long *tp)
{
/* full month names; a 3-letter abbreviation is matched against the first
 * 3 characters of these */
static const struct {
char *name;
int len;
} mons[] = {
{ STRP("January"), },
{ STRP("February"), },
{ STRP("March"), },
{ STRP("April"), },
{ STRP("May"), },
{ STRP("June"), },
{ STRP("July"), },
{ STRP("August"), },
{ STRP("September"), },
{ STRP("October"), },
{ STRP("November"), },
{ STRP("December"), },
};
/* va: parsed values in the order year, month (1-12), day, hour, minute,
 * second. vi: index of the first va element the generic loop below parses. */
int va[6] = { 0 }, i, j, v, vi;
size_t m;
for (; ISSPACE((unsigned char)*s); s++)
;
/* a time must start with a digit or a letter */
if (!ISDIGIT((unsigned char)*s) && !ISALPHA((unsigned char)*s))
return -1;
if (ISDIGIT((unsigned char)s[0]) &&
ISDIGIT((unsigned char)s[1]) &&
ISDIGIT((unsigned char)s[2]) &&
ISDIGIT((unsigned char)s[3])) {
/* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */
vi = 0;
} else {
/* format: "[%a, ]%d %b %Y %H:%M:%S" */
/* parse "[%a, ]%d %b %Y " part, then use time parsing as above */
/* skip the optional weekday name and the comma after it */
for (; ISALPHA((unsigned char)*s); s++)
;
for (; ISSPACE((unsigned char)*s); s++)
;
if (*s == ',')
s++;
for (; ISSPACE((unsigned char)*s); s++)
;
/* day of the month: at most 2 digits */
for (v = 0, i = 0; i < 2 && ISDIGIT((unsigned char)*s); s++, i++)
v = (v * 10) + (*s - '0');
va[2] = v; /* day */
for (; ISSPACE((unsigned char)*s); s++)
;
/* end of word month */
for (j = 0; ISALPHA((unsigned char)s[j]); j++)
;
/* check month name */
if (j < 3 || j > 9)
return -1; /* month cannot match */
for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) {
/* abbreviation (3 length) or long name */
if ((j == 3 || j == mons[m].len) &&
!strncasecmp(mons[m].name, s, j)) {
va[1] = m + 1;
s += j;
break;
}
}
if (m >= 12)
return -1; /* no month found */
for (; ISSPACE((unsigned char)*s); s++)
;
/* year: at most 4 digits, `i` is the number of digits read */
for (v = 0, i = 0; i < 4 && ISDIGIT((unsigned char)*s); s++, i++)
v = (v * 10) + (*s - '0');
/* obsolete short year: RFC2822 4.3 */
/* NOTE(review): the condition is also true when 0 or 1 digits were read,
 * so a missing year becomes 2000 - confirm this is intended */
if (i <= 3)
v += (v >= 0 && v <= 49) ? 2000 : 1900;
va[0] = v; /* year */
for (; ISSPACE((unsigned char)*s); s++)
;
/* parse only regular time part, see below */
vi = 3;
}
/* parse time parts (and possibly remaining date parts) */
/* the year (vi == 0) takes up to 4 digits, every other part up to 2; one
 * separator is skipped after each part: '-' inside the date, 'T' or
 * whitespace between date and time, ':' inside the time */
for (; *s && vi < 6; vi++) {
for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) &&
ISDIGIT((unsigned char)*s); s++, i++) {
v = (v * 10) + (*s - '0');
}
va[vi] = v;
if ((vi < 2 && *s == '-') ||
(vi == 2 && (*s == 'T' || ISSPACE((unsigned char)*s))) ||
(vi > 2 && *s == ':'))
s++;
}
/* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */
if (*s == '.') {
for (s++; ISDIGIT((unsigned char)*s); s++)
;
}
/* invalid range */
/* the day is only checked against 1-31, not against the month length */
if (va[0] < 0 || va[0] > 9999 ||
va[1] < 1 || va[1] > 12 ||
va[2] < 1 || va[2] > 31 ||
va[3] < 0 || va[3] > 23 ||
va[4] < 0 || va[4] > 59 ||
va[5] < 0 || va[5] > 60) /* allow leap second */
return -1;
/* datetounix() takes the year relative to 1900 and a 0-based month; `s` now
 * points at the remainder of the string, where the timezone is expected */
*tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) -
gettzoffset(s);
return 0;
}
static void
printfields(void)
{
string_print_timestamp(&ctx.fields[FeedFieldTime].str);
putchar(FieldSeparator);
string_print_trimmed(&ctx.fields[FeedFieldTitle].str);
putchar(FieldSeparator);
string_print_uri(&ctx.fields[FeedFieldLink].str);
putchar(FieldSeparator);
string_print_encoded(&ctx.fields[FeedFieldContent].str);
putchar(FieldSeparator);
fputs(contenttypes[ctx.contenttype], stdout);
putchar(FieldSeparator);
string_print_trimmed(&ctx.fields[FeedFieldId].str);
putchar(FieldSeparator);
string_print_trimmed(&ctx.fields[FeedFieldAuthor].str);
putchar(FieldSeparator);
string_print_uri(&ctx.fields[FeedFieldEnclosure].str);
putchar(FieldSeparator);
string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str);
putchar('\n');
if (ferror(stdout)) /* check for errors but do not flush */
checkfileerror(stdout, "<stdout>", 'w');
}
/* Return 1 if the tag names are equal, ignoring case, otherwise 0.
 * The lengths are compared first; both names must be NUL-terminated. */
static int
istag(const char *name, size_t len, const char *name2, size_t len2)
{
	if (len != len2)
		return 0;

	return strcasecmp(name, name2) == 0;
}
/* Return 1 if the attribute names (or values) are equal, ignoring case,
 * otherwise 0. The lengths are compared first; both strings must be
 * NUL-terminated. */
static int
isattr(const char *name, size_t len, const char *name2, size_t len2)
{
	if (len != len2)
		return 0;

	return strcasecmp(name, name2) == 0;
}
/* Parser callback for (a part of) an attribute value.
 * Inside HTML content the value is copied to the content data. Otherwise the
 * value is collected for the attributes that matter for the current tag:
 * "type" on content tags, "url" on RSS <enclosure>, "ispermalink" on RSS
 * <guid>, "rel" and "href" on Atom <link> and "term" on Atom <category>. */
static void
xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
	const char *v, size_t vl)
{
	String *dst = NULL;

	/* handles transforming inline XML to data */
	if (ISINCONTENT(ctx)) {
		if (ctx.contenttype == ContentTypeHTML)
			xmldata(p, v, vl);
		return;
	}

	if (ctx.tag.id == TagUnknown)
		return;

	/* content-type may be: Atom: text, xhtml, html or mime-type.
	   MRSS (media:description): plain, html. */
	if (ISCONTENTTAG(ctx)) {
		if (isattr(n, nl, STRP("type")))
			string_append(&attrtype, v, vl);
		return;
	}

	/* pick the string that collects this attribute, if any */
	switch (ctx.feedtype) {
	case FeedTypeRSS:
		if (ctx.tag.id == RSSTagEnclosure) {
			if (isattr(n, nl, STRP("url")))
				dst = &tmpstr;
		} else if (ctx.tag.id == RSSTagGuid) {
			if (isattr(n, nl, STRP("ispermalink")))
				dst = &attrispermalink;
		}
		break;
	case FeedTypeAtom:
		if (ctx.tag.id == AtomTagLink) {
			if (isattr(n, nl, STRP("rel")))
				dst = &attrrel;
			else if (isattr(n, nl, STRP("href")))
				dst = &tmpstr;
		} else if (ctx.tag.id == AtomTagCategory) {
			if (isattr(n, nl, STRP("term")))
				dst = &tmpstr;
		}
		break;
	default:
		break;
	}

	if (dst != NULL)
		string_append(dst, v, vl);
}
/* Parser callback for an entity inside an attribute value.
 * Inside HTML content the entity text is copied to the content data as it is.
 * Otherwise the entity is translated with xml_entitytostr() when possible and
 * the result (or the untranslated text) is handed to xmlattr(). */
static void
xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
	const char *data, size_t datalen)
{
	char buf[16];
	int buflen;

	/* handles transforming inline XML to data */
	if (ISINCONTENT(ctx)) {
		if (ctx.contenttype == ContentTypeHTML)
			xmldata(p, data, datalen);
		return;
	}

	if (ctx.tag.id == TagUnknown)
		return;

	/* use the translated entity when there is one */
	buflen = xml_entitytostr(data, buf, sizeof(buf));
	if (buflen > 0) {
		data = buf;
		datalen = (size_t)buflen;
	}
	xmlattr(p, t, tl, n, nl, data, datalen);
}
/* Parser callback for the end of an attribute. Inside HTML content it writes
 * the closing double quote of the attribute to the content data and resets
 * the attribute counter; otherwise it does nothing. */
static void
xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
{
	if (!ISINCONTENT(ctx) || ctx.contenttype != ContentTypeHTML)
		return;

	/* handles transforming inline XML to data */
	xmldata(p, "\"", 1);
	ctx.attrcount = 0;
}
/* Parser callback for the start of an attribute.
 * Inside HTML content it writes the attribute name followed by =" to the
 * content data, preceded by a space when the attribute counter is zero.
 * Outside content it discards a value collected earlier for the same
 * attribute, so the last occurrence of a repeated attribute is used. */
static void
xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
{
	String *stale = NULL;

	if (ISINCONTENT(ctx)) {
		if (ctx.contenttype != ContentTypeHTML)
			return;
		/* handles transforming inline XML to data */
		if (ctx.attrcount == 0)
			xmldata(p, " ", 1);
		ctx.attrcount++;
		xmldata(p, n, nl);
		xmldata(p, "=\"", 2);
		return;
	}

	/* find the string that holds the value of this attribute, if any */
	if (isattr(n, nl, STRP("ispermalink")))
		stale = &attrispermalink;
	else if (isattr(n, nl, STRP("rel")))
		stale = &attrrel;
	else if (isattr(n, nl, STRP("type")))
		stale = &attrtype;
	else if (isattr(n, nl, STRP("href")) ||
	         isattr(n, nl, STRP("term")) ||
	         isattr(n, nl, STRP("url")))
		stale = &tmpstr;

	/* use the last value for multiple attribute values */
	if (stale != NULL && stale->len)
		string_clear(stale);
}
/* Parser callback for character data (also used for CDATA). Appends the data
 * to the currently open field; for a multi-value field the data goes to the
 * temporary string instead. Ignored while no field is open. */
static void
xmldata(XMLParser *p, const char *s, size_t len)
{
	String *dst;

	if (ctx.field == NULL)
		return;

	dst = ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]) ? &tmpstr : ctx.field;
	string_append(dst, s, len);
}
/* Parser callback for an entity in character data. The entity is translated
 * with xml_entitytostr() when possible; the result, or the untranslated text
 * otherwise, is handed to xmldata(). Ignored while no field is open. */
static void
xmldataentity(XMLParser *p, const char *data, size_t datalen)
{
	char buf[16];
	int buflen;

	if (ctx.field == NULL)
		return;

	/* use the translated entity when there is one */
	buflen = xml_entitytostr(data, buf, sizeof(buf));
	if (buflen > 0) {
		data = buf;
		datalen = (size_t)buflen;
	}
	xmldata(p, data, datalen);
}
/* Parser callback for the start of a tag.
 * Inside HTML content the tag is copied to the content data as "<name".
 * Outside an item it only detects the start of an Atom <entry> or RSS <item>.
 * Inside an item it sets the current tag (ctx.tag), marks content tags and
 * clears the attribute strings of the previous tag. */
static void
xmltagstart(XMLParser *p, const char *t, size_t tl)
{
	const FeedTag *found;

	if (ISINCONTENT(ctx)) {
		if (ctx.contenttype != ContentTypeHTML)
			return;
		ctx.attrcount = 0;
		xmldata(p, "<", 1);
		xmldata(p, t, tl);
		return;
	}

	/* start of RSS or Atom item / entry */
	if (ctx.feedtype == FeedTypeNone) {
		if (istag(t, tl, STRP("entry")))
			ctx.feedtype = FeedTypeAtom;
		else if (istag(t, tl, STRP("item")))
			ctx.feedtype = FeedTypeRSS;
		return;
	}

	if (ctx.tag.id != TagUnknown) {
		/* a field tag is already open: the only nested tag that is
		   allowed is <author><name> for Atom */
		if (ctx.tag.id != AtomTagAuthor || !istag(t, tl, STRP("name")))
			return;
		ctx.tag = atomtagauthorname;
	} else {
		/* in item: look up the tag, unknown names map to `notag` */
		found = gettag(ctx.feedtype, t, tl);
		ctx.tag = found ? *found : notag;
	}

	ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent);
	string_clear(&attrispermalink);
	string_clear(&attrrel);
	string_clear(&attrtype);
}
/* Parser callback assigned to parser.xmltagstartparsed; presumably invoked
 * once a start tag and its attributes have been read - confirm in xml.h.
 * Inside HTML content it writes the end of the start tag ("/>" when `isshort`
 * is set, else ">") to the content data. Otherwise it resolves tag ids that
 * depend on attributes (RSS guid, Atom link), applies the tag priority and
 * selects the field (ctx.field) that receives the data of this tag. */
static void
xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
{
enum TagId tagid;
if (ISINCONTENT(ctx)) {
if (ctx.contenttype == ContentTypeHTML) {
if (isshort)
xmldata(p, "/>", 2);
else
xmldata(p, ">", 1);
}
return;
}
/* set tag type based on its attribute value */
if (ctx.tag.id == RSSTagGuid) {
/* if empty the default is "true" */
if (!attrispermalink.len ||
isattr(attrispermalink.data, attrispermalink.len, STRP("true")))
ctx.tag.id = RSSTagGuidPermalinkTrue;
else
ctx.tag.id = RSSTagGuidPermalinkFalse;
} else if (ctx.tag.id == AtomTagLink) {
/* empty or "alternate": other types could be
"enclosure", "related", "self" or "via" */
if (!attrrel.len || isattr(attrrel.data, attrrel.len, STRP("alternate")))
ctx.tag.id = AtomTagLinkAlternate;
else if (isattr(attrrel.data, attrrel.len, STRP("enclosure")))
ctx.tag.id = AtomTagLinkEnclosure;
else
ctx.tag.id = AtomTagLink; /* unknown */
}
tagid = ctx.tag.id;
/* map tag type to field: unknown or lesser priority is ignored,
when tags of the same type are repeated only the first is used. */
/* multi-value fields skip the priority check: every occurrence is kept */
if (fieldmap[tagid] == -1 ||
(!ISFEEDFIELDMULTI(fieldmap[tagid]) &&
tagid <= ctx.fields[fieldmap[tagid]].tagid)) {
return;
}
/* switch from "on the content tag" to "inside the content data" */
if (ctx.iscontenttag) {
ctx.iscontent = 1;
ctx.iscontenttag = 0;
/* detect content-type based on type attribute */
if (attrtype.len) {
if (isattr(attrtype.data, attrtype.len, STRP("html")) ||
isattr(attrtype.data, attrtype.len, STRP("xhtml")) ||
isattr(attrtype.data, attrtype.len, STRP("text/html")) ||
isattr(attrtype.data, attrtype.len, STRP("text/xhtml")) ||
isattr(attrtype.data, attrtype.len, STRP("application/xhtml+xml")))
ctx.contenttype = ContentTypeHTML;
else /* unknown: handle as base64 text data */
ctx.contenttype = ContentTypePlain;
} else {
/* default content-type */
if (tagid == RSSTagContentEncoded || tagid == RSSTagDescription)
ctx.contenttype = ContentTypeHTML;
else
ctx.contenttype = ContentTypePlain;
}
}
/* from here on xmldata() stores data in this field; remember which tag set
it so later tags can be compared against its priority */
ctx.field = &(ctx.fields[fieldmap[tagid]].str);
ctx.fields[fieldmap[tagid]].tagid = tagid;
/* clear field if it is overwritten (with a priority order) for the new
value, if the field can have multiple values then do not clear it. */
if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
string_clear(ctx.field);
}
/* Parser callback for the end of a tag.
 * Inside content, end tags other than the one of the content field itself are
 * copied to the content data (HTML only). The end tag of the current field
 * closes that field, and the end of an <entry> / <item> prints the collected
 * fields and resets the state for the next item. Any other end tag is
 * ignored. */
static void
xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
{
size_t i;
if (ctx.feedtype == FeedTypeNone)
return;
if (ISINCONTENT(ctx)) {
/* not a closed content field */
if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) {
/* for a short tag nothing is written here: xmltagstartparsed() already
wrote "/>" */
if (!isshort && ctx.contenttype == ContentTypeHTML) {
xmldata(p, "</", 2);
xmldata(p, t, tl);
xmldata(p, ">", 1);
}
return;
}
} else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) {
/* matched tag end: close it */
/* copy also to the link field if the attribute isPermaLink="true"
and it is not set by a tag with higher priority. */
if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field &&
ctx.tag.id > ctx.fields[FeedFieldLink].tagid) {
string_clear(&ctx.fields[FeedFieldLink].str);
string_append(&ctx.fields[FeedFieldLink].str,
ctx.field->data, ctx.field->len);
ctx.fields[FeedFieldLink].tagid = ctx.tag.id;
}
} else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom &&
istag(t, tl, STRP("entry"))) || /* Atom */
(ctx.feedtype == FeedTypeRSS &&
istag(t, tl, STRP("item"))))) /* RSS */
{
/* end of RSS or Atom entry / item */
printfields();
/* clear strings */
for (i = 0; i < FeedFieldLast; i++) {
string_clear(&ctx.fields[i].str);
ctx.fields[i].tagid = TagUnknown;
}
ctx.contenttype = ContentTypeNone;
/* allow parsing of Atom and RSS concatenated in one XML stream. */
ctx.feedtype = FeedTypeNone;
} else {
return; /* not end of field */
}
/* temporary string: for fields that cannot be processed
directly and need more context, for example by its tag
attributes, like the Atom link rel="alternate|enclosure". */
if (tmpstr.len && ctx.field) {
if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) {
/* add the value after the ones already stored, separated by
FieldMultiSeparator */
if (ctx.field->len)
string_append(ctx.field, FieldMultiSeparator, 1);
string_append(ctx.field, tmpstr.data, tmpstr.len);
} else {
/* single value: the temporary string replaces the field */
string_clear(ctx.field);
string_append(ctx.field, tmpstr.data, tmpstr.len);
}
}
/* close field */
string_clear(&tmpstr); /* reuse and clear temporary string */
if (ctx.tag.id == AtomTagAuthorName)
memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* outer tag */
else
memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
ctx.iscontent = 0;
ctx.field = NULL;
}
/* Entry point. An optional first argument is the base URL used to make
 * relative links absolute; it must parse as a URI with a protocol. The XML
 * callbacks are registered and xml_parse() is run, after which stdin and
 * stdout are checked for errors. */
int
main(int argc, char *argv[])
{
	if (pledge("stdio", NULL) == -1)
		err(1, "pledge");

	if (argc > 1) {
		if (uri_parse(argv[1], &baseuri) == -1 || !baseuri.proto[0])
			errx(1, "baseurl incorrect or too long");
		else
			baseurl = argv[1];
	}

	/* start without a current tag */
	ctx.tag = notag;

	/* tag callbacks */
	parser.xmltagstart = xmltagstart;
	parser.xmltagstartparsed = xmltagstartparsed;
	parser.xmltagend = xmltagend;
	/* attribute callbacks */
	parser.xmlattrstart = xmlattrstart;
	parser.xmlattr = xmlattr;
	parser.xmlattrentity = xmlattrentity;
	parser.xmlattrend = xmlattrend;
	/* data callbacks: CDATA is handled like regular data */
	parser.xmldata = xmldata;
	parser.xmlcdata = xmldata;
	parser.xmldataentity = xmldataentity;

	/* NOTE: getnext is defined in xml.h for inline optimization */
	xml_parse(&parser);

	checkfileerror(stdin, "<stdin>", 'r');
	checkfileerror(stdout, "<stdout>", 'w');

	return 0;
}