1069 lines
29 KiB
C
1069 lines
29 KiB
C
#include <errno.h>
|
|
#include <stdint.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <strings.h>
|
|
|
|
#include "util.h"
|
|
#include "xml.h"
|
|
|
|
#define ISINCONTENT(ctx) ((ctx).iscontent && !((ctx).iscontenttag))
|
|
#define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag)
|
|
|
|
/* these feed fields support multiple separated values */
|
|
#define ISFEEDFIELDMULTI(t) ((t) == FeedFieldCategory)
|
|
|
|
/* string and byte-length */
|
|
#define STRP(s) s,sizeof(s)-1
|
|
|
|
enum FeedType {
|
|
FeedTypeNone = 0,
|
|
FeedTypeRSS = 1,
|
|
FeedTypeAtom = 2
|
|
};
|
|
|
|
enum ContentType {
|
|
ContentTypeNone = 0,
|
|
ContentTypePlain = 1,
|
|
ContentTypeHTML = 2
|
|
};
|
|
static const char *contenttypes[] = { "", "plain", "html" };
|
|
|
|
/* String data / memory pool */
|
|
typedef struct string {
|
|
char *data; /* data */
|
|
size_t len; /* string length */
|
|
size_t bufsiz; /* allocated size */
|
|
} String;
|
|
|
|
/* NOTE: the order of these fields (content, date, author) indicate the
|
|
* priority to use them, from least important to high. */
|
|
enum TagId {
|
|
TagUnknown = 0,
|
|
/* RSS */
|
|
RSSTagDcdate, RSSTagPubdate, /* creation date has higher priority */
|
|
RSSTagTitle,
|
|
RSSTagMediaDescription, RSSTagDescription, RSSTagContentEncoded,
|
|
RSSTagGuid,
|
|
RSSTagGuidPermalinkFalse,
|
|
RSSTagGuidPermalinkTrue,
|
|
/* must be defined after GUID, because it can be a link (isPermaLink) */
|
|
RSSTagLink,
|
|
RSSTagEnclosure,
|
|
RSSTagAuthor, RSSTagDccreator,
|
|
RSSTagCategory,
|
|
/* Atom */
|
|
/* creation date has higher priority */
|
|
AtomTagModified, AtomTagUpdated, AtomTagIssued, AtomTagPublished,
|
|
AtomTagTitle,
|
|
AtomTagMediaDescription, AtomTagSummary, AtomTagContent,
|
|
AtomTagId,
|
|
AtomTagLink,
|
|
AtomTagLinkAlternate,
|
|
AtomTagLinkEnclosure,
|
|
AtomTagAuthor, AtomTagAuthorName,
|
|
AtomTagCategory,
|
|
TagLast
|
|
};
|
|
|
|
typedef struct feedtag {
|
|
char *name; /* name of tag to match */
|
|
size_t len; /* len of `name` */
|
|
enum TagId id; /* unique ID */
|
|
} FeedTag;
|
|
|
|
typedef struct field {
|
|
String str;
|
|
enum TagId tagid; /* tagid set previously, used for tag priority */
|
|
} FeedField;
|
|
|
|
enum {
|
|
FeedFieldTime = 0, FeedFieldTitle, FeedFieldLink, FeedFieldContent,
|
|
FeedFieldId, FeedFieldAuthor, FeedFieldEnclosure, FeedFieldCategory,
|
|
FeedFieldLast
|
|
};
|
|
|
|
typedef struct feedcontext {
|
|
String *field; /* current FeedItem field String */
|
|
FeedField fields[FeedFieldLast]; /* data for current item */
|
|
FeedTag tag; /* unique current parsed tag */
|
|
int iscontent; /* in content data */
|
|
int iscontenttag; /* in content tag */
|
|
enum ContentType contenttype; /* content-type for item */
|
|
enum FeedType feedtype;
|
|
int attrcount; /* count item HTML element attributes */
|
|
} FeedContext;
|
|
|
|
static long long datetounix(long long, int, int, int, int, int);
|
|
static FeedTag * gettag(enum FeedType, const char *, size_t);
|
|
static long gettzoffset(const char *);
|
|
static int isattr(const char *, size_t, const char *, size_t);
|
|
static int istag(const char *, size_t, const char *, size_t);
|
|
static int parsetime(const char *, long long *);
|
|
static void printfields(void);
|
|
static void string_append(String *, const char *, size_t);
|
|
static void string_buffer_realloc(String *, size_t);
|
|
static void string_clear(String *);
|
|
static void string_print_encoded(String *);
|
|
static void string_print_timestamp(String *);
|
|
static void string_print_trimmed(String *);
|
|
static void string_print_trimmed_multi(String *);
|
|
static void string_print_uri(String *);
|
|
static void xmlattr(XMLParser *, const char *, size_t, const char *, size_t,
|
|
const char *, size_t);
|
|
static void xmlattrentity(XMLParser *, const char *, size_t, const char *,
|
|
size_t, const char *, size_t);
|
|
static void xmlattrend(XMLParser *, const char *, size_t, const char *,
|
|
size_t);
|
|
static void xmlattrstart(XMLParser *, const char *, size_t, const char *,
|
|
size_t);
|
|
static void xmldata(XMLParser *, const char *, size_t);
|
|
static void xmldataentity(XMLParser *, const char *, size_t);
|
|
static void xmltagend(XMLParser *, const char *, size_t, int);
|
|
static void xmltagstart(XMLParser *, const char *, size_t);
|
|
static void xmltagstartparsed(XMLParser *, const char *, size_t, int);
|
|
|
|
/* map tag name to TagId type */
|
|
/* RSS, must be alphabetical order */
|
|
static const FeedTag rsstags[] = {
|
|
{ STRP("author"), RSSTagAuthor },
|
|
{ STRP("category"), RSSTagCategory },
|
|
{ STRP("content:encoded"), RSSTagContentEncoded },
|
|
{ STRP("dc:creator"), RSSTagDccreator },
|
|
{ STRP("dc:date"), RSSTagDcdate },
|
|
{ STRP("description"), RSSTagDescription },
|
|
/* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> */
|
|
{ STRP("enclosure"), RSSTagEnclosure },
|
|
{ STRP("guid"), RSSTagGuid },
|
|
{ STRP("link"), RSSTagLink },
|
|
{ STRP("media:description"), RSSTagMediaDescription },
|
|
{ STRP("pubdate"), RSSTagPubdate },
|
|
{ STRP("title"), RSSTagTitle }
|
|
};
|
|
|
|
/* Atom, must be alphabetical order */
|
|
static const FeedTag atomtags[] = {
|
|
{ STRP("author"), AtomTagAuthor },
|
|
{ STRP("category"), AtomTagCategory },
|
|
{ STRP("content"), AtomTagContent },
|
|
{ STRP("id"), AtomTagId },
|
|
{ STRP("issued"), AtomTagIssued }, /* Atom 0.3 */
|
|
/* Atom: <link href="" />, RSS has <link></link> */
|
|
{ STRP("link"), AtomTagLink },
|
|
{ STRP("media:description"), AtomTagMediaDescription },
|
|
{ STRP("modified"), AtomTagModified }, /* Atom 0.3 */
|
|
{ STRP("published"), AtomTagPublished },
|
|
{ STRP("summary"), AtomTagSummary },
|
|
{ STRP("title"), AtomTagTitle },
|
|
{ STRP("updated"), AtomTagUpdated }
|
|
};
|
|
|
|
/* special case: nested <author><name> */
|
|
static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor };
|
|
static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName };
|
|
|
|
/* reference to no / unknown tag */
|
|
static const FeedTag notag = { STRP(""), TagUnknown };
|
|
|
|
/* map TagId type to RSS/Atom field, all tags must be defined */
|
|
static const int fieldmap[TagLast] = {
|
|
[TagUnknown] = -1,
|
|
/* RSS */
|
|
[RSSTagDcdate] = FeedFieldTime,
|
|
[RSSTagPubdate] = FeedFieldTime,
|
|
[RSSTagTitle] = FeedFieldTitle,
|
|
[RSSTagMediaDescription] = FeedFieldContent,
|
|
[RSSTagDescription] = FeedFieldContent,
|
|
[RSSTagContentEncoded] = FeedFieldContent,
|
|
[RSSTagGuid] = -1,
|
|
[RSSTagGuidPermalinkFalse] = FeedFieldId,
|
|
[RSSTagGuidPermalinkTrue] = FeedFieldId, /* special-case: both a link and an id */
|
|
[RSSTagLink] = FeedFieldLink,
|
|
[RSSTagEnclosure] = FeedFieldEnclosure,
|
|
[RSSTagAuthor] = FeedFieldAuthor,
|
|
[RSSTagDccreator] = FeedFieldAuthor,
|
|
[RSSTagCategory] = FeedFieldCategory,
|
|
/* Atom */
|
|
[AtomTagModified] = FeedFieldTime,
|
|
[AtomTagUpdated] = FeedFieldTime,
|
|
[AtomTagIssued] = FeedFieldTime,
|
|
[AtomTagPublished] = FeedFieldTime,
|
|
[AtomTagTitle] = FeedFieldTitle,
|
|
[AtomTagMediaDescription] = FeedFieldContent,
|
|
[AtomTagSummary] = FeedFieldContent,
|
|
[AtomTagContent] = FeedFieldContent,
|
|
[AtomTagId] = FeedFieldId,
|
|
[AtomTagLink] = -1,
|
|
[AtomTagLinkAlternate] = FeedFieldLink,
|
|
[AtomTagLinkEnclosure] = FeedFieldEnclosure,
|
|
[AtomTagAuthor] = -1,
|
|
[AtomTagAuthorName] = FeedFieldAuthor,
|
|
[AtomTagCategory] = FeedFieldCategory
|
|
};
|
|
|
|
static const int FieldSeparator = '\t';
|
|
/* separator for multiple values in a field, separator should be 1 byte */
|
|
static const char FieldMultiSeparator[] = "|";
|
|
static struct uri baseuri;
|
|
static const char *baseurl;
|
|
|
|
static FeedContext ctx;
|
|
static XMLParser parser; /* XML parser state */
|
|
static String attrispermalink, attrrel, attrtype, tmpstr;
|
|
|
|
static int
|
|
tagcmp(const void *v1, const void *v2)
|
|
{
|
|
return strcasecmp(((FeedTag *)v1)->name, ((FeedTag *)v2)->name);
|
|
}
|
|
|
|
/* Unique tagid for parsed tag name. */
|
|
static FeedTag *
|
|
gettag(enum FeedType feedtype, const char *name, size_t namelen)
|
|
{
|
|
FeedTag f, *r = NULL;
|
|
|
|
f.name = (char *)name;
|
|
|
|
switch (feedtype) {
|
|
case FeedTypeRSS:
|
|
r = bsearch(&f, rsstags, sizeof(rsstags) / sizeof(rsstags[0]),
|
|
sizeof(rsstags[0]), tagcmp);
|
|
break;
|
|
case FeedTypeAtom:
|
|
r = bsearch(&f, atomtags, sizeof(atomtags) / sizeof(atomtags[0]),
|
|
sizeof(atomtags[0]), tagcmp);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return r;
|
|
}
|
|
|
|
static char *
|
|
ltrim(const char *s)
|
|
{
|
|
for (; ISSPACE((unsigned char)*s); s++)
|
|
;
|
|
return (char *)s;
|
|
}
|
|
|
|
static char *
|
|
rtrim(const char *s)
|
|
{
|
|
const char *e;
|
|
|
|
for (e = s + strlen(s); e > s && ISSPACE((unsigned char)*(e - 1)); e--)
|
|
;
|
|
return (char *)e;
|
|
}
|
|
|
|
/* Clear string only; don't free, prevents unnecessary reallocation. */
|
|
static void
|
|
string_clear(String *s)
|
|
{
|
|
if (s->data)
|
|
s->data[0] = '\0';
|
|
s->len = 0;
|
|
}
|
|
|
|
static void
|
|
string_buffer_realloc(String *s, size_t newlen)
|
|
{
|
|
size_t alloclen;
|
|
|
|
if (newlen > SIZE_MAX / 2) {
|
|
alloclen = SIZE_MAX;
|
|
} else {
|
|
for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
|
|
;
|
|
}
|
|
if (!(s->data = realloc(s->data, alloclen)))
|
|
err(1, "realloc");
|
|
s->bufsiz = alloclen;
|
|
}
|
|
|
|
/* Append data to String, s->data and data may not overlap. */
|
|
static void
|
|
string_append(String *s, const char *data, size_t len)
|
|
{
|
|
if (!len)
|
|
return;
|
|
|
|
if (s->len >= SIZE_MAX - len) {
|
|
errno = EOVERFLOW;
|
|
err(1, "realloc");
|
|
}
|
|
|
|
/* check if allocation is necessary, never shrink the buffer. */
|
|
if (s->len + len >= s->bufsiz)
|
|
string_buffer_realloc(s, s->len + len + 1);
|
|
memcpy(s->data + s->len, data, len);
|
|
s->len += len;
|
|
s->data[s->len] = '\0';
|
|
}
|
|
|
|
/* Print text, encode TABs, newlines and '\', remove other whitespace.
|
|
* Remove leading and trailing whitespace. */
|
|
static void
|
|
string_print_encoded(String *s)
|
|
{
|
|
const char *p, *e;
|
|
|
|
if (!s->data || !s->len)
|
|
return;
|
|
|
|
p = ltrim(s->data);
|
|
e = rtrim(p);
|
|
|
|
for (; *p && p != e; p++) {
|
|
switch (*p) {
|
|
case '\n': putchar('\\'); putchar('n'); break;
|
|
case '\\': putchar('\\'); putchar('\\'); break;
|
|
case '\t': putchar('\\'); putchar('t'); break;
|
|
default:
|
|
/* ignore control chars */
|
|
if (!ISCNTRL((unsigned char)*p))
|
|
putchar(*p);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
|
|
printtrimmed(const char *s)
|
|
{
|
|
char *p, *e;
|
|
|
|
p = ltrim(s);
|
|
e = rtrim(p);
|
|
for (; *p && p != e; p++) {
|
|
if (ISSPACE((unsigned char)*p))
|
|
putchar(' '); /* any whitespace to space */
|
|
else if (!ISCNTRL((unsigned char)*p))
|
|
/* ignore other control chars */
|
|
putchar(*p);
|
|
}
|
|
}
|
|
|
|
/* Print text, replace TABs, carriage return and other whitespace with ' '.
|
|
* Other control chars are removed. Remove leading and trailing whitespace. */
|
|
static void
|
|
string_print_trimmed(String *s)
|
|
{
|
|
if (!s->data || !s->len)
|
|
return;
|
|
|
|
printtrimmed(s->data);
|
|
}
|
|
|
|
/* Print each field with trimmed whitespace, separated by '|'. */
|
|
static void
|
|
string_print_trimmed_multi(String *s)
|
|
{
|
|
char *p, *e;
|
|
int c;
|
|
|
|
if (!s->data || !s->len)
|
|
return;
|
|
|
|
for (p = s->data; ; p = e + 1) {
|
|
if ((e = strstr(p, FieldMultiSeparator))) {
|
|
c = *e;
|
|
*e = '\0';
|
|
printtrimmed(p);
|
|
*e = c; /* restore NUL byte to original character */
|
|
fputs(FieldMultiSeparator, stdout);
|
|
} else {
|
|
printtrimmed(p);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Print URL, if it's a relative URL then it uses the global `baseurl`. */
|
|
static void
|
|
printuri(char *s)
|
|
{
|
|
char link[4096], *p, *e;
|
|
struct uri newuri, olduri;
|
|
int c, r = -1;
|
|
|
|
p = ltrim(s);
|
|
e = rtrim(p);
|
|
c = *e;
|
|
*e = '\0';
|
|
|
|
if (baseurl && !uri_hasscheme(p) &&
|
|
uri_parse(p, &olduri) != -1 && !olduri.proto[0] &&
|
|
uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0])
|
|
r = uri_format(link, sizeof(link), &newuri);
|
|
|
|
if (r >= 0 && (size_t)r < sizeof(link))
|
|
printtrimmed(link);
|
|
else
|
|
printtrimmed(p);
|
|
|
|
*e = c; /* restore NUL byte to original character */
|
|
}
|
|
|
|
/* Print URL, if it's a relative URL then it uses the global `baseurl`. */
|
|
static void
|
|
string_print_uri(String *s)
|
|
{
|
|
if (!s->data || !s->len)
|
|
return;
|
|
|
|
printuri(s->data);
|
|
}
|
|
|
|
/* Print as UNIX timestamp, print nothing if the time is empty or invalid. */
|
|
static void
|
|
string_print_timestamp(String *s)
|
|
{
|
|
long long t;
|
|
|
|
if (!s->data || !s->len)
|
|
return;
|
|
|
|
if (parsetime(s->data, &t) != -1)
|
|
printf("%lld", t);
|
|
}
|
|
|
|
/* Convert time fields. Returns a UNIX timestamp. */
|
|
static long long
|
|
datetounix(long long year, int mon, int day, int hour, int min, int sec)
|
|
{
|
|
static const int secs_through_month[] = {
|
|
0, 31 * 86400, 59 * 86400, 90 * 86400,
|
|
120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
|
|
243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
|
|
int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
|
|
long long t;
|
|
|
|
if (year - 2ULL <= 136) {
|
|
leaps = (year - 68) >> 2;
|
|
if (!((year - 68) & 3)) {
|
|
leaps--;
|
|
is_leap = 1;
|
|
} else {
|
|
is_leap = 0;
|
|
}
|
|
t = 31536000 * (year - 70) + 86400 * leaps;
|
|
} else {
|
|
cycles = (year - 100) / 400;
|
|
rem = (year - 100) % 400;
|
|
if (rem < 0) {
|
|
cycles--;
|
|
rem += 400;
|
|
}
|
|
if (!rem) {
|
|
is_leap = 1;
|
|
} else {
|
|
if (rem >= 300)
|
|
centuries = 3, rem -= 300;
|
|
else if (rem >= 200)
|
|
centuries = 2, rem -= 200;
|
|
else if (rem >= 100)
|
|
centuries = 1, rem -= 100;
|
|
if (rem) {
|
|
leaps = rem / 4U;
|
|
rem %= 4U;
|
|
is_leap = !rem;
|
|
}
|
|
}
|
|
leaps += 97 * cycles + 24 * centuries - is_leap;
|
|
t = (year - 100) * 31536000LL + leaps * 86400LL + 946684800 + 86400;
|
|
}
|
|
t += secs_through_month[mon];
|
|
if (is_leap && mon >= 2)
|
|
t += 86400;
|
|
t += 86400LL * (day - 1);
|
|
t += 3600LL * hour;
|
|
t += 60LL * min;
|
|
t += sec;
|
|
|
|
return t;
|
|
}
|
|
|
|
/* Get timezone from string, return time offset in seconds from UTC.
|
|
* NOTE: only parses timezones in RFC-822, many other timezone names are
|
|
* ambiguous anyway.
|
|
* ANSI and military zones are defined wrong in RFC822 and are unsupported,
|
|
* see note on RFC2822 4.3 page 32. */
|
|
static long
|
|
gettzoffset(const char *s)
|
|
{
|
|
static const struct {
|
|
char *name;
|
|
int offhour;
|
|
} tzones[] = {
|
|
{ "CDT", -5 * 3600 },
|
|
{ "CST", -6 * 3600 },
|
|
{ "EDT", -4 * 3600 },
|
|
{ "EST", -5 * 3600 },
|
|
{ "MDT", -6 * 3600 },
|
|
{ "MST", -7 * 3600 },
|
|
{ "PDT", -7 * 3600 },
|
|
{ "PST", -8 * 3600 },
|
|
};
|
|
const char *p;
|
|
long tzhour = 0, tzmin = 0;
|
|
size_t i;
|
|
|
|
for (; ISSPACE((unsigned char)*s); s++)
|
|
;
|
|
switch (*s) {
|
|
case '-': /* offset */
|
|
case '+':
|
|
for (i = 0, p = s + 1; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
|
|
tzhour = (tzhour * 10) + (*p - '0');
|
|
if (*p == ':')
|
|
p++;
|
|
for (i = 0; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
|
|
tzmin = (tzmin * 10) + (*p - '0');
|
|
return ((tzhour * 3600) + (tzmin * 60)) * (s[0] == '-' ? -1 : 1);
|
|
default: /* timezone name */
|
|
for (i = 0; ISALPHA((unsigned char)s[i]); i++)
|
|
;
|
|
if (i != 3)
|
|
return 0;
|
|
/* compare timezone and adjust offset relative to UTC */
|
|
for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) {
|
|
if (!memcmp(s, tzones[i].name, 3))
|
|
return tzones[i].offhour;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* Parse time string `s` into the UNIX timestamp `tp`.
|
|
Returns 0 on success or -1 on failure. */
|
|
static int
|
|
parsetime(const char *s, long long *tp)
|
|
{
|
|
static const struct {
|
|
char *name;
|
|
int len;
|
|
} mons[] = {
|
|
{ STRP("January"), },
|
|
{ STRP("February"), },
|
|
{ STRP("March"), },
|
|
{ STRP("April"), },
|
|
{ STRP("May"), },
|
|
{ STRP("June"), },
|
|
{ STRP("July"), },
|
|
{ STRP("August"), },
|
|
{ STRP("September"), },
|
|
{ STRP("October"), },
|
|
{ STRP("November"), },
|
|
{ STRP("December"), },
|
|
};
|
|
int va[6] = { 0 }, i, j, v, vi;
|
|
size_t m;
|
|
|
|
for (; ISSPACE((unsigned char)*s); s++)
|
|
;
|
|
if (!ISDIGIT((unsigned char)*s) && !ISALPHA((unsigned char)*s))
|
|
return -1;
|
|
|
|
if (ISDIGIT((unsigned char)s[0]) &&
|
|
ISDIGIT((unsigned char)s[1]) &&
|
|
ISDIGIT((unsigned char)s[2]) &&
|
|
ISDIGIT((unsigned char)s[3])) {
|
|
/* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */
|
|
vi = 0;
|
|
} else {
|
|
/* format: "[%a, ]%d %b %Y %H:%M:%S" */
|
|
/* parse "[%a, ]%d %b %Y " part, then use time parsing as above */
|
|
for (; ISALPHA((unsigned char)*s); s++)
|
|
;
|
|
for (; ISSPACE((unsigned char)*s); s++)
|
|
;
|
|
if (*s == ',')
|
|
s++;
|
|
for (; ISSPACE((unsigned char)*s); s++)
|
|
;
|
|
for (v = 0, i = 0; i < 2 && ISDIGIT((unsigned char)*s); s++, i++)
|
|
v = (v * 10) + (*s - '0');
|
|
va[2] = v; /* day */
|
|
for (; ISSPACE((unsigned char)*s); s++)
|
|
;
|
|
/* end of word month */
|
|
for (j = 0; ISALPHA((unsigned char)s[j]); j++)
|
|
;
|
|
/* check month name */
|
|
if (j < 3 || j > 9)
|
|
return -1; /* month cannot match */
|
|
for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) {
|
|
/* abbreviation (3 length) or long name */
|
|
if ((j == 3 || j == mons[m].len) &&
|
|
!strncasecmp(mons[m].name, s, j)) {
|
|
va[1] = m + 1;
|
|
s += j;
|
|
break;
|
|
}
|
|
}
|
|
if (m >= 12)
|
|
return -1; /* no month found */
|
|
for (; ISSPACE((unsigned char)*s); s++)
|
|
;
|
|
for (v = 0, i = 0; i < 4 && ISDIGIT((unsigned char)*s); s++, i++)
|
|
v = (v * 10) + (*s - '0');
|
|
/* obsolete short year: RFC2822 4.3 */
|
|
if (i <= 3)
|
|
v += (v >= 0 && v <= 49) ? 2000 : 1900;
|
|
va[0] = v; /* year */
|
|
for (; ISSPACE((unsigned char)*s); s++)
|
|
;
|
|
/* parse only regular time part, see below */
|
|
vi = 3;
|
|
}
|
|
|
|
/* parse time parts (and possibly remaining date parts) */
|
|
for (; *s && vi < 6; vi++) {
|
|
for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) &&
|
|
ISDIGIT((unsigned char)*s); s++, i++) {
|
|
v = (v * 10) + (*s - '0');
|
|
}
|
|
va[vi] = v;
|
|
|
|
if ((vi < 2 && *s == '-') ||
|
|
(vi == 2 && (*s == 'T' || ISSPACE((unsigned char)*s))) ||
|
|
(vi > 2 && *s == ':'))
|
|
s++;
|
|
}
|
|
|
|
/* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */
|
|
if (*s == '.') {
|
|
for (s++; ISDIGIT((unsigned char)*s); s++)
|
|
;
|
|
}
|
|
|
|
/* invalid range */
|
|
if (va[0] < 0 || va[0] > 9999 ||
|
|
va[1] < 1 || va[1] > 12 ||
|
|
va[2] < 1 || va[2] > 31 ||
|
|
va[3] < 0 || va[3] > 23 ||
|
|
va[4] < 0 || va[4] > 59 ||
|
|
va[5] < 0 || va[5] > 60) /* allow leap second */
|
|
return -1;
|
|
|
|
*tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) -
|
|
gettzoffset(s);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void
|
|
printfields(void)
|
|
{
|
|
string_print_timestamp(&ctx.fields[FeedFieldTime].str);
|
|
putchar(FieldSeparator);
|
|
string_print_trimmed(&ctx.fields[FeedFieldTitle].str);
|
|
putchar(FieldSeparator);
|
|
string_print_uri(&ctx.fields[FeedFieldLink].str);
|
|
putchar(FieldSeparator);
|
|
string_print_encoded(&ctx.fields[FeedFieldContent].str);
|
|
putchar(FieldSeparator);
|
|
fputs(contenttypes[ctx.contenttype], stdout);
|
|
putchar(FieldSeparator);
|
|
string_print_trimmed(&ctx.fields[FeedFieldId].str);
|
|
putchar(FieldSeparator);
|
|
string_print_trimmed(&ctx.fields[FeedFieldAuthor].str);
|
|
putchar(FieldSeparator);
|
|
string_print_uri(&ctx.fields[FeedFieldEnclosure].str);
|
|
putchar(FieldSeparator);
|
|
string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str);
|
|
putchar('\n');
|
|
|
|
if (ferror(stdout)) /* check for errors but do not flush */
|
|
checkfileerror(stdout, "<stdout>", 'w');
|
|
}
|
|
|
|
static int
|
|
istag(const char *name, size_t len, const char *name2, size_t len2)
|
|
{
|
|
return (len == len2 && !strcasecmp(name, name2));
|
|
}
|
|
|
|
static int
|
|
isattr(const char *name, size_t len, const char *name2, size_t len2)
|
|
{
|
|
return (len == len2 && !strcasecmp(name, name2));
|
|
}
|
|
|
|
static void
|
|
xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
|
|
const char *v, size_t vl)
|
|
{
|
|
/* handles transforming inline XML to data */
|
|
if (ISINCONTENT(ctx)) {
|
|
if (ctx.contenttype == ContentTypeHTML)
|
|
xmldata(p, v, vl);
|
|
return;
|
|
}
|
|
|
|
if (!ctx.tag.id)
|
|
return;
|
|
|
|
/* content-type may be: Atom: text, xhtml, html or mime-type.
|
|
MRSS (media:description): plain, html. */
|
|
if (ISCONTENTTAG(ctx)) {
|
|
if (isattr(n, nl, STRP("type")))
|
|
string_append(&attrtype, v, vl);
|
|
return;
|
|
}
|
|
|
|
if (ctx.feedtype == FeedTypeRSS) {
|
|
if (ctx.tag.id == RSSTagEnclosure &&
|
|
isattr(n, nl, STRP("url"))) {
|
|
string_append(&tmpstr, v, vl);
|
|
} else if (ctx.tag.id == RSSTagGuid &&
|
|
isattr(n, nl, STRP("ispermalink"))) {
|
|
string_append(&attrispermalink, v, vl);
|
|
}
|
|
} else if (ctx.feedtype == FeedTypeAtom) {
|
|
if (ctx.tag.id == AtomTagLink) {
|
|
if (isattr(n, nl, STRP("rel"))) {
|
|
string_append(&attrrel, v, vl);
|
|
} else if (isattr(n, nl, STRP("href"))) {
|
|
string_append(&tmpstr, v, vl);
|
|
}
|
|
} else if (ctx.tag.id == AtomTagCategory &&
|
|
isattr(n, nl, STRP("term"))) {
|
|
string_append(&tmpstr, v, vl);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
|
|
xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
|
|
const char *data, size_t datalen)
|
|
{
|
|
char buf[16];
|
|
int len;
|
|
|
|
/* handles transforming inline XML to data */
|
|
if (ISINCONTENT(ctx)) {
|
|
if (ctx.contenttype == ContentTypeHTML)
|
|
xmldata(p, data, datalen);
|
|
return;
|
|
}
|
|
|
|
if (!ctx.tag.id)
|
|
return;
|
|
|
|
/* try to translate entity, else just pass as data to
|
|
* xmlattr handler. */
|
|
if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
|
|
xmlattr(p, t, tl, n, nl, buf, (size_t)len);
|
|
else
|
|
xmlattr(p, t, tl, n, nl, data, datalen);
|
|
}
|
|
|
|
static void
|
|
xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
|
|
{
|
|
if (ISINCONTENT(ctx)) {
|
|
if (ctx.contenttype == ContentTypeHTML) {
|
|
/* handles transforming inline XML to data */
|
|
xmldata(p, "\"", 1);
|
|
ctx.attrcount = 0;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
|
|
static void
|
|
xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
|
|
{
|
|
if (ISINCONTENT(ctx)) {
|
|
if (ctx.contenttype == ContentTypeHTML) {
|
|
/* handles transforming inline XML to data */
|
|
if (!ctx.attrcount)
|
|
xmldata(p, " ", 1);
|
|
ctx.attrcount++;
|
|
xmldata(p, n, nl);
|
|
xmldata(p, "=\"", 2);
|
|
}
|
|
return;
|
|
}
|
|
|
|
if (attrispermalink.len && isattr(n, nl, STRP("ispermalink")))
|
|
string_clear(&attrispermalink);
|
|
else if (attrrel.len && isattr(n, nl, STRP("rel")))
|
|
string_clear(&attrrel);
|
|
else if (attrtype.len && isattr(n, nl, STRP("type")))
|
|
string_clear(&attrtype);
|
|
else if (tmpstr.len &&
|
|
(isattr(n, nl, STRP("href")) ||
|
|
isattr(n, nl, STRP("term")) ||
|
|
isattr(n, nl, STRP("url"))))
|
|
string_clear(&tmpstr); /* use the last value for multiple attribute values */
|
|
}
|
|
|
|
static void
|
|
xmldata(XMLParser *p, const char *s, size_t len)
|
|
{
|
|
if (!ctx.field)
|
|
return;
|
|
|
|
if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
|
|
string_append(&tmpstr, s, len);
|
|
else
|
|
string_append(ctx.field, s, len);
|
|
}
|
|
|
|
static void
|
|
xmldataentity(XMLParser *p, const char *data, size_t datalen)
|
|
{
|
|
char buf[16];
|
|
int len;
|
|
|
|
if (!ctx.field)
|
|
return;
|
|
|
|
/* try to translate entity, else just pass as data to
|
|
* xmldata handler. */
|
|
if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
|
|
xmldata(p, buf, (size_t)len);
|
|
else
|
|
xmldata(p, data, datalen);
|
|
}
|
|
|
|
static void
|
|
xmltagstart(XMLParser *p, const char *t, size_t tl)
|
|
{
|
|
const FeedTag *f;
|
|
|
|
if (ISINCONTENT(ctx)) {
|
|
if (ctx.contenttype == ContentTypeHTML) {
|
|
ctx.attrcount = 0;
|
|
xmldata(p, "<", 1);
|
|
xmldata(p, t, tl);
|
|
}
|
|
return;
|
|
}
|
|
|
|
/* start of RSS or Atom item / entry */
|
|
if (ctx.feedtype == FeedTypeNone) {
|
|
if (istag(t, tl, STRP("entry")))
|
|
ctx.feedtype = FeedTypeAtom;
|
|
else if (istag(t, tl, STRP("item")))
|
|
ctx.feedtype = FeedTypeRSS;
|
|
return;
|
|
}
|
|
|
|
/* field tagid already set or nested tags. */
|
|
if (ctx.tag.id) {
|
|
/* nested <author><name> for Atom */
|
|
if (ctx.tag.id == AtomTagAuthor &&
|
|
istag(t, tl, STRP("name"))) {
|
|
memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ctx.tag));
|
|
} else {
|
|
return; /* other nested tags are not allowed: return */
|
|
}
|
|
}
|
|
|
|
/* in item */
|
|
if (ctx.tag.id == TagUnknown) {
|
|
if (!(f = gettag(ctx.feedtype, t, tl)))
|
|
f = ¬ag;
|
|
memcpy(&(ctx.tag), f, sizeof(ctx.tag));
|
|
}
|
|
|
|
ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent);
|
|
string_clear(&attrispermalink);
|
|
string_clear(&attrrel);
|
|
string_clear(&attrtype);
|
|
}
|
|
|
|
static void
|
|
xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
|
|
{
|
|
enum TagId tagid;
|
|
|
|
if (ISINCONTENT(ctx)) {
|
|
if (ctx.contenttype == ContentTypeHTML) {
|
|
if (isshort)
|
|
xmldata(p, "/>", 2);
|
|
else
|
|
xmldata(p, ">", 1);
|
|
}
|
|
return;
|
|
}
|
|
|
|
/* set tag type based on it's attribute value */
|
|
if (ctx.tag.id == RSSTagGuid) {
|
|
/* if empty the default is "true" */
|
|
if (!attrispermalink.len ||
|
|
isattr(attrispermalink.data, attrispermalink.len, STRP("true")))
|
|
ctx.tag.id = RSSTagGuidPermalinkTrue;
|
|
else
|
|
ctx.tag.id = RSSTagGuidPermalinkFalse;
|
|
} else if (ctx.tag.id == AtomTagLink) {
|
|
/* empty or "alternate": other types could be
|
|
"enclosure", "related", "self" or "via" */
|
|
if (!attrrel.len || isattr(attrrel.data, attrrel.len, STRP("alternate")))
|
|
ctx.tag.id = AtomTagLinkAlternate;
|
|
else if (isattr(attrrel.data, attrrel.len, STRP("enclosure")))
|
|
ctx.tag.id = AtomTagLinkEnclosure;
|
|
else
|
|
ctx.tag.id = AtomTagLink; /* unknown */
|
|
}
|
|
|
|
tagid = ctx.tag.id;
|
|
|
|
/* map tag type to field: unknown or lesser priority is ignored,
|
|
when tags of the same type are repeated only the first is used. */
|
|
if (fieldmap[tagid] == -1 ||
|
|
(!ISFEEDFIELDMULTI(fieldmap[tagid]) &&
|
|
tagid <= ctx.fields[fieldmap[tagid]].tagid)) {
|
|
return;
|
|
}
|
|
|
|
if (ctx.iscontenttag) {
|
|
ctx.iscontent = 1;
|
|
ctx.iscontenttag = 0;
|
|
|
|
/* detect content-type based on type attribute */
|
|
if (attrtype.len) {
|
|
if (isattr(attrtype.data, attrtype.len, STRP("html")) ||
|
|
isattr(attrtype.data, attrtype.len, STRP("xhtml")) ||
|
|
isattr(attrtype.data, attrtype.len, STRP("text/html")) ||
|
|
isattr(attrtype.data, attrtype.len, STRP("text/xhtml")) ||
|
|
isattr(attrtype.data, attrtype.len, STRP("application/xhtml+xml")))
|
|
ctx.contenttype = ContentTypeHTML;
|
|
else /* unknown: handle as base64 text data */
|
|
ctx.contenttype = ContentTypePlain;
|
|
} else {
|
|
/* default content-type */
|
|
if (tagid == RSSTagContentEncoded || tagid == RSSTagDescription)
|
|
ctx.contenttype = ContentTypeHTML;
|
|
else
|
|
ctx.contenttype = ContentTypePlain;
|
|
}
|
|
}
|
|
|
|
ctx.field = &(ctx.fields[fieldmap[tagid]].str);
|
|
ctx.fields[fieldmap[tagid]].tagid = tagid;
|
|
|
|
/* clear field if it is overwritten (with a priority order) for the new
|
|
value, if the field can have multiple values then do not clear it. */
|
|
if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
|
|
string_clear(ctx.field);
|
|
}
|
|
|
|
static void
|
|
xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
|
|
{
|
|
size_t i;
|
|
|
|
if (ctx.feedtype == FeedTypeNone)
|
|
return;
|
|
|
|
if (ISINCONTENT(ctx)) {
|
|
/* not a closed content field */
|
|
if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) {
|
|
if (!isshort && ctx.contenttype == ContentTypeHTML) {
|
|
xmldata(p, "</", 2);
|
|
xmldata(p, t, tl);
|
|
xmldata(p, ">", 1);
|
|
}
|
|
return;
|
|
}
|
|
} else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) {
|
|
/* matched tag end: close it */
|
|
/* copy also to the link field if the attribute isPermaLink="true"
|
|
and it is not set by a tag with higher priority. */
|
|
if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field &&
|
|
ctx.tag.id > ctx.fields[FeedFieldLink].tagid) {
|
|
string_clear(&ctx.fields[FeedFieldLink].str);
|
|
string_append(&ctx.fields[FeedFieldLink].str,
|
|
ctx.field->data, ctx.field->len);
|
|
ctx.fields[FeedFieldLink].tagid = ctx.tag.id;
|
|
}
|
|
} else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom &&
|
|
istag(t, tl, STRP("entry"))) || /* Atom */
|
|
(ctx.feedtype == FeedTypeRSS &&
|
|
istag(t, tl, STRP("item"))))) /* RSS */
|
|
{
|
|
/* end of RSS or Atom entry / item */
|
|
printfields();
|
|
|
|
/* clear strings */
|
|
for (i = 0; i < FeedFieldLast; i++) {
|
|
string_clear(&ctx.fields[i].str);
|
|
ctx.fields[i].tagid = TagUnknown;
|
|
}
|
|
ctx.contenttype = ContentTypeNone;
|
|
/* allow parsing of Atom and RSS concatenated in one XML stream. */
|
|
ctx.feedtype = FeedTypeNone;
|
|
} else {
|
|
return; /* not end of field */
|
|
}
|
|
|
|
/* temporary string: for fields that cannot be processed
|
|
directly and need more context, for example by it's tag
|
|
attributes, like the Atom link rel="alternate|enclosure". */
|
|
if (tmpstr.len && ctx.field) {
|
|
if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) {
|
|
if (ctx.field->len)
|
|
string_append(ctx.field, FieldMultiSeparator, 1);
|
|
string_append(ctx.field, tmpstr.data, tmpstr.len);
|
|
} else {
|
|
string_clear(ctx.field);
|
|
string_append(ctx.field, tmpstr.data, tmpstr.len);
|
|
}
|
|
}
|
|
|
|
/* close field */
|
|
string_clear(&tmpstr); /* reuse and clear temporary string */
|
|
|
|
if (ctx.tag.id == AtomTagAuthorName)
|
|
memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* outer tag */
|
|
else
|
|
memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag));
|
|
|
|
ctx.iscontent = 0;
|
|
ctx.field = NULL;
|
|
}
|
|
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
if (pledge("stdio", NULL) == -1)
|
|
err(1, "pledge");
|
|
|
|
if (argc > 1) {
|
|
if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[0])
|
|
baseurl = argv[1];
|
|
else
|
|
errx(1, "baseurl incorrect or too long");
|
|
}
|
|
|
|
memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag));
|
|
|
|
parser.xmlattr = xmlattr;
|
|
parser.xmlattrentity = xmlattrentity;
|
|
parser.xmlattrend = xmlattrend;
|
|
parser.xmlattrstart = xmlattrstart;
|
|
parser.xmlcdata = xmldata;
|
|
parser.xmldata = xmldata;
|
|
parser.xmldataentity = xmldataentity;
|
|
parser.xmltagend = xmltagend;
|
|
parser.xmltagstart = xmltagstart;
|
|
parser.xmltagstartparsed = xmltagstartparsed;
|
|
|
|
/* NOTE: getnext is defined in xml.h for inline optimization */
|
|
xml_parse(&parser);
|
|
|
|
checkfileerror(stdin, "<stdin>", 'r');
|
|
checkfileerror(stdout, "<stdout>", 'w');
|
|
|
|
return 0;
|
|
}
|