/*
 * sfeed/xml.c
 * 447 lines, 10 KiB, C
 * (web view links: Raw / Normal View / History)
 */

#include <sys/types.h>
#include <ctype.h>
#include <errno.h>
/* blame annotation: 2015-08-22 18:13:18 +00:00 */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "xml.h"
/*
 * Parse the attribute list of a start tag, reading characters with
 * x->getnext() until a '>' is seen or EOF is returned.
 *
 * The optional callbacks are passed x->tag / x->taglen plus the attribute
 * name collected in x->name:
 *   - xmlattrstart: once per attribute, before its value;
 *   - xmlattr: value data, possibly in several chunks because the value is
 *     staged in x->data and flushed whenever that buffer fills up or an
 *     entity starts;
 *   - xmlattrentity: a complete "&...;" sequence found in a quoted value;
 *   - xmlattrend: after the closing quote (or right away for an attribute
 *     that has no value, which is reported with an empty value string).
 *
 * A '/' outside a quoted value sets x->isshorttag. Attribute names that do
 * not fit in x->name are silently truncated.
 */
static void
xml_parseattrs(XMLParser *x)
{
	size_t namelen = 0, valuelen;
	int c, endsep, endname = 0;

	while ((c = x->getnext()) != EOF) {
		if (isspace(c)) { /* TODO: simplify endname ? */
			/* whitespace after a name marks the end of that name */
			if (namelen)
				endname = 1;
			continue;
		}
		if (c == '?')
			; /* ignore */
		else if (c == '=') {
			x->name[namelen] = '\0';
		} else if (namelen && ((endname && isalpha(c)) || (c == '>' || c == '/'))) {
			/* attribute without value */
			x->name[namelen] = '\0';
			if (x->xmlattrstart)
				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
			if (x->xmlattr)
				x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
			if (x->xmlattrend)
				x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
			endname = 0;
			/* c already belongs to whatever follows: start a new name with it */
			x->name[0] = c;
			namelen = 1;
		} else if (namelen && (c == '\'' || c == '"')) {
			/* attribute with value */
			endsep = c; /* c is end separator */
			if (x->xmlattrstart)
				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
			for (valuelen = 0; (c = x->getnext()) != EOF;) {
				if (c == '&') { /* entities */
					x->data[valuelen] = '\0';
					/* call data function with data before entity if there is data */
					if (valuelen && x->xmlattr)
						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
					x->data[0] = c;
					valuelen = 1;
					while ((c = x->getnext()) != EOF) {
						if (c == endsep)
							break;
						if (valuelen < sizeof(x->data) - 1)
							x->data[valuelen++] = c;
						else {
							/* entity too long for buffer, handle as normal data */
							x->data[valuelen] = '\0';
							if (x->xmlattr)
								x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
							x->data[0] = c;
							valuelen = 1;
							break;
						}
						if (c == ';') {
							/* complete entity: hand it over unconverted */
							x->data[valuelen] = '\0';
							if (x->xmlattrentity)
								x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
							valuelen = 0;
							break;
						}
					}
				} else if (c != endsep) {
					if (valuelen < sizeof(x->data) - 1) {
						x->data[valuelen++] = c;
					} else {
						/* buffer full: flush it and restart with c */
						x->data[valuelen] = '\0';
						if (x->xmlattr)
							x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
						x->data[0] = c;
						valuelen = 1;
					}
				}
				if (c == endsep) {
					/* closing quote: flush what is left (may be empty) */
					x->data[valuelen] = '\0';
					if (x->xmlattr)
						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
					if (x->xmlattrend)
						x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
					break;
				}
			}
			namelen = endname = 0;
		} else if (namelen < sizeof(x->name) - 1) {
			x->name[namelen++] = c;
		}
		if (c == '>') {
			break;
		} else if (c == '/') {
			x->isshorttag = 1;
			x->name[0] = '\0';
			namelen = 0;
		}
	}
}
/*
 * Parse the body of a comment; xml_parse() calls this once "<!--" has been
 * read. Characters are read with x->getnext() until the terminating "-->"
 * or EOF.
 *
 * Callbacks (all optional): xmlcommentstart once on entry, xmlcomment for
 * each chunk of comment text, xmlcommentend when "-->" is found. Text is
 * staged in x->data and flushed when the buffer is full or a '-' / '>' is
 * seen. Dashes that turn out not to be part of the terminator are
 * re-emitted as one-byte "-" chunks. At EOF neither the buffered text is
 * flushed nor xmlcommentend called.
 */
static void
xml_parsecomment(XMLParser *x)
{
	size_t datalen = 0, i = 0; /* i: number of pending consecutive '-' */
	int c;

	if (x->xmlcommentstart)
		x->xmlcommentstart(x);
	while ((c = x->getnext()) != EOF) {
		if (c == '-' || c == '>') {
			/* flush buffered text first, but never report an empty chunk */
			if (x->xmlcomment && datalen) {
				x->data[datalen] = '\0';
				x->xmlcomment(x, x->data, datalen);
				datalen = 0;
			}
		}
		if (c == '-') {
			if (++i > 2) {
				/* more than two dashes in a row: the surplus is data */
				if (x->xmlcomment)
					for (; i > 2; i--)
						x->xmlcomment(x, "-", 1);
				i = 2;
			}
			continue;
		} else if (c == '>' && i == 2) {
			/* "-->" */
			if (x->xmlcommentend)
				x->xmlcommentend(x);
			return;
		} else if (i) {
			/* the pending dashes were ordinary data after all */
			if (x->xmlcomment) {
				for (; i > 0; i--)
					x->xmlcomment(x, "-", 1);
			}
			i = 0;
		}
		if (datalen < sizeof(x->data) - 1) {
			x->data[datalen++] = c;
		} else {
			/* buffer full: flush it and restart with c */
			x->data[datalen] = '\0';
			if (x->xmlcomment)
				x->xmlcomment(x, x->data, datalen);
			x->data[0] = c;
			datalen = 1;
		}
	}
}
/*
 * Parse the body of a CDATA section; xml_parse() calls this once
 * "<![CDATA[" has been read. Characters are read with x->getnext() until
 * the terminating "]]>" or EOF.
 *
 * Callbacks (all optional): xmlcdatastart once on entry, xmlcdata for each
 * chunk of text, xmlcdataend when "]]>" is found. Text is staged in
 * x->data and flushed when the buffer is full or a ']' / '>' is seen.
 * Brackets that turn out not to be part of the terminator are re-emitted
 * as one-byte "]" chunks. At EOF neither the buffered text is flushed nor
 * xmlcdataend called.
 */
static void
xml_parsecdata(XMLParser *x)
{
	size_t datalen = 0, i = 0; /* i: number of pending consecutive ']' */
	int c;

	if (x->xmlcdatastart)
		x->xmlcdatastart(x);
	while ((c = x->getnext()) != EOF) {
		if (c == ']' || c == '>') {
			/* flush buffered text first, but never report an empty chunk */
			if (x->xmlcdata && datalen) {
				x->data[datalen] = '\0';
				x->xmlcdata(x, x->data, datalen);
				datalen = 0;
			}
		}
		if (c == ']') {
			if (++i > 2) {
				/* more than two brackets in a row: the surplus is data */
				if (x->xmlcdata)
					for (; i > 2; i--)
						x->xmlcdata(x, "]", 1);
				i = 2;
			}
			continue;
		} else if (c == '>' && i == 2) {
			/* "]]>" */
			if (x->xmlcdataend)
				x->xmlcdataend(x);
			return;
		} else if (i) {
			/* the pending brackets were ordinary data after all */
			if (x->xmlcdata)
				for (; i > 0; i--)
					x->xmlcdata(x, "]", 1);
			i = 0;
		}
		if (datalen < sizeof(x->data) - 1) {
			x->data[datalen++] = c;
		} else {
			/* buffer full: flush it and restart with c */
			x->data[datalen] = '\0';
			if (x->xmlcdata)
				x->xmlcdata(x, x->data, datalen);
			x->data[0] = c;
			datalen = 1;
		}
	}
}
/*
 * Encode Unicode code point r as UTF-8 into s.
 *
 * s must have room for at least 4 bytes; no NUL terminator is written.
 * The caller is expected to have checked r <= 0x10FFFF (bits above the
 * 21 that fit in a 4-byte sequence are discarded).
 *
 * Returns the number of bytes written (1 to 4), or 0 when r is 0 so that a
 * NUL code point yields an empty string.
 */
static int
codepointtoutf8(const uint32_t r, uint8_t *s)
{
	if (r <= 0x7F) {
		/* 1 byte: 0aaaaaaa */
		s[0] = (uint8_t)r;
		return r ? 1 : 0;
	} else if (r <= 0x07FF) {
		/* 2 bytes: 110aaaaa 10bbbbbb */
		s[0] = (uint8_t)(0xC0 | ((r >> 6) & 0x1F));
		s[1] = (uint8_t)(0x80 | (r & 0x3F));
		return 2;
	} else if (r <= 0xFFFF) {
		/* 3 bytes: 1110aaaa 10bbbbbb 10cccccc */
		s[0] = (uint8_t)(0xE0 | ((r >> 12) & 0x0F));
		s[1] = (uint8_t)(0x80 | ((r >> 6) & 0x3F));
		s[2] = (uint8_t)(0x80 | (r & 0x3F));
		return 3;
	}
	/* 4 bytes: 11110aaa 10bbbbbb 10cccccc 10dddddd */
	s[0] = (uint8_t)(0xF0 | ((r >> 18) & 0x07));
	s[1] = (uint8_t)(0x80 | ((r >> 12) & 0x3F));
	s[2] = (uint8_t)(0x80 | ((r >> 6) & 0x3F));
	s[3] = (uint8_t)(0x80 | (r & 0x3F));
	return 4;
}
/*
 * Convert one of the predefined XML named entities (amp, lt, gt, apos,
 * quot; all-lowercase or all-uppercase) to its character.
 *
 * e must be the complete NUL-terminated entity including the leading '&'
 * and the trailing ';', e.g. "&amp;".
 *
 * Returns 1 and stores the character plus a NUL terminator in buf on a
 * match, 0 if e is not a known entity, -1 if bufsiz is less than 2.
 */
static int
namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
	static const struct {
		const char *entity;
		int c;
	} entities[] = {
		{ .entity = "&amp;",  .c = '&'  },
		{ .entity = "&lt;",   .c = '<'  },
		{ .entity = "&gt;",   .c = '>'  },
		{ .entity = "&apos;", .c = '\'' },
		{ .entity = "&quot;", .c = '"'  },
		{ .entity = "&AMP;",  .c = '&'  },
		{ .entity = "&LT;",   .c = '<'  },
		{ .entity = "&GT;",   .c = '>'  },
		{ .entity = "&APOS;", .c = '\'' },
		{ .entity = "&QUOT;", .c = '"'  }
	};
	size_t i;

	/* buffer is too small */
	if (bufsiz < 2)
		return -1;

	/* doesn't start with &: can't match */
	if (*e != '&')
		return 0;

	for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
		if (!strcmp(e, entities[i].entity)) {
			buf[0] = entities[i].c;
			buf[1] = '\0';
			return 1;
		}
	}
	return 0;
}
/*
 * Convert a numeric character reference ("&#NNN;" decimal or "&#xHHH;"
 * hexadecimal) to a NUL-terminated UTF-8 string in buf.
 *
 * Returns the byte length of the string written (excluding the NUL),
 * 0 if e is not a well-formed numeric entity or the code point is 0 or
 * above 0x10FFFF, and -1 if bufsiz is less than 5 (4 UTF-8 bytes + NUL).
 */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	/* keep the full strtoul() width so huge values are rejected by the
	 * range check below instead of being truncated to a valid code point */
	unsigned long l;
	int len;
	char *end;

	/* buffer is too small */
	if (bufsiz < 5)
		return -1;

	/* not a numeric entity */
	if (e[0] != '&' || e[1] != '#')
		return 0;

	/* e[1] == '#', numeric / hexadecimal entity */
	e += 2; /* skip "&#" */
	errno = 0;
	/* hex (16) or decimal (10) */
	if (*e == 'x') {
		e++;
		l = strtoul(e, &end, 16);
	} else {
		l = strtoul(e, &end, 10);
	}
	/* invalid value, no digits, not a well-formed entity or too high codepoint */
	if (errno || end == e || *end != ';' || l > 0x10FFFF)
		return 0;

	len = codepointtoutf8((uint32_t)l, (uint8_t *)buf);
	buf[len] = '\0';

	return len;
}
/*
 * Convert a named or numeric entity string (e.g. "&amp;", "&#228;",
 * "&#xe4;") to a NUL-terminated string in buf.
 *
 * Returns the byte length of the converted string, 0 if e is not a
 * recognized entity, or -1 if bufsiz is less than 5.
 */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	/* buffer is too small */
	if (bufsiz < 5)
		return -1;

	/* doesn't start with & */
	if (e[0] != '&')
		return 0;

	/* a '#' right after the '&' selects the numeric form */
	if (e[1] == '#')
		return numericentitytostr(e, buf, bufsiz);

	/* named entity */
	return namedentitytostr(e, buf, bufsiz);
}
/*
 * Parse a document, reading one character at a time with x->getnext()
 * until it returns EOF. Does nothing if x->getnext is not set.
 *
 * Everything before the first '<' is skipped. After that the input is
 * handled as an alternation of:
 *   - "<!...": comments ("<!--") go to xml_parsecomment(), CDATA sections
 *     ("<![CDATA[") to xml_parsecdata(); any other "<!...>" is skipped;
 *   - tags and processing instructions: the name is collected in x->tag
 *     (truncated if it does not fit). Start tags trigger xmltagstart, then
 *     attribute parsing, then xmltagstartparsed; end tags, short tags and
 *     processing instructions trigger xmltagend;
 *   - character data between tags: xmldatastart, xmldata (in chunks staged
 *     in x->data), xmldataentity for each "&...;" sequence, and xmldataend
 *     once the next '<' is reached.
 * All callbacks are optional.
 */
void
xml_parse(XMLParser *x)
{
	int c, ispi;
	size_t datalen, tagdatalen, taglen;

	if (!x->getnext)
		return;
	while ((c = x->getnext()) != EOF && c != '<')
		; /* skip until < */

	while (c != EOF) {
		if (c == '<') { /* parse tag */
			if ((c = x->getnext()) == EOF)
				return;

			if (c == '!') { /* cdata and comments */
				for (tagdatalen = 0; (c = x->getnext()) != EOF;) {
					/* only the first sizeof("[CDATA[") bytes are kept:
					 * enough to recognize "--" and "[CDATA[" */
					if (tagdatalen <= sizeof("[CDATA[") - 1)
						x->data[tagdatalen++] = c;
					if (c == '>')
						break;
					else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
					         (x->data[0] == '-')) {
						xml_parsecomment(x);
						break;
					} else if (c == '[') {
						if (tagdatalen == sizeof("[CDATA[") - 1 &&
						    !strncmp(x->data, "[CDATA[", tagdatalen)) {
							xml_parsecdata(x);
							break;
						}
					}
				}
			} else {
				x->tag[0] = '\0';
				x->taglen = 0;

				/* normal tag (open, short open, close), processing instruction. */
				if (isspace(c))
					while ((c = x->getnext()) != EOF && isspace(c))
						;
				if (c == EOF)
					return;
				x->tag[0] = c;
				ispi = (c == '?') ? 1 : 0;
				x->isshorttag = ispi;
				taglen = 1;
				while ((c = x->getnext()) != EOF) {
					if (c == '/') /* TODO: simplify short tag? */
						x->isshorttag = 1; /* short tag */
					else if (c == '>' || isspace(c)) {
						x->tag[taglen] = '\0';
						if (x->tag[0] == '/') { /* end tag, starts with </ */
							x->taglen = --taglen; /* len -1 because of / */
							if (taglen && x->xmltagend)
								x->xmltagend(x, &(x->tag)[1], x->taglen, 0);
						} else {
							x->taglen = taglen;
							/* start tag */
							if (x->xmltagstart)
								x->xmltagstart(x, x->tag, x->taglen);
							/* whitespace after the name: attributes may follow */
							if (isspace(c))
								xml_parseattrs(x);
							if (x->xmltagstartparsed)
								x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
						}
						/* call tagend for shortform or processing instruction */
						if ((x->isshorttag || ispi) && x->xmltagend)
							x->xmltagend(x, x->tag, x->taglen, 1);
						break;
					} else if (taglen < sizeof(x->tag) - 1)
						x->tag[taglen++] = c; /* NOTE: tag name truncation */
				}
			}
		} else {
			/* parse tag data */
			datalen = 0;
			if (x->xmldatastart)
				x->xmldatastart(x);
			while ((c = x->getnext()) != EOF) {
				if (c == '&') {
					/* flush the data collected before the entity */
					if (datalen) {
						x->data[datalen] = '\0';
						if (x->xmldata)
							x->xmldata(x, x->data, datalen);
					}
					x->data[0] = c;
					datalen = 1;
					while ((c = x->getnext()) != EOF) {
						if (c == '<')
							break;
						if (datalen < sizeof(x->data) - 1)
							x->data[datalen++] = c;
						if (isspace(c))
							break; /* not an entity: continue as plain data */
						else if (c == ';') {
							x->data[datalen] = '\0';
							if (x->xmldataentity)
								x->xmldataentity(x, x->data, datalen);
							datalen = 0;
							break;
						}
					}
				} else if (c != '<') {
					if (datalen < sizeof(x->data) - 1) {
						x->data[datalen++] = c;
					} else {
						/* buffer full: flush it and restart with c */
						x->data[datalen] = '\0';
						if (x->xmldata)
							x->xmldata(x, x->data, datalen);
						x->data[0] = c;
						datalen = 1;
					}
				}
				if (c == '<') {
					/* start of the next tag ends this run of data */
					x->data[datalen] = '\0';
					if (x->xmldata && datalen)
						x->xmldata(x, x->data, datalen);
					if (x->xmldataend)
						x->xmldataend(x);
					break;
				}
			}
		}
	}
}