416 lines
9.8 KiB
C
416 lines
9.8 KiB
C
#include <errno.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include "xml.h"
|
|
|
|
/* Nonzero when c is an ASCII letter (A-Z or a-z).
 * OR-ing with 32 folds upper case onto lower case; the subtraction is done
 * in unsigned arithmetic so values below 'a' wrap around to a large number
 * and a single comparison covers the whole range check.
 * The argument is parenthesized before the cast so that expression
 * arguments are converted as a whole. */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
|
|
/* Nonzero when c is a space or one of the five control characters
 * '\t', '\n', '\v', '\f', '\r' (the contiguous range 9..13).
 * The range test is done in unsigned arithmetic so values below '\t' wrap
 * around instead of comparing as negative numbers.
 * NOTE: c is evaluated twice, do not pass an expression with side effects. */
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
|
|
|
|
static void
|
|
xml_parseattrs(XMLParser *x)
|
|
{
|
|
size_t namelen = 0, valuelen;
|
|
int c, endsep, endname = 0, valuestart = 0;
|
|
|
|
while ((c = GETNEXT()) != EOF) {
|
|
if (ISSPACE(c)) {
|
|
if (namelen)
|
|
endname = 1;
|
|
continue;
|
|
} else if (c == '?')
|
|
; /* ignore */
|
|
else if (c == '=') {
|
|
x->name[namelen] = '\0';
|
|
valuestart = 1;
|
|
endname = 1;
|
|
} else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
|
|
/* attribute without value */
|
|
x->name[namelen] = '\0';
|
|
if (x->xmlattrstart)
|
|
x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
|
|
if (x->xmlattr)
|
|
x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
|
|
if (x->xmlattrend)
|
|
x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
|
|
endname = 0;
|
|
x->name[0] = c;
|
|
namelen = 1;
|
|
} else if (namelen && valuestart) {
|
|
/* attribute with value */
|
|
if (x->xmlattrstart)
|
|
x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
|
|
|
|
valuelen = 0;
|
|
if (c == '\'' || c == '"') {
|
|
endsep = c;
|
|
} else {
|
|
endsep = ' '; /* ISSPACE() */
|
|
goto startvalue;
|
|
}
|
|
|
|
while ((c = GETNEXT()) != EOF) {
|
|
startvalue:
|
|
if (c == '&') { /* entities */
|
|
x->data[valuelen] = '\0';
|
|
/* call data function with data before entity if there is data */
|
|
if (valuelen && x->xmlattr)
|
|
x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
|
|
x->data[0] = c;
|
|
valuelen = 1;
|
|
while ((c = GETNEXT()) != EOF) {
|
|
if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
|
|
break;
|
|
if (valuelen < sizeof(x->data) - 1)
|
|
x->data[valuelen++] = c;
|
|
else {
|
|
/* entity too long for buffer, handle as normal data */
|
|
x->data[valuelen] = '\0';
|
|
if (x->xmlattr)
|
|
x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
|
|
x->data[0] = c;
|
|
valuelen = 1;
|
|
break;
|
|
}
|
|
if (c == ';') {
|
|
x->data[valuelen] = '\0';
|
|
if (x->xmlattrentity)
|
|
x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
|
|
valuelen = 0;
|
|
break;
|
|
}
|
|
}
|
|
} else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
|
|
if (valuelen < sizeof(x->data) - 1) {
|
|
x->data[valuelen++] = c;
|
|
} else {
|
|
x->data[valuelen] = '\0';
|
|
if (x->xmlattr)
|
|
x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
|
|
x->data[0] = c;
|
|
valuelen = 1;
|
|
}
|
|
}
|
|
if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
|
|
x->data[valuelen] = '\0';
|
|
if (x->xmlattr)
|
|
x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
|
|
if (x->xmlattrend)
|
|
x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
|
|
break;
|
|
}
|
|
}
|
|
namelen = endname = valuestart = 0;
|
|
} else if (namelen < sizeof(x->name) - 1) {
|
|
x->name[namelen++] = c;
|
|
}
|
|
if (c == '>') {
|
|
break;
|
|
} else if (c == '/') {
|
|
x->isshorttag = 1;
|
|
x->name[0] = '\0';
|
|
namelen = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
|
|
xml_parsecomment(XMLParser *x)
|
|
{
|
|
int c, i = 0;
|
|
|
|
while ((c = GETNEXT()) != EOF) {
|
|
if (c == '-') {
|
|
if (++i > 2)
|
|
i = 2;
|
|
continue;
|
|
} else if (c == '>' && i == 2) {
|
|
return;
|
|
} else if (i) {
|
|
i = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
|
|
xml_parsecdata(XMLParser *x)
|
|
{
|
|
size_t datalen = 0, i = 0;
|
|
int c;
|
|
|
|
while ((c = GETNEXT()) != EOF) {
|
|
if (c == ']' || c == '>') {
|
|
if (x->xmlcdata && datalen) {
|
|
x->data[datalen] = '\0';
|
|
x->xmlcdata(x, x->data, datalen);
|
|
datalen = 0;
|
|
}
|
|
}
|
|
|
|
if (c == ']') {
|
|
if (++i > 2) {
|
|
if (x->xmlcdata)
|
|
for (; i > 2; i--)
|
|
x->xmlcdata(x, "]", 1);
|
|
i = 2;
|
|
}
|
|
continue;
|
|
} else if (c == '>' && i == 2) {
|
|
return;
|
|
} else if (i) {
|
|
if (x->xmlcdata)
|
|
for (; i > 0; i--)
|
|
x->xmlcdata(x, "]", 1);
|
|
i = 0;
|
|
}
|
|
|
|
if (datalen < sizeof(x->data) - 1) {
|
|
x->data[datalen++] = c;
|
|
} else {
|
|
x->data[datalen] = '\0';
|
|
if (x->xmlcdata)
|
|
x->xmlcdata(x, x->data, datalen);
|
|
x->data[0] = c;
|
|
datalen = 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Encode code point r as UTF-8 into s.
 * Writes 1 to 4 bytes and returns the number of bytes written; s is not
 * NUL-terminated by this function. Returns 0 and writes nothing for r == 0.
 * The value is not validated here: the caller must make sure r is a valid
 * code point and that s has room for 4 bytes. */
static int
codepointtoutf8(long r, char *s)
{
	if (r == 0) {
		return 0; /* NUL byte */
	} else if (r <= 0x7F) {
		/* 1 byte: 0aaaaaaa */
		s[0] = r;
		return 1;
	} else if (r <= 0x07FF) {
		/* 2 bytes: 00000aaa aabbbbbb */
		s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
		s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
		return 2;
	} else if (r <= 0xFFFF) {
		/* 3 bytes: aaaabbbb bbcccccc */
		s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
		s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
		s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
		return 3;
	} else {
		/* 4 bytes: 000aaabb bbbbcccc ccdddddd */
		s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
		s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
		s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
		s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
		return 4;
	}
}
|
|
|
|
/* Convert a named entity to its character.
 * e is the entity without the leading '&' but including the trailing ';'
 * (for example "amp;"). Only the five names in the table below are known.
 * On success the character and a NUL terminator are written to buf and 1
 * is returned. Returns -1 when the name is unknown or bufsiz is less
 * than 2. */
static int
namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
	static const struct {
		const char *entity;
		int c;
	} entities[] = {
		{ "amp;",  '&'  },
		{ "lt;",   '<'  },
		{ "gt;",   '>'  },
		{ "apos;", '\'' },
		{ "quot;", '"'  },
	};
	size_t i;

	/* buffer is too small */
	if (bufsiz < 2)
		return -1;

	for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
		if (!strcmp(e, entities[i].entity)) {
			buf[0] = entities[i].c;
			buf[1] = '\0';
			return 1;
		}
	}
	return -1;
}
|
|
|
|
/* Convert a numeric character reference to a UTF-8 string.
 * e is the reference without the leading "&#" but including the trailing
 * ';': decimal digits ("8364;") or 'x' followed by hexadecimal digits
 * ("x20AC;").
 * On success the NUL-terminated UTF-8 sequence is written to buf and its
 * byte length is returned (0 for code point 0). Returns -1 when bufsiz is
 * less than 5, when the reference is not well-formed, or when the value is
 * above 0x10ffff or in the surrogate range 0xd800-0xdfff.
 * NOTE: errno is reset to 0 and may be set by strtol(). */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	const char *digits;
	char *end;
	size_t n;
	long l;
	int base, len;

	/* buffer is too small: up to 4 bytes of UTF-8 plus the terminator */
	if (bufsiz < 5)
		return -1;

	/* hex (16) or decimal (10) */
	if (*e == 'x') {
		e++;
		base = 16;
		digits = "0123456789abcdefABCDEF";
	} else {
		base = 10;
		digits = "0123456789";
	}

	/* strtol() on its own skips leading whitespace and accepts a sign
	 * and, for base 16, a "0x" prefix. None of those are part of a
	 * character reference, so require one or more plain digits that run
	 * right up to the ';'. */
	n = strspn(e, digits);
	if (!n || e[n] != ';')
		return -1;

	errno = 0;
	l = strtol(e, &end, base);
	/* out of range for long, not fully consumed or invalid code point */
	if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
	    (l >= 0xd800 && l <= 0xdfff))
		return -1;
	len = codepointtoutf8(l, buf);
	buf[len] = '\0';

	return len;
}
|
|
|
|
/* Convert a named or numeric entity string to a buffer string.
 * e must be a NUL-terminated string starting with '&', for example "&amp;",
 * "&#8364;" or "&#x20AC;". A '#' directly after the '&' selects the
 * numeric form, anything else is looked up as a named entity.
 * Returns the byte length of the string written to buf, or -1 on failure
 * (e does not start with '&', the entity is unknown or malformed, or buf
 * is too small: the numeric form needs bufsiz >= 5, the named form
 * bufsiz >= 2). */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	/* doesn't start with & */
	if (e[0] != '&')
		return -1;
	/* numeric entity */
	if (e[1] == '#')
		return numericentitytostr(e + 2, buf, bufsiz);
	else /* named entity */
		return namedentitytostr(e + 1, buf, bufsiz);
}
|
|
|
|
void
|
|
xml_parse(XMLParser *x)
|
|
{
|
|
size_t datalen, tagdatalen;
|
|
int c, isend;
|
|
|
|
while ((c = GETNEXT()) != EOF && c != '<')
|
|
; /* skip until < */
|
|
|
|
while (c != EOF) {
|
|
if (c == '<') { /* parse tag */
|
|
if ((c = GETNEXT()) == EOF)
|
|
return;
|
|
|
|
if (c == '!') { /* CDATA and comments */
|
|
for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
|
|
/* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
|
|
if (tagdatalen <= sizeof("[CDATA[") - 1)
|
|
x->data[tagdatalen++] = c;
|
|
if (c == '>')
|
|
break;
|
|
else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
|
|
(x->data[0] == '-')) {
|
|
xml_parsecomment(x);
|
|
break;
|
|
} else if (c == '[') {
|
|
if (tagdatalen == sizeof("[CDATA[") - 1 &&
|
|
!strncmp(x->data, "[CDATA[", tagdatalen)) {
|
|
xml_parsecdata(x);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
/* normal tag (open, short open, close), processing instruction. */
|
|
x->tag[0] = c;
|
|
x->taglen = 1;
|
|
x->isshorttag = isend = 0;
|
|
|
|
/* treat processing instruction as shorttag, don't strip "?" prefix. */
|
|
if (c == '?') {
|
|
x->isshorttag = 1;
|
|
} else if (c == '/') {
|
|
if ((c = GETNEXT()) == EOF)
|
|
return;
|
|
x->tag[0] = c;
|
|
isend = 1;
|
|
}
|
|
|
|
while ((c = GETNEXT()) != EOF) {
|
|
if (c == '/')
|
|
x->isshorttag = 1; /* short tag */
|
|
else if (c == '>' || ISSPACE(c)) {
|
|
x->tag[x->taglen] = '\0';
|
|
if (isend) { /* end tag, starts with </ */
|
|
if (x->xmltagend)
|
|
x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
|
|
x->tag[0] = '\0';
|
|
x->taglen = 0;
|
|
} else {
|
|
/* start tag */
|
|
if (x->xmltagstart)
|
|
x->xmltagstart(x, x->tag, x->taglen);
|
|
if (ISSPACE(c))
|
|
xml_parseattrs(x);
|
|
if (x->xmltagstartparsed)
|
|
x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
|
|
}
|
|
/* call tagend for shortform or processing instruction */
|
|
if (x->isshorttag) {
|
|
if (x->xmltagend)
|
|
x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
|
|
x->tag[0] = '\0';
|
|
x->taglen = 0;
|
|
}
|
|
break;
|
|
} else if (x->taglen < sizeof(x->tag) - 1)
|
|
x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
|
|
}
|
|
}
|
|
} else {
|
|
/* parse tag data */
|
|
datalen = 0;
|
|
while ((c = GETNEXT()) != EOF) {
|
|
if (c == '&') {
|
|
if (datalen) {
|
|
x->data[datalen] = '\0';
|
|
if (x->xmldata)
|
|
x->xmldata(x, x->data, datalen);
|
|
}
|
|
x->data[0] = c;
|
|
datalen = 1;
|
|
while ((c = GETNEXT()) != EOF) {
|
|
if (c == '<')
|
|
break;
|
|
if (datalen < sizeof(x->data) - 1)
|
|
x->data[datalen++] = c;
|
|
else {
|
|
/* entity too long for buffer, handle as normal data */
|
|
x->data[datalen] = '\0';
|
|
if (x->xmldata)
|
|
x->xmldata(x, x->data, datalen);
|
|
x->data[0] = c;
|
|
datalen = 1;
|
|
break;
|
|
}
|
|
if (c == ';') {
|
|
x->data[datalen] = '\0';
|
|
if (x->xmldataentity)
|
|
x->xmldataentity(x, x->data, datalen);
|
|
datalen = 0;
|
|
break;
|
|
}
|
|
}
|
|
} else if (c != '<') {
|
|
if (datalen < sizeof(x->data) - 1) {
|
|
x->data[datalen++] = c;
|
|
} else {
|
|
x->data[datalen] = '\0';
|
|
if (x->xmldata)
|
|
x->xmldata(x, x->data, datalen);
|
|
x->data[0] = c;
|
|
datalen = 1;
|
|
}
|
|
}
|
|
if (c == '<') {
|
|
x->data[datalen] = '\0';
|
|
if (x->xmldata && datalen)
|
|
x->xmldata(x, x->data, datalen);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|