xml: separate reader context from parser

also:
- rename xmlparser_ prefix to xml_.
- make xml_parse public, this allows a custom reader like a direct mmap,
  see: XMLParser.getnext and (optionally) XMLParser.getnext_data.
- improve the README text.
This commit is contained in:
Hiltjo Posthuma 2015-08-14 13:47:19 +02:00
parent 5c724b8b1a
commit 582131202a
7 changed files with 100 additions and 68 deletions

View File

@ -5,7 +5,7 @@ XML parser
Dependencies
------------
- C compiler (C99)
- C compiler (C99).
Features
@ -25,19 +25,21 @@ Supports
- Short attributes without an explicitly set value (<input type="checkbox" checked />).
- Comments
- CDATA sections.
- Helper function (xml_entitytostr) to convert XML 1.0 / HTML 2.0 named entities
and numeric entities to UTF-8.
- Reading XML from a fd, string buffer or implement a custom reader:
see: XMLParser.getnext and XMLParser.getnext_data.
Caveats
-------
- Internally static buffers are used, callbacks like XMLParser.xmldata are
- Internally fixed-size buffers are used, callbacks like XMLParser.xmldata are
called multiple times for the same tag if the data size is bigger than the
internal buffer size (sizeof(XMLParser.data)). To differentiate between new
calls for data you can use the xml*start and xml*end handlers.
- There is no table of (HTML / XML) named entities; you should handle these with
the XMLParser.xmldataentity handler yourself.
- The XML is not checked for errors so it will continue parsing invalid XML
data, this is by design.
- The XML is not checked for errors, so it will continue parsing XML data even
if it is invalid; this is by design.
Files used
@ -51,6 +53,20 @@ Interface / API
Should be trivial, see xml.c and xml.h and the examples below.
The most minimal implementation to read and parse from fd 0 (stdin) is:
#include "xml.h"
static XMLParser x;
int
main(void)
{
xml_parse_fd(&x, 0); /* xml_parse_string(&x, "<sup />"); */
return 0;
}
Examples
--------
@ -60,5 +76,6 @@ sfeed_opml_import.c or sfeed_web.c or sfeed_xmlenc.c
License
-------
See LICENSE file.

View File

@ -737,7 +737,7 @@ main(int argc, char *argv[])
parser.xmltagstart = xml_handler_start_el;
parser.xmltagstartparsed = xml_handler_start_el_parsed;
xmlparser_parse_fd(&parser, 0);
xml_parse_fd(&parser, 0);
return 0;
}

View File

@ -87,7 +87,7 @@ main(void)
"# list of feeds to fetch:\n"
"feeds() {\n"
" # feed <name> <feedurl> [basesiteurl] [encoding]\n", stdout);
xmlparser_parse_fd(&parser, 0);
xml_parse_fd(&parser, 0);
fputs("}\n", stdout);
return 0;

View File

@ -94,7 +94,7 @@ main(int argc, char *argv[])
parser.xmltagstart = xmltagstart;
parser.xmltagstartparsed = xmltagstartparsed;
xmlparser_parse_fd(&parser, 0);
xml_parse_fd(&parser, 0);
return found > 0 ? 0: 1;
}

View File

@ -60,7 +60,7 @@ main(void)
parser.xmltagstart = xmltagstart;
parser.xmltagend = xmltagend;
xmlparser_parse_fd(&parser, 0);
xml_parse_fd(&parser, 0);
return 1;
}

110
xml.c
View File

@ -8,54 +8,75 @@
#include "xml.h"
/*
 * Reader context for xml_getnext_fd(): a read buffer plus the bookkeeping
 * needed to hand out one byte at a time from a file descriptor.
 */
struct xml_context_fd {
	/*
	 * Bytes obtained from the last read(). This must be unsigned char:
	 * xml_getnext_fd() returns (int)buf[offset], and with a plain (possibly
	 * signed) char a 0xFF byte would convert to -1 and be mistaken for EOF,
	 * while other bytes >= 0x80 would reach isspace() as negative values.
	 */
	unsigned char buf[BUFSIZ];
	/* errno saved from a failed read(); once non-zero the reader returns EOF */
	int readerrno;
	/* file descriptor passed to read() */
	int fd;
	/* number of valid bytes in buf, as returned by the last read() */
	size_t nread;
	/* index in buf of the next byte to return */
	size_t offset;
};
/* Reader context for xml_getnext_string(): the position in the input string. */
struct xml_context_string {
/* Next character to return; xml_getnext_string() advances this pointer
 * and returns EOF once it points at the terminating NUL byte. */
const char *str;
};
static int
xmlparser_string_getnext(XMLParser *x)
xml_getnext_stdin(XMLParser *x)
{
if (!*(x->str))
return getchar();
}
static int
xml_getnext_string(XMLParser *x)
{
struct xml_context_string *d = (struct xml_context_string *)x->getnext_data;
if (!*(d->str))
return EOF;
return (int)*(x->str++);
return (int)*(d->str++);
}
static int /* like getc(), but do some smart buffering */
xmlparser_fd_getnext(XMLParser *x)
xml_getnext_fd(XMLParser *x)
{
struct xml_context_fd *d = (struct xml_context_fd *)x->getnext_data;
ssize_t r;
/* previous read error was set */
if (x->readerrno)
if (d->readerrno)
return EOF;
if (x->readoffset >= x->readlastbytes) {
x->readoffset = 0;
if (d->offset >= d->nread) {
d->offset = 0;
again:
r = read(x->fd, x->readbuf, sizeof(x->readbuf));
r = read(d->fd, d->buf, sizeof(d->buf));
if (r == -1) {
if (errno == EINTR)
goto again;
x->readerrno = errno;
x->readlastbytes = 0;
d->readerrno = errno;
d->nread = 0;
return EOF;
} else if (!r) {
return EOF;
}
x->readlastbytes = r;
d->nread = r;
}
return (int)x->readbuf[x->readoffset++];
return (int)d->buf[d->offset++];
}
static int
xmlparser_getnext(XMLParser *x)
xml_getnext(XMLParser *x)
{
return x->getnext(x);
}
static void
xmlparser_parseattrs(XMLParser *x)
xml_parseattrs(XMLParser *x)
{
size_t namelen = 0, valuelen;
int c, endsep, endname = 0;
while ((c = xmlparser_getnext(x)) != EOF) {
while ((c = xml_getnext(x)) != EOF) {
if (isspace(c)) { /* TODO: simplify endname ? */
if (namelen)
endname = 1;
@ -82,7 +103,7 @@ xmlparser_parseattrs(XMLParser *x)
endsep = c; /* c is end separator */
if (x->xmlattrstart)
x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
for (valuelen = 0; (c = xmlparser_getnext(x)) != EOF;) {
for (valuelen = 0; (c = xml_getnext(x)) != EOF;) {
if (c == '&') { /* entities */
x->data[valuelen] = '\0';
/* call data function with data before entity if there is data */
@ -90,7 +111,7 @@ xmlparser_parseattrs(XMLParser *x)
x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
x->data[0] = c;
valuelen = 1;
while ((c = xmlparser_getnext(x)) != EOF) {
while ((c = xml_getnext(x)) != EOF) {
if (c == endsep)
break;
if (valuelen < sizeof(x->data) - 1)
@ -147,7 +168,7 @@ xmlparser_parseattrs(XMLParser *x)
}
static void
xmlparser_parsecomment(XMLParser *x)
xml_parsecomment(XMLParser *x)
{
static const char *end = "-->";
size_t datalen = 0, i = 0;
@ -156,7 +177,7 @@ xmlparser_parsecomment(XMLParser *x)
if (x->xmlcommentstart)
x->xmlcommentstart(x);
while ((c = xmlparser_getnext(x)) != EOF) {
while ((c = xml_getnext(x)) != EOF) {
if (c == end[i]) {
if (end[++i] == '\0') { /* end */
x->data[datalen] = '\0';
@ -191,7 +212,7 @@ xmlparser_parsecomment(XMLParser *x)
}
static void
xmlparser_parsecdata(XMLParser *x)
xml_parsecdata(XMLParser *x)
{
static const char *end = "]]>";
size_t datalen = 0, i = 0;
@ -200,7 +221,7 @@ xmlparser_parsecdata(XMLParser *x)
if (x->xmlcdatastart)
x->xmlcdatastart(x);
while ((c = xmlparser_getnext(x)) != EOF) {
while ((c = xml_getnext(x)) != EOF) {
if (c == end[i]) {
if (end[++i] == '\0') { /* end */
x->data[datalen] = '\0';
@ -351,44 +372,44 @@ xml_entitytostr(const char *e, char *buf, size_t bufsiz)
return xml_numericentitytostr(e, buf, bufsiz);
}
static void
xmlparser_parse(XMLParser *x)
void
xml_parse(XMLParser *x)
{
int c, ispi;
size_t datalen, tagdatalen, taglen;
while ((c = xmlparser_getnext(x)) != EOF && c != '<')
while ((c = xml_getnext(x)) != EOF && c != '<')
; /* skip until < */
while (c != EOF) {
if (c == '<') { /* parse tag */
if ((c = xmlparser_getnext(x)) == EOF)
if ((c = xml_getnext(x)) == EOF)
return;
x->tag[0] = '\0';
x->taglen = 0;
if (c == '!') { /* cdata and comments */
for (tagdatalen = 0; (c = xmlparser_getnext(x)) != EOF;) {
for (tagdatalen = 0; (c = xml_getnext(x)) != EOF;) {
if (tagdatalen <= sizeof("[CDATA[") - 1) /* if (d < sizeof(x->data)) */
x->data[tagdatalen++] = c; /* TODO: prevent overflow */
if (c == '>')
break;
else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
(x->data[0] == '-')) { /* comment */
xmlparser_parsecomment(x);
xml_parsecomment(x);
break;
} else if (c == '[') {
if (tagdatalen == sizeof("[CDATA[") - 1 &&
x->data[1] == 'C' && x->data[2] == 'D' &&
x->data[3] == 'A' && x->data[4] == 'T' &&
x->data[5] == 'A' && x->data[6] == '[') { /* CDATA */
xmlparser_parsecdata(x);
xml_parsecdata(x);
break;
}
}
}
} else { /* normal tag (open, short open, close), processing instruction. */
if (isspace(c))
while ((c = xmlparser_getnext(x)) != EOF && isspace(c))
while ((c = xml_getnext(x)) != EOF && isspace(c))
;
if (c == EOF)
return;
@ -396,7 +417,7 @@ xmlparser_parse(XMLParser *x)
ispi = (c == '?') ? 1 : 0;
x->isshorttag = ispi;
taglen = 1;
while ((c = xmlparser_getnext(x)) != EOF) {
while ((c = xml_getnext(x)) != EOF) {
if (c == '/') /* TODO: simplify short tag? */
x->isshorttag = 1; /* short tag */
else if (c == '>' || isspace(c)) {
@ -411,7 +432,7 @@ xmlparser_parse(XMLParser *x)
if (x->xmltagstart)
x->xmltagstart(x, x->tag, x->taglen);
if (isspace(c))
xmlparser_parseattrs(x);
xml_parseattrs(x);
if (x->xmltagstartparsed)
x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
}
@ -428,7 +449,7 @@ xmlparser_parse(XMLParser *x)
datalen = 0;
if (x->xmldatastart)
x->xmldatastart(x);
while ((c = xmlparser_getnext(x)) != EOF) {
while ((c = xml_getnext(x)) != EOF) {
if (c == '&') {
if (datalen) {
x->data[datalen] = '\0';
@ -437,7 +458,7 @@ xmlparser_parse(XMLParser *x)
}
x->data[0] = c;
datalen = 1;
while ((c = xmlparser_getnext(x)) != EOF) {
while ((c = xml_getnext(x)) != EOF) {
if (c == '<')
break;
if (datalen < sizeof(x->data) - 1)
@ -477,17 +498,24 @@ xmlparser_parse(XMLParser *x)
}
void
xmlparser_parse_string(XMLParser *x, const char *s)
xml_parse_string(XMLParser *x, const char *s)
{
x->str = s;
x->getnext = xmlparser_string_getnext;
xmlparser_parse(x);
struct xml_context_string ctx = { .str = s };
x->getnext = xml_getnext_string;
x->getnext_data = (void *)&ctx;
xml_parse(x);
}
void
xmlparser_parse_fd(XMLParser *x, int fd)
xml_parse_fd(XMLParser *x, int fd)
{
x->fd = fd;
x->getnext = xmlparser_fd_getnext;
xmlparser_parse(x);
struct xml_context_fd ctx;
memset(&ctx, 0, sizeof(ctx));
ctx.fd = fd;
x->getnext = xml_getnext_fd;
x->getnext_data = (void *)&ctx;
xml_parse(x);
}

21
xml.h
View File

@ -24,16 +24,7 @@ typedef struct xmlparser {
size_t, int);
int (*getnext)(struct xmlparser *);
/* for use with xmlparser_parse_fd */
/* errno set from read(). */
int readerrno;
int fd;
/* for use with "read" from string: xmlparser_parse_string */
const char *str;
/* private; internal state */
void *getnext_data; /* custom data for getnext */
/* current tag */
char tag[1024];
@ -44,11 +35,6 @@ typedef struct xmlparser {
char name[256];
/* data buffer used for tag data, cdata and attribute data */
char data[BUFSIZ];
size_t readoffset;
size_t readlastbytes;
/* read buffer used by xmlparser_parse_fd */
unsigned char readbuf[BUFSIZ];
} XMLParser;
int xml_codepointtoutf8(uint32_t, uint32_t *);
@ -56,5 +42,6 @@ ssize_t xml_entitytostr(const char *, char *, size_t);
ssize_t xml_namedentitytostr(const char *, char *, size_t);
/* Numeric entity conversion; called by xml_entitytostr() with (entity, buf, bufsiz). */
ssize_t xml_numericentitytostr(const char *, char *, size_t);
void xmlparser_parse_fd(XMLParser *, int);
void xmlparser_parse_string(XMLParser *, const char *);
void xml_parse(XMLParser *);
void xml_parse_fd(XMLParser *, int);
void xml_parse_string(XMLParser *, const char *);