xml: separate reader context from parser
also: - rename xmlparser_ prefix to xml_. - make xml_parse public, this allows a custom reader like a direct mmap, see: XMLParser.getnext and (optionally) XMLParser.getnext_data. - improve the README text.
This commit is contained in:
parent
5c724b8b1a
commit
582131202a
29
README.xml
29
README.xml
|
@ -5,7 +5,7 @@ XML parser
|
|||
Dependencies
|
||||
------------
|
||||
|
||||
- C compiler (C99)
|
||||
- C compiler (C99).
|
||||
|
||||
|
||||
Features
|
||||
|
@ -25,19 +25,21 @@ Supports
|
|||
- Short attributes without an explicitly set value (<input type="checkbox" checked />).
|
||||
- Comments
|
||||
- CDATA sections.
|
||||
- Helper function (xml_entitytostr) to convert XML 1.0 / HTML 2.0 named entities
|
||||
and numeric entities to UTF-8.
|
||||
- Reading XML from a fd, string buffer or implement a custom reader:
|
||||
see: XMLParser.getnext and XMLParser.getnext_data.
|
||||
|
||||
|
||||
Caveats
|
||||
-------
|
||||
|
||||
- Internally static buffers are used, callbacks like XMLParser.xmldata are
|
||||
- Internally fixed-size buffers are used, callbacks like XMLParser.xmldata are
|
||||
called multiple times for the same tag if the data size is bigger than the
|
||||
internal buffer size (sizeof(XMLParser.data)). To differentiate between new
|
||||
calls for data you can use the xml*start and xml*end handlers.
|
||||
- There is no table of (HTML / XML) named entities you should handle this with
|
||||
the XMLParser.xmldataentity handler yourself.
|
||||
- The XML is not checked for errors so it will continue parsing invalid XML
|
||||
data, this is by design.
|
||||
- The XML is not checked for errors so it will continue parsing XML data, this
|
||||
is by design.
|
||||
|
||||
|
||||
Files used
|
||||
|
@ -51,6 +53,20 @@ Interface / API
|
|||
|
||||
Should be trivial, see xml.c and xml.h and the examples below.
|
||||
|
||||
The most minimal implementation to read and parse from fd 0 (stdin) is:
|
||||
|
||||
#include "xml.h"
|
||||
|
||||
static XMLParser x;
|
||||
|
||||
int
|
||||
main(void)
|
||||
{
|
||||
xml_parse_fd(&x, 0); /* xml_parse_string(&x, "<sup />"); */
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
@ -60,5 +76,6 @@ sfeed_opml_import.c or sfeed_web.c or sfeed_xmlenc.c
|
|||
|
||||
License
|
||||
-------
|
||||
|
||||
See LICENSE file.
|
||||
|
||||
|
|
2
sfeed.c
2
sfeed.c
|
@ -737,7 +737,7 @@ main(int argc, char *argv[])
|
|||
parser.xmltagstart = xml_handler_start_el;
|
||||
parser.xmltagstartparsed = xml_handler_start_el_parsed;
|
||||
|
||||
xmlparser_parse_fd(&parser, 0);
|
||||
xml_parse_fd(&parser, 0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -87,7 +87,7 @@ main(void)
|
|||
"# list of feeds to fetch:\n"
|
||||
"feeds() {\n"
|
||||
" # feed <name> <feedurl> [basesiteurl] [encoding]\n", stdout);
|
||||
xmlparser_parse_fd(&parser, 0);
|
||||
xml_parse_fd(&parser, 0);
|
||||
fputs("}\n", stdout);
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -94,7 +94,7 @@ main(int argc, char *argv[])
|
|||
parser.xmltagstart = xmltagstart;
|
||||
parser.xmltagstartparsed = xmltagstartparsed;
|
||||
|
||||
xmlparser_parse_fd(&parser, 0);
|
||||
xml_parse_fd(&parser, 0);
|
||||
|
||||
return found > 0 ? 0: 1;
|
||||
}
|
||||
|
|
|
@ -60,7 +60,7 @@ main(void)
|
|||
parser.xmltagstart = xmltagstart;
|
||||
parser.xmltagend = xmltagend;
|
||||
|
||||
xmlparser_parse_fd(&parser, 0);
|
||||
xml_parse_fd(&parser, 0);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
|
110
xml.c
110
xml.c
|
@ -8,54 +8,75 @@
|
|||
|
||||
#include "xml.h"
|
||||
|
||||
struct xml_context_fd {
|
||||
char buf[BUFSIZ];
|
||||
int readerrno;
|
||||
int fd;
|
||||
size_t nread;
|
||||
size_t offset;
|
||||
};
|
||||
|
||||
struct xml_context_string {
|
||||
const char *str;
|
||||
};
|
||||
|
||||
static int
|
||||
xmlparser_string_getnext(XMLParser *x)
|
||||
xml_getnext_stdin(XMLParser *x)
|
||||
{
|
||||
if (!*(x->str))
|
||||
return getchar();
|
||||
}
|
||||
|
||||
static int
|
||||
xml_getnext_string(XMLParser *x)
|
||||
{
|
||||
struct xml_context_string *d = (struct xml_context_string *)x->getnext_data;
|
||||
|
||||
if (!*(d->str))
|
||||
return EOF;
|
||||
return (int)*(x->str++);
|
||||
return (int)*(d->str++);
|
||||
}
|
||||
|
||||
static int /* like getc(), but do some smart buffering */
|
||||
xmlparser_fd_getnext(XMLParser *x)
|
||||
xml_getnext_fd(XMLParser *x)
|
||||
{
|
||||
struct xml_context_fd *d = (struct xml_context_fd *)x->getnext_data;
|
||||
ssize_t r;
|
||||
|
||||
/* previous read error was set */
|
||||
if (x->readerrno)
|
||||
if (d->readerrno)
|
||||
return EOF;
|
||||
|
||||
if (x->readoffset >= x->readlastbytes) {
|
||||
x->readoffset = 0;
|
||||
if (d->offset >= d->nread) {
|
||||
d->offset = 0;
|
||||
again:
|
||||
r = read(x->fd, x->readbuf, sizeof(x->readbuf));
|
||||
r = read(d->fd, d->buf, sizeof(d->buf));
|
||||
if (r == -1) {
|
||||
if (errno == EINTR)
|
||||
goto again;
|
||||
x->readerrno = errno;
|
||||
x->readlastbytes = 0;
|
||||
d->readerrno = errno;
|
||||
d->nread = 0;
|
||||
return EOF;
|
||||
} else if (!r) {
|
||||
return EOF;
|
||||
}
|
||||
x->readlastbytes = r;
|
||||
d->nread = r;
|
||||
}
|
||||
return (int)x->readbuf[x->readoffset++];
|
||||
return (int)d->buf[d->offset++];
|
||||
}
|
||||
|
||||
static int
|
||||
xmlparser_getnext(XMLParser *x)
|
||||
xml_getnext(XMLParser *x)
|
||||
{
|
||||
return x->getnext(x);
|
||||
}
|
||||
|
||||
static void
|
||||
xmlparser_parseattrs(XMLParser *x)
|
||||
xml_parseattrs(XMLParser *x)
|
||||
{
|
||||
size_t namelen = 0, valuelen;
|
||||
int c, endsep, endname = 0;
|
||||
|
||||
while ((c = xmlparser_getnext(x)) != EOF) {
|
||||
while ((c = xml_getnext(x)) != EOF) {
|
||||
if (isspace(c)) { /* TODO: simplify endname ? */
|
||||
if (namelen)
|
||||
endname = 1;
|
||||
|
@ -82,7 +103,7 @@ xmlparser_parseattrs(XMLParser *x)
|
|||
endsep = c; /* c is end separator */
|
||||
if (x->xmlattrstart)
|
||||
x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
|
||||
for (valuelen = 0; (c = xmlparser_getnext(x)) != EOF;) {
|
||||
for (valuelen = 0; (c = xml_getnext(x)) != EOF;) {
|
||||
if (c == '&') { /* entities */
|
||||
x->data[valuelen] = '\0';
|
||||
/* call data function with data before entity if there is data */
|
||||
|
@ -90,7 +111,7 @@ xmlparser_parseattrs(XMLParser *x)
|
|||
x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
|
||||
x->data[0] = c;
|
||||
valuelen = 1;
|
||||
while ((c = xmlparser_getnext(x)) != EOF) {
|
||||
while ((c = xml_getnext(x)) != EOF) {
|
||||
if (c == endsep)
|
||||
break;
|
||||
if (valuelen < sizeof(x->data) - 1)
|
||||
|
@ -147,7 +168,7 @@ xmlparser_parseattrs(XMLParser *x)
|
|||
}
|
||||
|
||||
static void
|
||||
xmlparser_parsecomment(XMLParser *x)
|
||||
xml_parsecomment(XMLParser *x)
|
||||
{
|
||||
static const char *end = "-->";
|
||||
size_t datalen = 0, i = 0;
|
||||
|
@ -156,7 +177,7 @@ xmlparser_parsecomment(XMLParser *x)
|
|||
|
||||
if (x->xmlcommentstart)
|
||||
x->xmlcommentstart(x);
|
||||
while ((c = xmlparser_getnext(x)) != EOF) {
|
||||
while ((c = xml_getnext(x)) != EOF) {
|
||||
if (c == end[i]) {
|
||||
if (end[++i] == '\0') { /* end */
|
||||
x->data[datalen] = '\0';
|
||||
|
@ -191,7 +212,7 @@ xmlparser_parsecomment(XMLParser *x)
|
|||
}
|
||||
|
||||
static void
|
||||
xmlparser_parsecdata(XMLParser *x)
|
||||
xml_parsecdata(XMLParser *x)
|
||||
{
|
||||
static const char *end = "]]>";
|
||||
size_t datalen = 0, i = 0;
|
||||
|
@ -200,7 +221,7 @@ xmlparser_parsecdata(XMLParser *x)
|
|||
|
||||
if (x->xmlcdatastart)
|
||||
x->xmlcdatastart(x);
|
||||
while ((c = xmlparser_getnext(x)) != EOF) {
|
||||
while ((c = xml_getnext(x)) != EOF) {
|
||||
if (c == end[i]) {
|
||||
if (end[++i] == '\0') { /* end */
|
||||
x->data[datalen] = '\0';
|
||||
|
@ -351,44 +372,44 @@ xml_entitytostr(const char *e, char *buf, size_t bufsiz)
|
|||
return xml_numericentitytostr(e, buf, bufsiz);
|
||||
}
|
||||
|
||||
static void
|
||||
xmlparser_parse(XMLParser *x)
|
||||
void
|
||||
xml_parse(XMLParser *x)
|
||||
{
|
||||
int c, ispi;
|
||||
size_t datalen, tagdatalen, taglen;
|
||||
|
||||
while ((c = xmlparser_getnext(x)) != EOF && c != '<')
|
||||
while ((c = xml_getnext(x)) != EOF && c != '<')
|
||||
; /* skip until < */
|
||||
|
||||
while (c != EOF) {
|
||||
if (c == '<') { /* parse tag */
|
||||
if ((c = xmlparser_getnext(x)) == EOF)
|
||||
if ((c = xml_getnext(x)) == EOF)
|
||||
return;
|
||||
x->tag[0] = '\0';
|
||||
x->taglen = 0;
|
||||
if (c == '!') { /* cdata and comments */
|
||||
for (tagdatalen = 0; (c = xmlparser_getnext(x)) != EOF;) {
|
||||
for (tagdatalen = 0; (c = xml_getnext(x)) != EOF;) {
|
||||
if (tagdatalen <= sizeof("[CDATA[") - 1) /* if (d < sizeof(x->data)) */
|
||||
x->data[tagdatalen++] = c; /* TODO: prevent overflow */
|
||||
if (c == '>')
|
||||
break;
|
||||
else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
|
||||
(x->data[0] == '-')) { /* comment */
|
||||
xmlparser_parsecomment(x);
|
||||
xml_parsecomment(x);
|
||||
break;
|
||||
} else if (c == '[') {
|
||||
if (tagdatalen == sizeof("[CDATA[") - 1 &&
|
||||
x->data[1] == 'C' && x->data[2] == 'D' &&
|
||||
x->data[3] == 'A' && x->data[4] == 'T' &&
|
||||
x->data[5] == 'A' && x->data[6] == '[') { /* CDATA */
|
||||
xmlparser_parsecdata(x);
|
||||
xml_parsecdata(x);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else { /* normal tag (open, short open, close), processing instruction. */
|
||||
if (isspace(c))
|
||||
while ((c = xmlparser_getnext(x)) != EOF && isspace(c))
|
||||
while ((c = xml_getnext(x)) != EOF && isspace(c))
|
||||
;
|
||||
if (c == EOF)
|
||||
return;
|
||||
|
@ -396,7 +417,7 @@ xmlparser_parse(XMLParser *x)
|
|||
ispi = (c == '?') ? 1 : 0;
|
||||
x->isshorttag = ispi;
|
||||
taglen = 1;
|
||||
while ((c = xmlparser_getnext(x)) != EOF) {
|
||||
while ((c = xml_getnext(x)) != EOF) {
|
||||
if (c == '/') /* TODO: simplify short tag? */
|
||||
x->isshorttag = 1; /* short tag */
|
||||
else if (c == '>' || isspace(c)) {
|
||||
|
@ -411,7 +432,7 @@ xmlparser_parse(XMLParser *x)
|
|||
if (x->xmltagstart)
|
||||
x->xmltagstart(x, x->tag, x->taglen);
|
||||
if (isspace(c))
|
||||
xmlparser_parseattrs(x);
|
||||
xml_parseattrs(x);
|
||||
if (x->xmltagstartparsed)
|
||||
x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
|
||||
}
|
||||
|
@ -428,7 +449,7 @@ xmlparser_parse(XMLParser *x)
|
|||
datalen = 0;
|
||||
if (x->xmldatastart)
|
||||
x->xmldatastart(x);
|
||||
while ((c = xmlparser_getnext(x)) != EOF) {
|
||||
while ((c = xml_getnext(x)) != EOF) {
|
||||
if (c == '&') {
|
||||
if (datalen) {
|
||||
x->data[datalen] = '\0';
|
||||
|
@ -437,7 +458,7 @@ xmlparser_parse(XMLParser *x)
|
|||
}
|
||||
x->data[0] = c;
|
||||
datalen = 1;
|
||||
while ((c = xmlparser_getnext(x)) != EOF) {
|
||||
while ((c = xml_getnext(x)) != EOF) {
|
||||
if (c == '<')
|
||||
break;
|
||||
if (datalen < sizeof(x->data) - 1)
|
||||
|
@ -477,17 +498,24 @@ xmlparser_parse(XMLParser *x)
|
|||
}
|
||||
|
||||
void
|
||||
xmlparser_parse_string(XMLParser *x, const char *s)
|
||||
xml_parse_string(XMLParser *x, const char *s)
|
||||
{
|
||||
x->str = s;
|
||||
x->getnext = xmlparser_string_getnext;
|
||||
xmlparser_parse(x);
|
||||
struct xml_context_string ctx = { .str = s };
|
||||
|
||||
x->getnext = xml_getnext_string;
|
||||
x->getnext_data = (void *)&ctx;
|
||||
xml_parse(x);
|
||||
}
|
||||
|
||||
void
|
||||
xmlparser_parse_fd(XMLParser *x, int fd)
|
||||
xml_parse_fd(XMLParser *x, int fd)
|
||||
{
|
||||
x->fd = fd;
|
||||
x->getnext = xmlparser_fd_getnext;
|
||||
xmlparser_parse(x);
|
||||
struct xml_context_fd ctx;
|
||||
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
ctx.fd = fd;
|
||||
|
||||
x->getnext = xml_getnext_fd;
|
||||
x->getnext_data = (void *)&ctx;
|
||||
xml_parse(x);
|
||||
}
|
||||
|
|
21
xml.h
21
xml.h
|
@ -24,16 +24,7 @@ typedef struct xmlparser {
|
|||
size_t, int);
|
||||
|
||||
int (*getnext)(struct xmlparser *);
|
||||
|
||||
/* for use with xmlparser_parse_fd */
|
||||
/* errno set from read(). */
|
||||
int readerrno;
|
||||
int fd;
|
||||
|
||||
/* for use with "read" from string: xmlparser_parse_string */
|
||||
const char *str;
|
||||
|
||||
/* private; internal state */
|
||||
void *getnext_data; /* custom data for getnext */
|
||||
|
||||
/* current tag */
|
||||
char tag[1024];
|
||||
|
@ -44,11 +35,6 @@ typedef struct xmlparser {
|
|||
char name[256];
|
||||
/* data buffer used for tag data, cdata and attribute data */
|
||||
char data[BUFSIZ];
|
||||
|
||||
size_t readoffset;
|
||||
size_t readlastbytes;
|
||||
/* read buffer used by xmlparser_parse_fd */
|
||||
unsigned char readbuf[BUFSIZ];
|
||||
} XMLParser;
|
||||
|
||||
int xml_codepointtoutf8(uint32_t, uint32_t *);
|
||||
|
@ -56,5 +42,6 @@ ssize_t xml_entitytostr(const char *, char *, size_t);
|
|||
ssize_t xml_namedentitytostr(const char *, char *, size_t);
|
||||
ssize_t xml_numericetitytostr(const char *, char *, size_t);
|
||||
|
||||
void xmlparser_parse_fd(XMLParser *, int);
|
||||
void xmlparser_parse_string(XMLParser *, const char *);
|
||||
void xml_parse(XMLParser *);
|
||||
void xml_parse_fd(XMLParser *, int);
|
||||
void xml_parse_string(XMLParser *, const char *);
|
||||
|
|
Loading…
Reference in New Issue