separate xml specific code into xml.c
This commit is contained in:
parent
4e1438caf5
commit
5646132f6b
140
sfeed.c
140
sfeed.c
|
@ -11,8 +11,6 @@
|
|||
#include "util.h"
|
||||
#include "xml.h"
|
||||
|
||||
/* fast isspace(c) && c != ' ' check. */
|
||||
#define ISWSNOSPACE(c) (((unsigned)c - '\t') < 5)
|
||||
#define ISINCONTENT(ctx) ((ctx).iscontent && !((ctx).iscontenttag))
|
||||
#define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag)
|
||||
/* string and size */
|
||||
|
@ -27,6 +25,7 @@ enum { ContentTypeNone = 0, ContentTypePlain = 1, ContentTypeHTML = 2 };
|
|||
static const char *contenttypes[] = { "", "plain", "html" };
|
||||
|
||||
static const int FieldSeparator = '\t'; /* output field seperator character */
|
||||
static const char *baseurl = "";
|
||||
|
||||
enum {
|
||||
TagUnknown = 0,
|
||||
|
@ -60,8 +59,8 @@ typedef struct feeditem {
|
|||
} FeedItem;
|
||||
|
||||
typedef struct feedtag {
|
||||
char *name;
|
||||
size_t namelen;
|
||||
char *name;
|
||||
size_t namelen;
|
||||
int id;
|
||||
} FeedTag;
|
||||
|
||||
|
@ -76,13 +75,10 @@ typedef struct feedcontext {
|
|||
int attrcount;
|
||||
} FeedContext;
|
||||
|
||||
static size_t codepointtoutf8(uint32_t, uint32_t *);
|
||||
static size_t entitytostr(const char *, char *, size_t);
|
||||
static int gettag(int, const char *, size_t);
|
||||
static int gettimetz(const char *, char *, size_t, int *);
|
||||
static int isattr(const char *, size_t, const char *, size_t);
|
||||
static int istag(const char *, size_t, const char *, size_t);
|
||||
static size_t namedentitytostr(const char *, char *, size_t);
|
||||
static int parsetime(const char *, char *, size_t, time_t *);
|
||||
static void printfields(void);
|
||||
static void string_append(String *, const char *, size_t);
|
||||
|
@ -106,7 +102,6 @@ static void xml_handler_start_element_parsed(XMLParser *, const char *,
|
|||
|
||||
static FeedContext ctx;
|
||||
static XMLParser parser; /* XML parser state */
|
||||
static char *append = NULL; /* append string after each output line */
|
||||
|
||||
/* unique number for parsed tag (faster comparison) */
|
||||
static int
|
||||
|
@ -163,109 +158,6 @@ gettag(int feedtype, const char *name, size_t namelen)
|
|||
return TagUnknown;
|
||||
}
|
||||
|
||||
static size_t
|
||||
codepointtoutf8(uint32_t cp, uint32_t *utf)
|
||||
{
|
||||
if(cp >= 0x10000) {
|
||||
/* 4 bytes */
|
||||
*utf = 0xf0808080 | ((cp & 0xfc0000) << 6) |
|
||||
((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
|
||||
(cp & 0x3f);
|
||||
return 4;
|
||||
} else if(cp >= 0x00800) {
|
||||
/* 3 bytes */
|
||||
*utf = 0xe08080 |
|
||||
((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
|
||||
(cp & 0x3f);
|
||||
return 3;
|
||||
} else if(cp >= 0x80) {
|
||||
/* 2 bytes */
|
||||
*utf = 0xc080 |
|
||||
((cp & 0xfc0) << 2) | (cp & 0x3f);
|
||||
return 2;
|
||||
}
|
||||
*utf = cp & 0xff;
|
||||
return *utf ? 1 : 0; /* 1 byte */
|
||||
}
|
||||
|
||||
static size_t
|
||||
namedentitytostr(const char *e, char *buffer, size_t bufsiz)
|
||||
{
|
||||
char *entities[6][2] = {
|
||||
{ "<", "<" },
|
||||
{ ">", ">" },
|
||||
{ "'", "'" },
|
||||
{ "&", "&" },
|
||||
{ """, "\"" },
|
||||
{ NULL, NULL }
|
||||
};
|
||||
size_t i;
|
||||
|
||||
if(*e != '&' || bufsiz < 2) /* doesn't start with & */
|
||||
return 0;
|
||||
for(i = 0; entities[i][0]; i++) {
|
||||
/* NOTE: compares max 7 chars */
|
||||
if(!strncasecmp(e, entities[i][0], 6)) {
|
||||
buffer[0] = *(entities[i][1]);
|
||||
buffer[1] = '\0';
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* convert named- or numeric entity string to buffer string
|
||||
* returns byte-length of string. */
|
||||
static size_t
|
||||
entitytostr(const char *e, char *buffer, size_t bufsiz)
|
||||
{
|
||||
uint32_t l = 0, cp = 0;
|
||||
size_t len = 0, b;
|
||||
int c;
|
||||
char *end;
|
||||
|
||||
/* doesn't start with & or insufficient buffer size */
|
||||
if(e[0] != '&' || bufsiz < 5)
|
||||
return 0;
|
||||
/* named entity */
|
||||
if(e[1] != '#')
|
||||
return namedentitytostr(e, buffer, bufsiz);
|
||||
|
||||
/* e[1] == '#', numeric / hexadecimal entity */
|
||||
e += 2; /* skip "&#" */
|
||||
errno = 0;
|
||||
/* hex (16) or decimal (10) */
|
||||
if(*e == 'x')
|
||||
l = strtoul(e + 1, &end, 16);
|
||||
else
|
||||
l = strtoul(e, &end, 10);
|
||||
/* invalid value or not a well-formed entity */
|
||||
if(errno != 0 || (*end != '\0' && *end != ';'))
|
||||
return 0;
|
||||
if(!(len = codepointtoutf8(l, &cp)))
|
||||
return 0;
|
||||
/* make string */
|
||||
for(b = 0; b < len; b++)
|
||||
buffer[b] = (cp >> (8 * (len - 1 - b))) & 0xff;
|
||||
buffer[len] = '\0';
|
||||
/* escape whitespace */
|
||||
if(ISWSNOSPACE(buffer[0])) {
|
||||
switch(buffer[0]) {
|
||||
case '\n': c = 'n'; break;
|
||||
case '\\': c = '\\'; break;
|
||||
case '\t': c = 't'; break;
|
||||
default: c = '\0'; break;
|
||||
}
|
||||
if(c != '\0') {
|
||||
buffer[0] = '\\';
|
||||
buffer[1] = c;
|
||||
buffer[2] = '\0';
|
||||
len = 2;
|
||||
}
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
/* clear string only; don't free, prevents unnecessary reallocation */
|
||||
static void
|
||||
string_clear(String *s)
|
||||
|
@ -479,10 +371,6 @@ printfields(void)
|
|||
string_print(&ctx.item.author);
|
||||
putchar(FieldSeparator);
|
||||
fputs(feedtypes[ctx.item.feedtype], stdout);
|
||||
if(append) {
|
||||
putchar(FieldSeparator);
|
||||
fputs(append, stdout);
|
||||
}
|
||||
putchar('\n');
|
||||
}
|
||||
|
||||
|
@ -703,12 +591,17 @@ static void
|
|||
xml_handler_data_entity(XMLParser *p, const char *data, size_t datalen)
|
||||
{
|
||||
char buffer[16];
|
||||
size_t len;
|
||||
int len;
|
||||
|
||||
/* try to translate entity, else just pass as data to
|
||||
* xml_data_handler */
|
||||
if((len = entitytostr(data, buffer, sizeof(buffer))) > 0)
|
||||
xml_handler_data(p, buffer, len);
|
||||
* xml_data_handler */
|
||||
len = xml_entitytostr(data, buffer, sizeof(buffer));
|
||||
/* this should never happen (buffer too small) */
|
||||
if(len < 0)
|
||||
return;
|
||||
|
||||
if(len > 0)
|
||||
xml_handler_data(p, buffer, (size_t)len);
|
||||
else
|
||||
xml_handler_data(p, data, datalen);
|
||||
}
|
||||
|
@ -786,13 +679,8 @@ xml_handler_end_element(XMLParser *p, const char *name, size_t namelen, int issh
|
|||
int
|
||||
main(int argc, char *argv[])
|
||||
{
|
||||
if(argc > 1) {
|
||||
append = argv[1];
|
||||
if(!strcmp(argv[1], "-v")) {
|
||||
printf("%s\n", VERSION);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
if(argc > 1)
|
||||
baseurl = argv[1];
|
||||
|
||||
/* init strings and initial memory pool size */
|
||||
string_buffer_init(&ctx.item.timestamp, 64);
|
||||
|
|
136
xml.c
136
xml.c
|
@ -7,6 +7,18 @@
|
|||
|
||||
#include "xml.h"
|
||||
|
||||
static const struct {
|
||||
char *entity;
|
||||
size_t len;
|
||||
int c;
|
||||
} entities[] = {
|
||||
{ .entity = "<", .len = 4, .c = '<' },
|
||||
{ .entity = ">", .len = 4, .c = '>' },
|
||||
{ .entity = "'", .len = 6, .c = '\'' },
|
||||
{ .entity = "&", .len = 5, .c = '&' },
|
||||
{ .entity = """, .len = 6, .c = '"' }
|
||||
};
|
||||
|
||||
static int
|
||||
xmlparser_string_getnext(XMLParser *x)
|
||||
{
|
||||
|
@ -185,7 +197,7 @@ xmlparser_parsecomment(XMLParser *x)
|
|||
}
|
||||
|
||||
/* TODO:
|
||||
* <test><![CDATA[1234567dddd8]]]>
|
||||
* <test><![CDATA[1234567dddd8]]>
|
||||
*
|
||||
* with x->data of sizeof(15) gives 2 ] at end of cdata, should be 1
|
||||
* test comment function too for similar bug?
|
||||
|
@ -194,12 +206,31 @@ xmlparser_parsecomment(XMLParser *x)
|
|||
static __inline__ void
|
||||
xmlparser_parsecdata(XMLParser *x)
|
||||
{
|
||||
static const char *end = "]]>";
|
||||
static const size_t endsiz = sizeof(end);
|
||||
size_t datalen = 0, i = 0;
|
||||
int c;
|
||||
|
||||
if(x->xmlcdatastart)
|
||||
x->xmlcdatastart(x);
|
||||
while((c = xmlparser_getnext(x)) != EOF) {
|
||||
if(c == end[i++]) {
|
||||
if(!end[i]) { /* end of match */
|
||||
if(datalen >= endsiz) {
|
||||
datalen -= endsiz;
|
||||
x->data[datalen] = '\0';
|
||||
}
|
||||
if(x->xmlcdata)
|
||||
x->xmlcdata(x, x->data, datalen);
|
||||
if(x->xmlcdataend)
|
||||
x->xmlcdataend(x);
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
i = 0;
|
||||
}
|
||||
#if 0
|
||||
if(c == ']' && i < 2) {
|
||||
i++;
|
||||
} else if(c == '>') {
|
||||
|
@ -216,6 +247,7 @@ xmlparser_parsecdata(XMLParser *x)
|
|||
}
|
||||
i = 0;
|
||||
}
|
||||
#endif
|
||||
/* TODO: what if the end has ]>, and it's cut on the boundary */
|
||||
if(datalen < sizeof(x->data) - 1) {
|
||||
x->data[datalen++] = c;
|
||||
|
@ -229,6 +261,108 @@ xmlparser_parsecdata(XMLParser *x)
|
|||
}
|
||||
}
|
||||
|
||||
int
|
||||
xml_codepointtoutf8(uint32_t cp, uint32_t *utf)
|
||||
{
|
||||
if(cp >= 0x10000) {
|
||||
/* 4 bytes */
|
||||
*utf = 0xf0808080 | ((cp & 0xfc0000) << 6) |
|
||||
((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
|
||||
(cp & 0x3f);
|
||||
return 4;
|
||||
} else if(cp >= 0x00800) {
|
||||
/* 3 bytes */
|
||||
*utf = 0xe08080 |
|
||||
((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
|
||||
(cp & 0x3f);
|
||||
return 3;
|
||||
} else if(cp >= 0x80) {
|
||||
/* 2 bytes */
|
||||
*utf = 0xc080 |
|
||||
((cp & 0xfc0) << 2) | (cp & 0x3f);
|
||||
return 2;
|
||||
}
|
||||
*utf = cp & 0xff;
|
||||
return *utf ? 1 : 0; /* 1 byte */
|
||||
}
|
||||
|
||||
ssize_t
|
||||
xml_namedentitytostr(const char *e, char *buf, size_t bufsiz)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
/* buffer is too small */
|
||||
if(bufsiz < 2)
|
||||
return -1;
|
||||
|
||||
/* doesn't start with &: can't match */
|
||||
if(*e != '&')
|
||||
return 0;
|
||||
|
||||
for(i = 0; sizeof(entities) / sizeof(*entities); i++) {
|
||||
/* NOTE: compares max 6 chars */
|
||||
if(!strncasecmp(e, entities[i].entity, 6)) {
|
||||
buf[0] = entities[i].c;
|
||||
buf[1] = '\0';
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
ssize_t
|
||||
xml_numericentitytostr(const char *e, char *buf, size_t bufsiz)
|
||||
{
|
||||
uint32_t l = 0, cp = 0;
|
||||
size_t b, len;
|
||||
char *end;
|
||||
|
||||
/* buffer is too small */
|
||||
if(bufsiz < 5)
|
||||
return -1;
|
||||
|
||||
/* not a numeric entity */
|
||||
if(!(e[0] == '&' && e[1] == '#'))
|
||||
return 0;
|
||||
|
||||
/* e[1] == '#', numeric / hexadecimal entity */
|
||||
e += 2; /* skip "&#" */
|
||||
errno = 0;
|
||||
/* hex (16) or decimal (10) */
|
||||
if(*e == 'x')
|
||||
l = strtoul(e + 1, &end, 16);
|
||||
else
|
||||
l = strtoul(e, &end, 10);
|
||||
/* invalid value or not a well-formed entity */
|
||||
if(errno != 0 || (*end != '\0' && *end != ';'))
|
||||
return 0;
|
||||
if(!(len = xml_codepointtoutf8(l, &cp)))
|
||||
return 0;
|
||||
/* make string */
|
||||
for(b = 0; b < len; b++)
|
||||
buf[b] = (cp >> (8 * (len - 1 - b))) & 0xff;
|
||||
buf[len] = '\0';
|
||||
return (ssize_t)len;
|
||||
}
|
||||
|
||||
/* convert named- or numeric entity string to buffer string
|
||||
* returns byte-length of string. */
|
||||
ssize_t
|
||||
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
|
||||
{
|
||||
/* buffer is too small */
|
||||
if(bufsiz < 5)
|
||||
return -1;
|
||||
/* doesn't start with & */
|
||||
if(e[0] != '&')
|
||||
return 0;
|
||||
/* named entity */
|
||||
if(e[1] != '#')
|
||||
return xml_namedentitytostr(e, buf, bufsiz);
|
||||
else /* numeric entity */
|
||||
return xml_numericentitytostr(e, buf, bufsiz);
|
||||
}
|
||||
|
||||
static void
|
||||
xmlparser_parse(XMLParser *x)
|
||||
{
|
||||
|
|
27
xml.h
27
xml.h
|
@ -29,22 +29,35 @@ typedef struct xmlparser {
|
|||
|
||||
int (*getnext)(struct xmlparser *);
|
||||
|
||||
int readerrno; /* errno set from read(). */
|
||||
int fd; /* fd to read from */
|
||||
/* for use with xmlparser_parse_fd */
|
||||
/* errno set from read(). */
|
||||
int readerrno;
|
||||
int fd;
|
||||
|
||||
const char *str; /* "read" from string */
|
||||
/* for use with "read" from string: xmlparser_parse_string */
|
||||
const char *str;
|
||||
|
||||
/* private; internal state */
|
||||
char tag[1024]; /* current tag */
|
||||
int isshorttag; /* current tag is in short form ? */
|
||||
|
||||
/* current tag */
|
||||
char tag[1024];
|
||||
/* current tag is in short form ? */
|
||||
int isshorttag;
|
||||
size_t taglen;
|
||||
char name[256]; /* current attribute name */
|
||||
char data[BUFSIZ]; /* data buffer used for tag and attribute data */
|
||||
/* current attribute name */
|
||||
char name[256];
|
||||
/* data buffer used for tag data, cdata and attribute data */
|
||||
char data[BUFSIZ];
|
||||
size_t readoffset;
|
||||
size_t readlastbytes;
|
||||
/* read buffer used by xmlparser_getnext */
|
||||
unsigned char readbuf[BUFSIZ];
|
||||
} XMLParser;
|
||||
|
||||
int xml_codepointtoutf8(uint32_t, uint32_t *);
|
||||
ssize_t xml_entitytostr(const char *, char *, size_t);
|
||||
ssize_t xml_namedentitytostr(const char *, char *, size_t);
|
||||
ssize_t xml_numericetitytostr(const char *, char *, size_t);
|
||||
|
||||
void xmlparser_parse_fd(XMLParser *, int);
|
||||
void xmlparser_parse_string(XMLParser *, const char *);
|
||||
|
|
Loading…
Reference in New Issue