separate xml specific code into xml.c

2015-06-21 00:18:44 +02:00 · 2015-06-21 00:18:44 +02:00 · 5646132f6b
parent 4e1438caf5
commit 5646132f6b
3 changed files with 169 additions and 134 deletions
--- a/sfeed.c
+++ b/sfeed.c
@ -11,8 +11,6 @@
 #include "util.h"
 #include "xml.h"

-/* fast isspace(c) && c != ' ' check. */
-#define ISWSNOSPACE(c)    (((unsigned)c - '\t') < 5)
 #define ISINCONTENT(ctx)  ((ctx).iscontent && !((ctx).iscontenttag))
 #define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag)
 /* string and size */
@ -27,6 +25,7 @@ enum { ContentTypeNone = 0, ContentTypePlain = 1, ContentTypeHTML = 2 };
 static const char *contenttypes[] = { "", "plain", "html" };

 static const int FieldSeparator = '\t'; /* output field seperator character */
+static const char *baseurl = "";

 enum {
 	TagUnknown = 0,
@ -60,8 +59,8 @@ typedef struct feeditem {
 } FeedItem;

 typedef struct feedtag {
-	char *name;
-	size_t namelen;
+	char   *name;
+	size_t  namelen;
 	int id;
 } FeedTag;

@ -76,13 +75,10 @@ typedef struct feedcontext {
 	int       attrcount;
 } FeedContext;

-static size_t codepointtoutf8(uint32_t, uint32_t *);
-static size_t entitytostr(const char *, char *, size_t);
 static int    gettag(int, const char *, size_t);
 static int    gettimetz(const char *, char *, size_t, int *);
 static int    isattr(const char *, size_t, const char *, size_t);
 static int    istag(const char *, size_t, const char *, size_t);
-static size_t namedentitytostr(const char *, char *, size_t);
 static int    parsetime(const char *, char *, size_t, time_t *);
 static void   printfields(void);
 static void   string_append(String *, const char *, size_t);
@ -106,7 +102,6 @@ static void   xml_handler_start_element_parsed(XMLParser *, const char *,

 static FeedContext ctx;
 static XMLParser parser; /* XML parser state */
-static char *append = NULL; /* append string after each output line */

 /* unique number for parsed tag (faster comparison) */
 static int
@ -163,109 +158,6 @@ gettag(int feedtype, const char *name, size_t namelen)
 	return TagUnknown;
 }

-static size_t
-codepointtoutf8(uint32_t cp, uint32_t *utf)
-{
-	if(cp >= 0x10000) {
-		/* 4 bytes */
-		*utf = 0xf0808080 | ((cp & 0xfc0000) << 6) |
-		       ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
-		       (cp & 0x3f);
-		return 4;
-	} else if(cp >= 0x00800) {
-		/* 3 bytes */
-		*utf = 0xe08080 |
-		       ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
-		       (cp & 0x3f);
-		return 3;
-	} else if(cp >= 0x80) {
-		/* 2 bytes */
-		*utf = 0xc080 |
-		       ((cp & 0xfc0) << 2) | (cp & 0x3f);
-		return 2;
-	}
-	*utf = cp & 0xff;
-	return *utf ? 1 : 0; /* 1 byte */
-}
-
-static size_t
-namedentitytostr(const char *e, char *buffer, size_t bufsiz)
-{
-	char *entities[6][2] = {
-		{ "&lt;",   "<"  },
-		{ "&gt;",   ">"  },
-		{ "&apos;", "'"  },
-		{ "&amp;",  "&"  },
-		{ "&quot;", "\"" },
-		{ NULL,     NULL }
-	};
-	size_t i;
-
-	if(*e != '&' || bufsiz < 2) /* doesn't start with & */
-		return 0;
-	for(i = 0; entities[i][0]; i++) {
-		/* NOTE: compares max 7 chars */
-		if(!strncasecmp(e, entities[i][0], 6)) {
-			buffer[0] = *(entities[i][1]);
-			buffer[1] = '\0';
-			return 1;
-		}
-	}
-	return 0;
-}
-
-/* convert named- or numeric entity string to buffer string
- * returns byte-length of string. */
-static size_t
-entitytostr(const char *e, char *buffer, size_t bufsiz)
-{
-	uint32_t l = 0, cp = 0;
-	size_t len = 0, b;
-	int c;
-	char *end;
-
-	/* doesn't start with & or insufficient buffer size */
-	if(e[0] != '&' || bufsiz < 5)
-		return 0;
-	/* named entity */
-	if(e[1] != '#')
-		return namedentitytostr(e, buffer, bufsiz);
-
-	/* e[1] == '#', numeric / hexadecimal entity */
-	e += 2; /* skip "&#" */
-	errno = 0;
-	/* hex (16) or decimal (10) */
-	if(*e == 'x')
-		l = strtoul(e + 1, &end, 16);
-	else
-		l = strtoul(e, &end, 10);
-	/* invalid value or not a well-formed entity */
-	if(errno != 0 || (*end != '\0' && *end != ';'))
-		return 0;
-	if(!(len = codepointtoutf8(l, &cp)))
-		return 0;
-	/* make string */
-	for(b = 0; b < len; b++)
-		buffer[b] = (cp >> (8 * (len - 1 - b))) & 0xff;
-	buffer[len] = '\0';
-	/* escape whitespace */
-	if(ISWSNOSPACE(buffer[0])) {
-		switch(buffer[0]) {
-		case '\n': c = 'n';  break;
-		case '\\': c = '\\'; break;
-		case '\t': c = 't';  break;
-		default:   c = '\0'; break;
-		}
-		if(c != '\0') {
-			buffer[0] = '\\';
-			buffer[1] = c;
-			buffer[2] = '\0';
-			len = 2;
-		}
-	}
-	return len;
-}
-
 /* clear string only; don't free, prevents unnecessary reallocation */
 static void
 string_clear(String *s)
@ -479,10 +371,6 @@ printfields(void)
 	string_print(&ctx.item.author);
 	putchar(FieldSeparator);
 	fputs(feedtypes[ctx.item.feedtype], stdout);
-	if(append) {
-		putchar(FieldSeparator);
-		fputs(append, stdout);
-	}
 	putchar('\n');
 }

@ -703,12 +591,17 @@ static void
 xml_handler_data_entity(XMLParser *p, const char *data, size_t datalen)
 {
 	char buffer[16];
-	size_t len;
+	int len;

 	/* try to translate entity, else just pass as data to
-         * xml_data_handler */
-	if((len = entitytostr(data, buffer, sizeof(buffer))) > 0)
-		xml_handler_data(p, buffer, len);
+	 * xml_data_handler */
+	len = xml_entitytostr(data, buffer, sizeof(buffer));
+	/* this should never happen (buffer too small) */
+	if(len < 0)
+		return;
+
+	if(len > 0)
+		xml_handler_data(p, buffer, (size_t)len);
 	else
 		xml_handler_data(p, data, datalen);
 }
@ -786,13 +679,8 @@ xml_handler_end_element(XMLParser *p, const char *name, size_t namelen, int issh
 int
 main(int argc, char *argv[])
 {
-	if(argc > 1) {
-		append = argv[1];
-		if(!strcmp(argv[1], "-v")) {
-			printf("%s\n", VERSION);
-			return 0;
-		}
-	}
+	if(argc > 1)
+		baseurl = argv[1];

 	/* init strings and initial memory pool size */
 	string_buffer_init(&ctx.item.timestamp, 64);
--- a/xml.c
+++ b/xml.c
@ -7,6 +7,18 @@

 #include "xml.h"

+/* The predefined named entities and the single character each expands to.
+ * len is the length of the entity string without the terminating NUL. */
+static const struct {
+	const char *entity; /* points at a string literal: never written to */
+	size_t len;
+	int c;
+} entities[] = {
+	{ .entity = "&lt;",   .len = 4, .c = '<'  },
+	{ .entity = "&gt;",   .len = 4, .c = '>'  },
+	{ .entity = "&apos;", .len = 6, .c = '\'' },
+	{ .entity = "&amp;",  .len = 5, .c = '&'  },
+	{ .entity = "&quot;", .len = 6, .c = '"'  }
+};
+
+
 static int
 xmlparser_string_getnext(XMLParser *x)
 {
@ -185,7 +197,7 @@ xmlparser_parsecomment(XMLParser *x)
 }

 /* TODO:
- * <test><![CDATA[1234567dddd8]]]>
+ * <test><![CDATA[1234567dddd8]]>
 *
 * with x->data of sizeof(15) gives 2 ] at end of cdata, should be 1
 * test comment function too for similar bug?
@ -194,12 +206,31 @@ xmlparser_parsecomment(XMLParser *x)
 static __inline__ void
 xmlparser_parsecdata(XMLParser *x)
 {
+	static const char *end = "]]>";
+	static const size_t endsiz = sizeof(end);
 	size_t datalen = 0, i = 0;
 	int c;

 	if(x->xmlcdatastart)
 		x->xmlcdatastart(x);
 	while((c = xmlparser_getnext(x)) != EOF) {
+		if(c == end[i++]) {
+			if(!end[i]) { /* end of match */
+				if(datalen >= endsiz) {
+					datalen -= endsiz;
+					x->data[datalen] = '\0';
+				}
+				if(x->xmlcdata)
+					x->xmlcdata(x, x->data, datalen);
+				if(x->xmlcdataend)
+					x->xmlcdataend(x);
+				break;
+			}
+			continue;
+		} else {
+			i = 0;
+		}
+#if 0
 		if(c == ']' && i < 2) {
 			i++;
 		} else if(c == '>') {
@ -216,6 +247,7 @@ xmlparser_parsecdata(XMLParser *x)
 			}
 			i = 0;
 		}
+#endif
 		/* TODO: what if the end has ]>, and it's cut on the boundary */
 		if(datalen < sizeof(x->data) - 1) {
 			x->data[datalen++] = c;
@ -229,6 +261,108 @@ xmlparser_parsecdata(XMLParser *x)
 	}
 }

+/* Encode code point cp as UTF-8. The bytes of the sequence are packed into
+ * *utf with the first byte of the sequence in the most significant used
+ * byte (e.g. U+00E9 gives 0xc3a9).
+ * Returns the number of bytes in the sequence (1-4), or 0 when cp is 0 or
+ * lies above U+10FFFF; *utf is 0 in both of those cases. */
+int
+xml_codepointtoutf8(uint32_t cp, uint32_t *utf)
+{
+	if(cp > 0x10ffff) {
+		/* beyond the Unicode range: the lead byte of the 4-byte
+		 * form only has room for 3 payload bits */
+		*utf = 0;
+		return 0;
+	}
+	if(cp >= 0x10000) {
+		/* 4 bytes */
+		*utf = 0xf0808080 | ((cp & 0x1c0000) << 6) |
+		       ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
+		       (cp & 0x3f);
+		return 4;
+	} else if(cp >= 0x00800) {
+		/* 3 bytes */
+		*utf = 0xe08080 |
+		       ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
+		       (cp & 0x3f);
+		return 3;
+	} else if(cp >= 0x80) {
+		/* 2 bytes */
+		*utf = 0xc080 |
+		       ((cp & 0xfc0) << 2) | (cp & 0x3f);
+		return 2;
+	}
+	*utf = cp & 0xff;
+	return *utf ? 1 : 0; /* 1 byte */
+}
+
+/* Translate a named entity such as "&amp;" into the character it stands
+ * for. The comparison against the entities table is case-insensitive and
+ * looks at no more than 6 characters of e. On a match the character and a
+ * terminating NUL are written to buf.
+ * Returns 1 on a match, 0 when e is not a known entity and -1 when buf is
+ * too small (bufsiz < 2). */
+ssize_t
+xml_namedentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	size_t i;
+
+	/* buffer is too small */
+	if(bufsiz < 2)
+		return -1;
+
+	/* doesn't start with &: can't match */
+	if(*e != '&')
+		return 0;
+
+	/* bound the index: without "i <" the condition is a non-zero constant
+	 * and an unknown entity would run off the end of the table */
+	for(i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
+		/* NOTE: compares max 6 chars */
+		if(!strncasecmp(e, entities[i].entity, 6)) {
+			buf[0] = entities[i].c;
+			buf[1] = '\0';
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/* Translate a numeric entity ("&#NNN;" decimal or "&#xHHH;" hexadecimal)
+ * to UTF-8. The encoded bytes and a terminating NUL are written to buf.
+ * Returns the number of bytes written, not counting the NUL; 0 when e is
+ * not a well-formed numeric entity or its value can't be encoded; -1 when
+ * buf is too small (bufsiz < 5). */
+ssize_t
+xml_numericentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	unsigned long l;
+	uint32_t cp = 0;
+	size_t b, len;
+	char *end;
+
+	/* buffer is too small */
+	if(bufsiz < 5)
+		return -1;
+
+	/* not a numeric entity */
+	if(!(e[0] == '&' && e[1] == '#'))
+		return 0;
+
+	/* e[1] == '#', numeric / hexadecimal entity */
+	e += 2; /* skip "&#" */
+	errno = 0;
+	/* hex (16) or decimal (10) */
+	if(*e == 'x')
+		l = strtoul(e + 1, &end, 16);
+	else
+		l = strtoul(e, &end, 10);
+	/* invalid value or not a well-formed entity */
+	if(errno != 0 || (*end != '\0' && *end != ';'))
+		return 0;
+	/* check the range while the value still has its full width: narrowing
+	 * to 32 bits first would silently truncate large or negated values */
+	if(l > 0x10ffff)
+		return 0;
+	if(!(len = xml_codepointtoutf8((uint32_t)l, &cp)))
+		return 0;
+	/* make string: most significant used byte of cp comes first */
+	for(b = 0; b < len; b++)
+		buf[b] = (cp >> (8 * (len - 1 - b))) & 0xff;
+	buf[len] = '\0';
+	return (ssize_t)len;
+}
+
+/* Convert an entity string, named or numeric, to its value in buf.
+ * Returns -1 when buf is too small (bufsiz < 5), 0 when e doesn't start
+ * with '&', otherwise the result of the named or numeric converter. */
+ssize_t
+xml_entitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	if(bufsiz < 5)
+		return -1; /* buffer is too small */
+	if(*e != '&')
+		return 0; /* not an entity */
+
+	/* "&#" introduces a numeric entity, anything else is a named one */
+	return (e[1] == '#') ? xml_numericentitytostr(e, buf, bufsiz)
+	                     : xml_namedentitytostr(e, buf, bufsiz);
+}
+
 static void
 xmlparser_parse(XMLParser *x)
 {
--- a/xml.h
+++ b/xml.h
@ -29,22 +29,35 @@ typedef struct xmlparser {

 	int (*getnext)(struct xmlparser *);

-	int readerrno; /* errno set from read(). */
-	int fd; /* fd to read from */
+	/* for use with xmlparser_parse_fd */
+	/* errno set from read(). */
+	int readerrno;
+	int fd;

-	const char *str; /* "read" from string */
+	/* for use with "read" from string: xmlparser_parse_string */
+	const char *str;

 	/* private; internal state */
-	char tag[1024]; /* current tag */
-	int isshorttag; /* current tag is in short form ? */
+
+	/* current tag */
+	char tag[1024];
+	/* current tag is in short form ? */
+	int isshorttag;
 	size_t taglen;
-	char name[256]; /* current attribute name */
-	char data[BUFSIZ]; /* data buffer used for tag and attribute data */
+	/* current attribute name */
+	char name[256];
+	/* data buffer used for tag data, cdata and attribute data */
+	char data[BUFSIZ];
 	size_t readoffset;
 	size_t readlastbytes;
 	/* read buffer used by xmlparser_getnext */
 	unsigned char readbuf[BUFSIZ];
 } XMLParser;

+/* entity and code point conversion helpers, defined in xml.c */
+int     xml_codepointtoutf8(uint32_t, uint32_t *);
+ssize_t xml_entitytostr(const char *, char *, size_t);
+ssize_t xml_namedentitytostr(const char *, char *, size_t);
+ssize_t xml_numericentitytostr(const char *, char *, size_t);
+
 void xmlparser_parse_fd(XMLParser *, int);
 void xmlparser_parse_string(XMLParser *, const char *);