sync XML improvements (from sfeed) - xmlparser - XML parser
 (HTM) git clone git://git.codemadness.org/xmlparser
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 908a3c3d0c612673b32c2714d9f46bc723c7a38b
 (DIR) parent b2078dbb866bea46507ebb9d3d4c12c93c4f39f8
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sun, 16 Jun 2019 22:19:31 +0200
       
       sync XML improvements (from sfeed)
       
       Diffstat:
         M README                              |      10 +++++++++-
         M skeleton.c                          |       2 ++
         M xml.c                               |     125 ++++++++++++++-----------------
         M xml.h                               |       7 +++++++
       
       4 files changed, 76 insertions(+), 68 deletions(-)
       ---
 (DIR) diff --git a/README b/README
       @@ -5,7 +5,7 @@ XML parser
        Dependencies
        ------------
        
       -- C compiler (C99 expected).
       +- C compiler (C99).
        
        
        Features
       @@ -36,6 +36,8 @@ Caveats
        -------
        
        - It is not a compliant XML parser.
       +- Performance: data is buffered even if a handler is not set; to make parsing
       +  faster, change this code in xml.c.
        - The XML is not checked for errors so it will continue parsing XML data, this
          is by design.
        - Internally fixed-size buffers are used, callbacks like XMLParser.xmldata are
       @@ -59,6 +61,12 @@ Interface / API
        Should be trivial, see xml.c and xml.h and the examples below.
        
        
       +Examples
       +--------
       +
       +See skeleton.c for a base program to start quickly.
       +
       +
        License
        -------
        
 (DIR) diff --git a/skeleton.c b/skeleton.c
       @@ -114,7 +114,9 @@ main(void)
                x.xmltagstart = xmltagstart;
                x.xmltagstartparsed = xmltagstartparsed;
        
       +#ifndef GETNEXT
                x.getnext = getchar;
       +#endif
        
                xml_parse(&x);
        
 (DIR) diff --git a/xml.c b/xml.c
       @@ -15,7 +15,7 @@ xml_parseattrs(XMLParser *x)
                size_t namelen = 0, valuelen;
                int c, endsep, endname = 0, valuestart = 0;
        
       -        while ((c = x->getnext()) != EOF) {
       +        while ((c = GETNEXT()) != EOF) {
                        if (isspace(c)) {
                                if (namelen)
                                        endname = 1;
       @@ -51,7 +51,7 @@ xml_parseattrs(XMLParser *x)
                                        goto startvalue;
                                }
        
       -                        while ((c = x->getnext()) != EOF) {
       +                        while ((c = GETNEXT()) != EOF) {
        startvalue:
                                        if (c == '&') { /* entities */
                                                x->data[valuelen] = '\0';
       @@ -60,7 +60,7 @@ startvalue:
                                                        x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                                x->data[0] = c;
                                                valuelen = 1;
       -                                        while ((c = x->getnext()) != EOF) {
       +                                        while ((c = GETNEXT()) != EOF) {
                                                        if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
                                                                break;
                                                        if (valuelen < sizeof(x->data) - 1)
       @@ -124,9 +124,9 @@ xml_parsecomment(XMLParser *x)
        
                if (x->xmlcommentstart)
                        x->xmlcommentstart(x);
       -        while ((c = x->getnext()) != EOF) {
       +        while ((c = GETNEXT()) != EOF) {
                        if (c == '-' || c == '>') {
       -                        if (x->xmlcomment) {
       +                        if (x->xmlcomment && datalen) {
                                        x->data[datalen] = '\0';
                                        x->xmlcomment(x, x->data, datalen);
                                        datalen = 0;
       @@ -173,9 +173,9 @@ xml_parsecdata(XMLParser *x)
        
                if (x->xmlcdatastart)
                        x->xmlcdatastart(x);
       -        while ((c = x->getnext()) != EOF) {
       +        while ((c = GETNEXT()) != EOF) {
                        if (c == ']' || c == '>') {
       -                        if (x->xmlcdata) {
       +                        if (x->xmlcdata && datalen) {
                                        x->data[datalen] = '\0';
                                        x->xmlcdata(x, x->data, datalen);
                                        datalen = 0;
       @@ -247,19 +247,19 @@ static int
        namedentitytostr(const char *e, char *buf, size_t bufsiz)
        {
                static const struct {
       -                char *entity;
       +                const char *entity;
                        int c;
                } entities[] = {
       -                { "&amp;",  '&'  },
       -                { "&lt;",   '<'  },
       -                { "&gt;",   '>'  },
       -                { "&apos;", '\'' },
       -                { "&quot;", '"'  },
       -                { "&AMP;",  '&'  },
       -                { "&LT;",   '<'  },
       -                { "&GT;",   '>'  },
       -                { "&APOS;", '\'' },
       -                { "&QUOT;", '"'  }
       +                { "amp;",  '&'  },
       +                { "lt;",   '<'  },
       +                { "gt;",   '>'  },
       +                { "apos;", '\'' },
       +                { "quot;", '"'  },
       +                { "AMP;",  '&'  },
       +                { "LT;",   '<'  },
       +                { "GT;",   '>'  },
       +                { "APOS;", '\'' },
       +                { "QUOT;", '"'  }
                };
                size_t i;
        
       @@ -267,10 +267,6 @@ namedentitytostr(const char *e, char *buf, size_t bufsiz)
                if (bufsiz < 2)
                        return -1;
        
       -        /* doesn't start with &: can't match */
       -        if (*e != '&')
       -                return 0;
       -
                for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
                        if (!strcmp(e, entities[i].entity)) {
                                buf[0] = entities[i].c;
       @@ -292,12 +288,6 @@ numericentitytostr(const char *e, char *buf, size_t bufsiz)
                if (bufsiz < 5)
                        return -1;
        
       -        /* not a numeric entity */
       -        if (e[0] != '&' || e[1] != '#')
       -                return 0;
       -
       -        /* e[1] == '#', numeric / hexadecimal entity */
       -        e += 2; /* skip "&#" */
                errno = 0;
                /* hex (16) or decimal (10) */
                if (*e == 'x')
       @@ -318,37 +308,32 @@ numericentitytostr(const char *e, char *buf, size_t bufsiz)
        int
        xml_entitytostr(const char *e, char *buf, size_t bufsiz)
        {
       -        /* buffer is too small */
       -        if (bufsiz < 5)
       -                return -1;
                /* doesn't start with & */
                if (e[0] != '&')
                        return 0;
       -        /* named entity */
       -        if (e[1] != '#')
       -                return namedentitytostr(e, buf, bufsiz);
       -        else /* numeric entity */
       -                return numericentitytostr(e, buf, bufsiz);
       +        /* numeric entity */
       +        if (e[1] == '#')
       +                return numericentitytostr(e + 2, buf, bufsiz);
       +        else /* named entity */
       +                return namedentitytostr(e + 1, buf, bufsiz);
        }
        
        void
        xml_parse(XMLParser *x)
        {
       -        int c, ispi;
       -        size_t datalen, tagdatalen, taglen;
       +        size_t datalen, tagdatalen;
       +        int c, isend;
        
       -        if (!x->getnext)
       -                return;
       -        while ((c = x->getnext()) != EOF && c != '<')
       +        while ((c = GETNEXT()) != EOF && c != '<')
                        ; /* skip until < */
        
                while (c != EOF) {
                        if (c == '<') { /* parse tag */
       -                        if ((c = x->getnext()) == EOF)
       +                        if ((c = GETNEXT()) == EOF)
                                        return;
        
                                if (c == '!') { /* cdata and comments */
       -                                for (tagdatalen = 0; (c = x->getnext()) != EOF;) {
       +                                for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
                                                /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */
                                                if (tagdatalen <= sizeof("[CDATA[") - 1)
                                                        x->data[tagdatalen++] = c;
       @@ -367,30 +352,32 @@ xml_parse(XMLParser *x)
                                                }
                                        }
                                } else {
       -                                x->tag[0] = '\0';
       -                                x->taglen = 0;
       -
                                        /* normal tag (open, short open, close), processing instruction. */
       -                                if (isspace(c))
       -                                        while ((c = x->getnext()) != EOF && isspace(c))
       -                                                ;
       -                                if (c == EOF)
       -                                        return;
                                        x->tag[0] = c;
       -                                ispi = (c == '?') ? 1 : 0;
       -                                x->isshorttag = ispi;
       -                                taglen = 1;
       -                                while ((c = x->getnext()) != EOF) {
       +                                x->taglen = 1;
       +                                x->isshorttag = isend = 0;
       +
       +                                /* treat processing instruction as shorttag, don't strip "?" prefix. */
       +                                if (c == '?') {
       +                                        x->isshorttag = 1;
       +                                } else if (c == '/') {
       +                                        if ((c = GETNEXT()) == EOF)
       +                                                return;
       +                                        x->tag[0] = c;
       +                                        isend = 1;
       +                                }
       +
       +                                while ((c = GETNEXT()) != EOF) {
                                                if (c == '/')
                                                        x->isshorttag = 1; /* short tag */
                                                else if (c == '>' || isspace(c)) {
       -                                                x->tag[taglen] = '\0';
       -                                                if (x->tag[0] == '/') { /* end tag, starts with </ */
       -                                                        x->taglen = --taglen; /* len -1 because of / */
       -                                                        if (taglen && x->xmltagend)
       -                                                                x->xmltagend(x, &(x->tag)[1], x->taglen, 0);
       +                                                x->tag[x->taglen] = '\0';
       +                                                if (isend) { /* end tag, starts with </ */
       +                                                        if (x->xmltagend)
       +                                                                x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
       +                                                        x->tag[0] = '\0';
       +                                                        x->taglen = 0;
                                                        } else {
       -                                                        x->taglen = taglen;
                                                                /* start tag */
                                                                if (x->xmltagstart)
                                                                        x->xmltagstart(x, x->tag, x->taglen);
       @@ -400,11 +387,15 @@ xml_parse(XMLParser *x)
                                                                        x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
                                                        }
                                                        /* call tagend for shortform or processing instruction */
       -                                                if ((x->isshorttag || ispi) && x->xmltagend)
       -                                                        x->xmltagend(x, x->tag, x->taglen, 1);
       +                                                if (x->isshorttag) {
       +                                                        if (x->xmltagend)
       +                                                                x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
       +                                                        x->tag[0] = '\0';
       +                                                        x->taglen = 0;
       +                                                }
                                                        break;
       -                                        } else if (taglen < sizeof(x->tag) - 1)
       -                                                x->tag[taglen++] = c; /* NOTE: tag name truncation */
       +                                        } else if (x->taglen < sizeof(x->tag) - 1)
       +                                                x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
                                        }
                                }
                        } else {
       @@ -412,7 +403,7 @@ xml_parse(XMLParser *x)
                                datalen = 0;
                                if (x->xmldatastart)
                                        x->xmldatastart(x);
       -                        while ((c = x->getnext()) != EOF) {
       +                        while ((c = GETNEXT()) != EOF) {
                                        if (c == '&') {
                                                if (datalen) {
                                                        x->data[datalen] = '\0';
       @@ -421,7 +412,7 @@ xml_parse(XMLParser *x)
                                                }
                                                x->data[0] = c;
                                                datalen = 1;
       -                                        while ((c = x->getnext()) != EOF) {
       +                                        while ((c = GETNEXT()) != EOF) {
                                                        if (c == '<')
                                                                break;
                                                        if (datalen < sizeof(x->data) - 1)
 (DIR) diff --git a/xml.h b/xml.h
       @@ -1,3 +1,6 @@
       +#ifndef _XML_H
       +#define _XML_H
       +
        typedef struct xmlparser {
                /* handlers */
                void (*xmlattr)(struct xmlparser *, const char *, size_t,
       @@ -23,7 +26,10 @@ typedef struct xmlparser {
                void (*xmltagstartparsed)(struct xmlparser *, const char *,
                      size_t, int);
        
       +#ifndef GETNEXT
       +        #define GETNEXT (x)->getnext
                int (*getnext)(void);
       +#endif
        
                /* current tag */
                char tag[1024];
       @@ -38,3 +44,4 @@ typedef struct xmlparser {
        
        int xml_entitytostr(const char *, char *, size_t);
        void xml_parse(XMLParser *);
       +#endif