xml.c - frontends - front-ends for some sites (experiment)
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       xml.c (11376B)
       ---
            1 #include <errno.h>
            2 #include <stdio.h>
            3 #include <stdlib.h>
            4 #include <string.h>
            5 
            6 #include "xml.h"
            7 
            8 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26)
            9 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5))
           10 
           11 /* data buffers, size and offset used for parsing XML, see getnext() */
           12 static const unsigned char *xml_data_buf;
           13 static size_t xml_data_size;
           14 static size_t xml_data_off;
           15 
           16 void
           17 setxmldata(const char *s, size_t len)
           18 {
           19         xml_data_off = 0;
           20         xml_data_size = len;
           21         xml_data_buf = (unsigned char *)s;
           22 }
           23 
           24 static int
           25 getnext(void)
           26 {
           27         if (xml_data_off >= xml_data_size)
           28                 return EOF;
           29         return xml_data_buf[xml_data_off++];
           30 }
           31 
           32 static void
           33 xml_parseattrs(XMLParser *x)
           34 {
           35         size_t namelen = 0, valuelen;
           36         int c, endsep, endname = 0, valuestart = 0;
           37 
           38         while ((c = GETNEXT()) != EOF) {
           39                 if (ISSPACE(c)) {
           40                         if (namelen)
           41                                 endname = 1;
           42                         continue;
           43                 } else if (c == '?')
           44                         ; /* ignore */
           45                 else if (c == '=') {
           46                         x->name[namelen] = '\0';
           47                         valuestart = 1;
           48                         endname = 1;
           49                 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
           50                         /* attribute without value */
           51                         x->name[namelen] = '\0';
           52                         if (x->xmlattrstart)
           53                                 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
           54                         if (x->xmlattr)
           55                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
           56                         if (x->xmlattrend)
           57                                 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
           58                         endname = 0;
           59                         x->name[0] = c;
           60                         namelen = 1;
           61                 } else if (namelen && valuestart) {
           62                         /* attribute with value */
           63                         if (x->xmlattrstart)
           64                                 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
           65 
           66                         valuelen = 0;
           67                         if (c == '\'' || c == '"') {
           68                                 endsep = c;
           69                         } else {
           70                                 endsep = ' '; /* ISSPACE() */
           71                                 goto startvalue;
           72                         }
           73 
           74                         while ((c = GETNEXT()) != EOF) {
           75 startvalue:
           76                                 if (c == '&') { /* entities */
           77                                         x->data[valuelen] = '\0';
           78                                         /* call data function with data before entity if there is data */
           79                                         if (valuelen && x->xmlattr)
           80                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           81                                         x->data[0] = c;
           82                                         valuelen = 1;
           83                                         while ((c = GETNEXT()) != EOF) {
           84                                                 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
           85                                                         break;
           86                                                 if (valuelen < sizeof(x->data) - 1)
           87                                                         x->data[valuelen++] = c;
           88                                                 else {
           89                                                         /* entity too long for buffer, handle as normal data */
           90                                                         x->data[valuelen] = '\0';
           91                                                         if (x->xmlattr)
           92                                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           93                                                         x->data[0] = c;
           94                                                         valuelen = 1;
           95                                                         break;
           96                                                 }
           97                                                 if (c == ';') {
           98                                                         x->data[valuelen] = '\0';
           99                                                         if (x->xmlattrentity)
          100                                                                 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          101                                                         valuelen = 0;
          102                                                         break;
          103                                                 }
          104                                         }
          105                                 } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
          106                                         if (valuelen < sizeof(x->data) - 1) {
          107                                                 x->data[valuelen++] = c;
          108                                         } else {
          109                                                 x->data[valuelen] = '\0';
          110                                                 if (x->xmlattr)
          111                                                         x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          112                                                 x->data[0] = c;
          113                                                 valuelen = 1;
          114                                         }
          115                                 }
          116                                 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
          117                                         x->data[valuelen] = '\0';
          118                                         if (x->xmlattr)
          119                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          120                                         if (x->xmlattrend)
          121                                                 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
          122                                         break;
          123                                 }
          124                         }
          125                         namelen = endname = valuestart = 0;
          126                 } else if (namelen < sizeof(x->name) - 1) {
          127                         x->name[namelen++] = c;
          128                 }
          129                 if (c == '>') {
          130                         break;
          131                 } else if (c == '/') {
          132                         x->isshorttag = 1;
          133                         x->name[0] = '\0';
          134                         namelen = 0;
          135                 }
          136         }
          137 }
          138 
          139 static void
          140 xml_parsecomment(XMLParser *x)
          141 {
          142         size_t datalen = 0, i = 0;
          143         int c;
          144 
          145         if (x->xmlcommentstart)
          146                 x->xmlcommentstart(x);
          147         while ((c = GETNEXT()) != EOF) {
          148                 if (c == '-' || c == '>') {
          149                         if (x->xmlcomment && datalen) {
          150                                 x->data[datalen] = '\0';
          151                                 x->xmlcomment(x, x->data, datalen);
          152                                 datalen = 0;
          153                         }
          154                 }
          155 
          156                 if (c == '-') {
          157                         if (++i > 2) {
          158                                 if (x->xmlcomment)
          159                                         for (; i > 2; i--)
          160                                                 x->xmlcomment(x, "-", 1);
          161                                 i = 2;
          162                         }
          163                         continue;
          164                 } else if (c == '>' && i == 2) {
          165                         if (x->xmlcommentend)
          166                                 x->xmlcommentend(x);
          167                         return;
          168                 } else if (i) {
          169                         if (x->xmlcomment) {
          170                                 for (; i > 0; i--)
          171                                         x->xmlcomment(x, "-", 1);
          172                         }
          173                         i = 0;
          174                 }
          175 
          176                 if (datalen < sizeof(x->data) - 1) {
          177                         x->data[datalen++] = c;
          178                 } else {
          179                         x->data[datalen] = '\0';
          180                         if (x->xmlcomment)
          181                                 x->xmlcomment(x, x->data, datalen);
          182                         x->data[0] = c;
          183                         datalen = 1;
          184                 }
          185         }
          186 }
          187 
          188 static void
          189 xml_parsecdata(XMLParser *x)
          190 {
          191         size_t datalen = 0, i = 0;
          192         int c;
          193 
          194         if (x->xmlcdatastart)
          195                 x->xmlcdatastart(x);
          196         while ((c = GETNEXT()) != EOF) {
          197                 if (c == ']' || c == '>') {
          198                         if (x->xmlcdata && datalen) {
          199                                 x->data[datalen] = '\0';
          200                                 x->xmlcdata(x, x->data, datalen);
          201                                 datalen = 0;
          202                         }
          203                 }
          204 
          205                 if (c == ']') {
          206                         if (++i > 2) {
          207                                 if (x->xmlcdata)
          208                                         for (; i > 2; i--)
          209                                                 x->xmlcdata(x, "]", 1);
          210                                 i = 2;
          211                         }
          212                         continue;
          213                 } else if (c == '>' && i == 2) {
          214                         if (x->xmlcdataend)
          215                                 x->xmlcdataend(x);
          216                         return;
          217                 } else if (i) {
          218                         if (x->xmlcdata)
          219                                 for (; i > 0; i--)
          220                                         x->xmlcdata(x, "]", 1);
          221                         i = 0;
          222                 }
          223 
          224                 if (datalen < sizeof(x->data) - 1) {
          225                         x->data[datalen++] = c;
          226                 } else {
          227                         x->data[datalen] = '\0';
          228                         if (x->xmlcdata)
          229                                 x->xmlcdata(x, x->data, datalen);
          230                         x->data[0] = c;
          231                         datalen = 1;
          232                 }
          233         }
          234 }
          235 
          236 static int
          237 codepointtoutf8(long r, char *s)
          238 {
          239         if (r == 0) {
          240                 return 0; /* NUL byte */
          241         } else if (r <= 0x7F) {
          242                 /* 1 byte: 0aaaaaaa */
          243                 s[0] = r;
          244                 return 1;
          245         } else if (r <= 0x07FF) {
          246                 /* 2 bytes: 00000aaa aabbbbbb */
          247                 s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
          248                 s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
          249                 return 2;
          250         } else if (r <= 0xFFFF) {
          251                 /* 3 bytes: aaaabbbb bbcccccc */
          252                 s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
          253                 s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
          254                 s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
          255                 return 3;
          256         } else {
          257                 /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
          258                 s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
          259                 s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
          260                 s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
          261                 s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
          262                 return 4;
          263         }
          264 }
          265 
          266 static int
          267 namedentitytostr(const char *e, char *buf, size_t bufsiz)
          268 {
          269         static const struct {
          270                 const char *entity;
          271                 int c;
          272         } entities[] = {
          273                 { "amp;",  '&'  },
          274                 { "lt;",   '<'  },
          275                 { "gt;",   '>'  },
          276                 { "apos;", '\'' },
          277                 { "quot;", '"'  },
          278                 { "AMP;",  '&'  },
          279                 { "LT;",   '<'  },
          280                 { "GT;",   '>'  },
          281                 { "APOS;", '\'' },
          282                 { "QUOT;", '"'  }
          283         };
          284         size_t i;
          285 
          286         /* buffer is too small */
          287         if (bufsiz < 2)
          288                 return -1;
          289 
          290         for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
          291                 if (!strcmp(e, entities[i].entity)) {
          292                         buf[0] = entities[i].c;
          293                         buf[1] = '\0';
          294                         return 1;
          295                 }
          296         }
          297         return -1;
          298 }
          299 
          300 static int
          301 numericentitytostr(const char *e, char *buf, size_t bufsiz)
          302 {
          303         long l;
          304         int len;
          305         char *end;
          306 
          307         /* buffer is too small */
          308         if (bufsiz < 5)
          309                 return -1;
          310 
          311         errno = 0;
          312         /* hex (16) or decimal (10) */
          313         if (*e == 'x')
          314                 l = strtol(++e, &end, 16);
          315         else
          316                 l = strtol(e, &end, 10);
          317         /* invalid value or not a well-formed entity or invalid code point */
          318         if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
          319             (l >= 0xd800 && l <= 0xdfff))
          320                 return -1;
          321         len = codepointtoutf8(l, buf);
          322         buf[len] = '\0';
          323 
          324         return len;
          325 }
          326 
          327 /* convert named- or numeric entity string to buffer string
          328  * returns byte-length of string or -1 on failure. */
          329 int
          330 xml_entitytostr(const char *e, char *buf, size_t bufsiz)
          331 {
          332         /* doesn't start with & */
          333         if (e[0] != '&')
          334                 return -1;
          335         /* numeric entity */
          336         if (e[1] == '#')
          337                 return numericentitytostr(e + 2, buf, bufsiz);
          338         else /* named entity */
          339                 return namedentitytostr(e + 1, buf, bufsiz);
          340 }
          341 
          342 void
          343 xml_parse(XMLParser *x)
          344 {
          345         size_t datalen, tagdatalen;
          346         int c, isend;
          347 
          348         while ((c = GETNEXT()) != EOF && c != '<')
          349                 ; /* skip until < */
          350 
          351         while (c != EOF) {
          352                 if (c == '<') { /* parse tag */
          353                         if ((c = GETNEXT()) == EOF)
          354                                 return;
          355 
          356                         if (c == '!') { /* cdata and comments */
          357                                 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
          358                                         /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
          359                                         if (tagdatalen <= sizeof("[CDATA[") - 1)
          360                                                 x->data[tagdatalen++] = c;
          361                                         if (c == '>')
          362                                                 break;
          363                                         else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
          364                                                         (x->data[0] == '-')) {
          365                                                 xml_parsecomment(x);
          366                                                 break;
          367                                         } else if (c == '[') {
          368                                                 if (tagdatalen == sizeof("[CDATA[") - 1 &&
          369                                                     !strncmp(x->data, "[CDATA[", tagdatalen)) {
          370                                                         xml_parsecdata(x);
          371                                                         break;
          372                                                 }
          373                                         }
          374                                 }
          375                         } else {
          376                                 /* normal tag (open, short open, close), processing instruction. */
          377                                 x->tag[0] = c;
          378                                 x->taglen = 1;
          379                                 x->isshorttag = isend = 0;
          380 
          381                                 /* treat processing instruction as shorttag, don't strip "?" prefix. */
          382                                 if (c == '?') {
          383                                         x->isshorttag = 1;
          384                                 } else if (c == '/') {
          385                                         if ((c = GETNEXT()) == EOF)
          386                                                 return;
          387                                         x->tag[0] = c;
          388                                         isend = 1;
          389                                 }
          390 
          391                                 while ((c = GETNEXT()) != EOF) {
          392                                         if (c == '/')
          393                                                 x->isshorttag = 1; /* short tag */
          394                                         else if (c == '>' || ISSPACE(c)) {
          395                                                 x->tag[x->taglen] = '\0';
          396                                                 if (isend) { /* end tag, starts with </ */
          397                                                         if (x->xmltagend)
          398                                                                 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
          399                                                         x->tag[0] = '\0';
          400                                                         x->taglen = 0;
          401                                                 } else {
          402                                                         /* start tag */
          403                                                         if (x->xmltagstart)
          404                                                                 x->xmltagstart(x, x->tag, x->taglen);
          405                                                         if (ISSPACE(c))
          406                                                                 xml_parseattrs(x);
          407                                                         if (x->xmltagstartparsed)
          408                                                                 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
          409                                                 }
          410                                                 /* call tagend for shortform or processing instruction */
          411                                                 if (x->isshorttag) {
          412                                                         if (x->xmltagend)
          413                                                                 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
          414                                                         x->tag[0] = '\0';
          415                                                         x->taglen = 0;
          416                                                 }
          417                                                 break;
          418                                         } else if (x->taglen < sizeof(x->tag) - 1)
          419                                                 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
          420                                 }
          421                         }
          422                 } else {
          423                         /* parse tag data */
          424                         datalen = 0;
          425                         if (x->xmldatastart)
          426                                 x->xmldatastart(x);
          427                         while ((c = GETNEXT()) != EOF) {
          428                                 if (c == '&') {
          429                                         if (datalen) {
          430                                                 x->data[datalen] = '\0';
          431                                                 if (x->xmldata)
          432                                                         x->xmldata(x, x->data, datalen);
          433                                         }
          434                                         x->data[0] = c;
          435                                         datalen = 1;
          436                                         while ((c = GETNEXT()) != EOF) {
          437                                                 if (c == '<')
          438                                                         break;
          439                                                 if (datalen < sizeof(x->data) - 1)
          440                                                         x->data[datalen++] = c;
          441                                                 else {
          442                                                         /* entity too long for buffer, handle as normal data */
          443                                                         x->data[datalen] = '\0';
          444                                                         if (x->xmldata)
          445                                                                 x->xmldata(x, x->data, datalen);
          446                                                         x->data[0] = c;
          447                                                         datalen = 1;
          448                                                         break;
          449                                                 }
          450                                                 if (c == ';') {
          451                                                         x->data[datalen] = '\0';
          452                                                         if (x->xmldataentity)
          453                                                                 x->xmldataentity(x, x->data, datalen);
          454                                                         datalen = 0;
          455                                                         break;
          456                                                 }
          457                                         }
          458                                 } else if (c != '<') {
          459                                         if (datalen < sizeof(x->data) - 1) {
          460                                                 x->data[datalen++] = c;
          461                                         } else {
          462                                                 x->data[datalen] = '\0';
          463                                                 if (x->xmldata)
          464                                                         x->xmldata(x, x->data, datalen);
          465                                                 x->data[0] = c;
          466                                                 datalen = 1;
          467                                         }
          468                                 }
          469                                 if (c == '<') {
          470                                         x->data[datalen] = '\0';
          471                                         if (x->xmldata && datalen)
          472                                                 x->xmldata(x, x->data, datalen);
          473                                         if (x->xmldataend)
          474                                                 x->xmldataend(x);
          475                                         break;
          476                                 }
          477                         }
          478                 }
          479         }
          480 }