xml.c - webdump - HTML to plain-text converter for webpages
 (HTM) git clone git://git.codemadness.org/webdump
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       xml.c (11720B)
       ---
            1 #include <errno.h>
            2 #include <stdio.h>
            3 #include <stdlib.h>
            4 #include <string.h>
            5 
            6 #include "xml.h"
            7 
            8 /* ifdef for HTML mode. To differentiate xml.c and webdump HTML changes */
            9 #define HTML_MODE
           10 
           11 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26)
           12 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5))
           13 
/* Parse the attributes of the tag currently being read.
 *
 * Characters are read with GETNEXT() until a '>' is seen outside of a quoted
 * value, or until EOF. The attribute name is collected in x->name and the
 * value in x->data. For every attribute the following callbacks are invoked
 * when they are set:
 *   xmlattrstart  - once, before the value
 *   xmlattr       - zero or more times with a chunk of the value (the value
 *                   is split around entities and when x->data is full)
 *   xmlattrentity - for each "&...;" sequence found in the value
 *   xmlattrend    - once, after the value
 * An attribute without a value is reported with a single empty ("") value.
 * x->isshorttag is set when a '/' is seen by the outer loop. */
static void
xml_parseattrs(XMLParser *x)
{
        /* namelen:    number of bytes stored in x->name
         * valuelen:   number of bytes stored in x->data
         * endsep:     closing quote of the value, or ' ' for an unquoted value
         * endname:    whitespace or '=' was seen after the name
         * valuestart: '=' was seen, the next non-space character starts the value */
        size_t namelen = 0, valuelen;
        int c, endsep, endname = 0, valuestart = 0;

        while ((c = GETNEXT()) != EOF) {
                if (ISSPACE(c)) {
                        /* whitespace after a name terminates that name */
                        if (namelen)
                                endname = 1;
                        continue;
                } else if (c == '?')
                        ; /* ignore */
                else if (c == '=') {
                        x->name[namelen] = '\0';
                        valuestart = 1;
                        endname = 1;
                } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
                        /* attribute without value */
                        /* NOTE(review): this branch is tested before the value
                         * branch below, so an unquoted value that begins with
                         * '/' or '>' (e.g. a=/b) is reported as a valueless
                         * attribute - confirm this is intended. */
                        x->name[namelen] = '\0';
                        if (x->xmlattrstart)
                                x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
                        if (x->xmlattr)
                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
                        if (x->xmlattrend)
                                x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
                        endname = 0;
                        /* c is the first character of the next name; when it
                         * is '>' or '/' it is handled at the end of the loop */
                        x->name[0] = c;
                        namelen = 1;
                } else if (namelen && valuestart) {
                        /* attribute with value */
                        if (x->xmlattrstart)
                                x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);

                        valuelen = 0;
                        if (c == '\'' || c == '"') {
                                endsep = c;
                        } else {
                                /* unquoted: c already is the first value
                                 * character, so enter the loop without reading */
                                endsep = ' '; /* ISSPACE() */
                                goto startvalue;
                        }

                        while ((c = GETNEXT()) != EOF) {
startvalue:
                                if (c == '&') { /* entities */
                                        x->data[valuelen] = '\0';
                                        /* call data function with data before entity if there is data */
                                        if (valuelen && x->xmlattr)
                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                        /* collect the entity, including '&', from the start of x->data */
                                        x->data[0] = c;
                                        valuelen = 1;
                                        while ((c = GETNEXT()) != EOF) {
                                                /* end of value before ';': what was collected
                                                 * is emitted as plain data further below */
                                                if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
                                                        break;
                                                if (valuelen < sizeof(x->data) - 1)
                                                        x->data[valuelen++] = c;
                                                else {
                                                        /* entity too long for buffer, handle as normal data */
                                                        x->data[valuelen] = '\0';
                                                        if (x->xmlattr)
                                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                                        x->data[0] = c;
                                                        valuelen = 1;
                                                        break;
                                                }
                                                if (c == ';') {
                                                        /* complete entity: "&...;" */
                                                        x->data[valuelen] = '\0';
                                                        if (x->xmlattrentity)
                                                                x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                                        valuelen = 0;
                                                        break;
                                                }
                                        }
                                } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
                                        /* regular value character */
                                        if (valuelen < sizeof(x->data) - 1) {
                                                x->data[valuelen++] = c;
                                        } else {
                                                /* buffer full: emit it and start a new chunk with c */
                                                x->data[valuelen] = '\0';
                                                if (x->xmlattr)
                                                        x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                                x->data[0] = c;
                                                valuelen = 1;
                                        }
                                }
                                /* end of value: emit the last chunk (possibly empty) */
                                if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
                                        x->data[valuelen] = '\0';
                                        if (x->xmlattr)
                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                        if (x->xmlattrend)
                                                x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
                                        break;
                                }
                        }
                        /* reset the state for the next attribute */
                        namelen = endname = valuestart = 0;
                } else if (namelen < sizeof(x->name) - 1) {
                        /* part of the name; characters that do not fit are dropped */
                        x->name[namelen++] = c;
                }
                if (c == '>') {
                        /* end of tag */
                        break;
                } else if (c == '/') {
                        /* mark as short tag and discard the '/' stored as name */
                        x->isshorttag = 1;
                        x->name[0] = '\0';
                        namelen = 0;
                }
        }
}
          120 
          121 static void
          122 xml_parsecomment(XMLParser *x)
          123 {
          124         size_t datalen = 0, i = 0;
          125         int c;
          126 
          127         if (x->xmlcommentstart)
          128                 x->xmlcommentstart(x);
          129         while ((c = GETNEXT()) != EOF) {
          130                 if (c == '-' || c == '>') {
          131                         if (x->xmlcomment && datalen) {
          132                                 x->data[datalen] = '\0';
          133                                 x->xmlcomment(x, x->data, datalen);
          134                                 datalen = 0;
          135                         }
          136                 }
          137 
          138                 if (c == '-') {
          139                         if (++i > 2) {
          140                                 if (x->xmlcomment)
          141                                         for (; i > 2; i--)
          142                                                 x->xmlcomment(x, "-", 1);
          143                                 i = 2;
          144                         }
          145                         continue;
          146                 } else if (c == '>' && i == 2) {
          147                         if (x->xmlcommentend)
          148                                 x->xmlcommentend(x);
          149                         return;
          150                 } else if (i) {
          151                         if (x->xmlcomment) {
          152                                 for (; i > 0; i--)
          153                                         x->xmlcomment(x, "-", 1);
          154                         }
          155                         i = 0;
          156                 }
          157 
          158                 if (datalen < sizeof(x->data) - 1) {
          159                         x->data[datalen++] = c;
          160                 } else {
          161                         x->data[datalen] = '\0';
          162                         if (x->xmlcomment)
          163                                 x->xmlcomment(x, x->data, datalen);
          164                         x->data[0] = c;
          165                         datalen = 1;
          166                 }
          167         }
          168 }
          169 
          170 static void
          171 xml_parsecdata(XMLParser *x)
          172 {
          173         size_t datalen = 0, i = 0;
          174         int c;
          175 
          176         if (x->xmlcdatastart)
          177                 x->xmlcdatastart(x);
          178         while ((c = GETNEXT()) != EOF) {
          179                 if (c == ']' || c == '>') {
          180                         if (x->xmlcdata && datalen) {
          181                                 x->data[datalen] = '\0';
          182                                 x->xmlcdata(x, x->data, datalen);
          183                                 datalen = 0;
          184                         }
          185                 }
          186 
          187                 if (c == ']') {
          188                         if (++i > 2) {
          189                                 if (x->xmlcdata)
          190                                         for (; i > 2; i--)
          191                                                 x->xmlcdata(x, "]", 1);
          192                                 i = 2;
          193                         }
          194                         continue;
          195                 } else if (c == '>' && i == 2) {
          196                         if (x->xmlcdataend)
          197                                 x->xmlcdataend(x);
          198                         return;
          199                 } else if (i) {
          200                         if (x->xmlcdata)
          201                                 for (; i > 0; i--)
          202                                         x->xmlcdata(x, "]", 1);
          203                         i = 0;
          204                 }
          205 
          206                 if (datalen < sizeof(x->data) - 1) {
          207                         x->data[datalen++] = c;
          208                 } else {
          209                         x->data[datalen] = '\0';
          210                         if (x->xmlcdata)
          211                                 x->xmlcdata(x, x->data, datalen);
          212                         x->data[0] = c;
          213                         datalen = 1;
          214                 }
          215         }
          216 }
          217 
          218 static int
          219 codepointtoutf8(long r, char *s)
          220 {
          221         if (r == 0) {
          222                 return 0; /* NUL byte */
          223         } else if (r <= 0x7F) {
          224                 /* 1 byte: 0aaaaaaa */
          225                 s[0] = r;
          226                 return 1;
          227         } else if (r <= 0x07FF) {
          228                 /* 2 bytes: 00000aaa aabbbbbb */
          229                 s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
          230                 s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
          231                 return 2;
          232         } else if (r <= 0xFFFF) {
          233                 /* 3 bytes: aaaabbbb bbcccccc */
          234                 s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
          235                 s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
          236                 s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
          237                 return 3;
          238         } else {
          239                 /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
          240                 s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
          241                 s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
          242                 s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
          243                 s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
          244                 return 4;
          245         }
          246 }
          247 
/* One entry of the named entity table: maps an entity name to a code point. */
struct namedentity {
        const char *entity; /* entity name, compared with strcmp() */
        long cp;            /* code point, passed to codepointtoutf8() */
};
          252 
          253 static int
          254 namedentitycmp(const void *v1, const void *v2)
          255 {
          256         struct namedentity *n1 = (struct namedentity *)v1;
          257         struct namedentity *n2 = (struct namedentity *)v2;
          258 
          259         return strcmp(n1->entity, n2->entity);
          260 }
          261 
/* Table of named entities, its entries come from namedentities.h.
 * It is searched with bsearch() using namedentitycmp(), which requires the
 * entries to be sorted by entity name in strcmp() order. */
static const struct namedentity entities[] = {
#include "namedentities.h"
};
          265 
          266 static int
          267 namedentitytostr(const char *e, char *buf, size_t bufsiz)
          268 {
          269         struct namedentity find, *found;
          270         size_t i;
          271 
          272         /* buffer is too small */
          273         if (bufsiz < 5)
          274                 return -1;
          275 
          276         find.entity = e;
          277         found = bsearch(&find, entities, sizeof(entities) / sizeof(*entities),
          278                 sizeof(*entities), namedentitycmp);
          279         if (found) {
          280                 i = codepointtoutf8(found->cp, buf);
          281                 buf[i] = '\0';
          282                 return i;
          283         }
          284         return -1;
          285 }
          286 
          287 static int
          288 numericentitytostr(const char *e, char *buf, size_t bufsiz)
          289 {
          290         long l;
          291         int len;
          292         char *end;
          293 
          294         /* buffer is too small */
          295         if (bufsiz < 5)
          296                 return -1;
          297 
          298         errno = 0;
          299         /* hex (16) or decimal (10) */
          300         if (*e == 'x')
          301                 l = strtol(++e, &end, 16);
          302         else
          303                 l = strtol(e, &end, 10);
          304         /* invalid value or not a well-formed entity or invalid code point */
          305         if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
          306             (l >= 0xd800 && l <= 0xdfff))
          307                 return -1;
          308         len = codepointtoutf8(l, buf);
          309         buf[len] = '\0';
          310 
          311         return len;
          312 }
          313 
          314 /* convert named- or numeric entity string to buffer string
          315  * returns byte-length of string or -1 on failure. */
          316 int
          317 xml_entitytostr(const char *e, char *buf, size_t bufsiz)
          318 {
          319         /* doesn't start with & */
          320         if (e[0] != '&')
          321                 return -1;
          322         /* numeric entity */
          323         if (e[1] == '#')
          324                 return numericentitytostr(e + 2, buf, bufsiz);
          325         else /* named entity */
          326                 return namedentitytostr(e + 1, buf, bufsiz);
          327 }
          328 
/* Parse the input read with GETNEXT() until EOF and invoke the callbacks
 * that are set in x:
 *   comments and CDATA sections: handled by xml_parsecomment() and
 *     xml_parsecdata()
 *   tags: xmltagstart, attributes via xml_parseattrs(), xmltagstartparsed
 *     and xmltagend
 *   text between tags: xmldatastart, xmldata, xmldataentity and xmldataend
 * The tag name is collected in x->tag / x->taglen and text in x->data. */
void
xml_parse(XMLParser *x)
{
        size_t datalen, tagdatalen;
        int c, isend;

#ifdef HTML_MODE
        /* HTML: text before the first tag is processed as data as well.
         * This jumps into the loop body, c is assigned there before it is read. */
        goto read_data;
#else
        /* text before the first tag is ignored */
        while ((c = GETNEXT()) != EOF && c != '<')
                ; /* skip until < */
#endif

        while (c != EOF) {
                if (c == '<') { /* parse tag */
                        if ((c = GETNEXT()) == EOF)
                                return;

                        if (c == '!') { /* CDATA and comments */
                                /* the first bytes after "<!" are stored in x->data to
                                 * recognize "--" and "[CDATA[", anything else is
                                 * skipped until '>' */
                                for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
                                        /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
                                        if (tagdatalen <= sizeof("[CDATA[") - 1)
                                                x->data[tagdatalen++] = c;
                                        if (c == '>')
                                                break;
                                        else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
                                                        (x->data[0] == '-')) {
                                                xml_parsecomment(x);
                                                break;
                                        } else if (c == '[') {
                                                if (tagdatalen == sizeof("[CDATA[") - 1 &&
                                                    !strncmp(x->data, "[CDATA[", tagdatalen)) {
                                                        xml_parsecdata(x);
                                                        break;
                                                }
                                        }
                                }
                        } else {
                                /* normal tag (open, short open, close), processing instruction. */
                                x->tag[0] = c;
                                x->taglen = 1;
                                x->isshorttag = isend = 0;

                                /* treat processing instruction as short tag, don't strip "?" prefix. */
                                if (c == '?') {
                                        x->isshorttag = 1;
                                } else if (c == '/') {
                                        /* end tag: the name starts after the '/' */
                                        if ((c = GETNEXT()) == EOF)
                                                return;
                                        x->tag[0] = c;
                                        isend = 1;
                                }

                                /* read the rest of the tag name */
                                while ((c = GETNEXT()) != EOF) {
                                        if (c == '/')
                                                x->isshorttag = 1; /* short tag */
                                        else if (c == '>' || ISSPACE(c)) {
                                                /* end of the tag name */
                                                x->tag[x->taglen] = '\0';
                                                if (isend) { /* end tag, starts with </ */
                                                        while (c != '>' && c != EOF) /* skip until > */
                                                                c = GETNEXT();
                                                        if (x->xmltagend)
                                                                x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
                                                        x->tag[0] = '\0';
                                                        x->taglen = 0;
                                                } else {
                                                        /* start tag */
                                                        if (x->xmltagstart)
                                                                x->xmltagstart(x, x->tag, x->taglen);
                                                        /* whitespace after the name: attributes may follow */
                                                        if (ISSPACE(c))
                                                                xml_parseattrs(x);
                                                        if (x->xmltagstartparsed)
                                                                x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
                                                }
                                                /* call tagend for short tag or processing instruction */
                                                if (x->isshorttag) {
                                                        if (x->xmltagend)
                                                                x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
                                                        x->tag[0] = '\0';
                                                        x->taglen = 0;
                                                }
                                                break;
                                        } else if (x->taglen < sizeof(x->tag) - 1)
                                                x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
                                }
                        }
                } else {
#ifdef HTML_MODE
read_data:
#endif
                        /* parse tag data */
                        datalen = 0;
                        if (x->xmldatastart)
                                x->xmldatastart(x);
                        while ((c = GETNEXT()) != EOF) {
                                if (c == '&') {
                                        /* entity: first emit the data before it, if any */
                                        if (datalen) {
                                                x->data[datalen] = '\0';
                                                if (x->xmldata)
                                                        x->xmldata(x, x->data, datalen);
                                        }
                                        /* collect the entity, including '&', from the start of x->data */
                                        x->data[0] = c;
                                        datalen = 1;
                                        while ((c = GETNEXT()) != EOF) {
                                                /* tag starts before ';': what was collected is
                                                 * emitted as normal data further below */
                                                if (c == '<')
                                                        break;
                                                if (datalen < sizeof(x->data) - 1)
                                                        x->data[datalen++] = c;
                                                else {
                                                        /* entity too long for buffer, handle as normal data */
                                                        x->data[datalen] = '\0';
                                                        if (x->xmldata)
                                                                x->xmldata(x, x->data, datalen);
                                                        x->data[0] = c;
                                                        datalen = 1;
                                                        break;
                                                }
                                                if (c == ';') {
                                                        /* complete entity: "&...;" */
                                                        x->data[datalen] = '\0';
                                                        if (x->xmldataentity)
                                                                x->xmldataentity(x, x->data, datalen);
                                                        datalen = 0;
                                                        break;
                                                }
                                        }
                                } else if (c != '<') {
                                        /* regular data character */
                                        if (datalen < sizeof(x->data) - 1) {
                                                x->data[datalen++] = c;
                                        } else {
                                                /* buffer full: emit it and start a new chunk with c */
                                                x->data[datalen] = '\0';
                                                if (x->xmldata)
                                                        x->xmldata(x, x->data, datalen);
                                                x->data[0] = c;
                                                datalen = 1;
                                        }
                                }
                                /* start of a tag: emit the remaining data and end the data */
                                if (c == '<') {
                                        x->data[datalen] = '\0';
                                        if (x->xmldata && datalen)
                                                x->xmldata(x, x->data, datalen);
                                        if (x->xmldataend)
                                                x->xmldataend(x);
#ifdef HTML_MODE
                                        /* nothing is pending anymore for the check below */
                                        datalen = 0;
#endif
                                        break;
                                }
                        }

#ifdef HTML_MODE
                        /* pending data, even if a tag didn't close (EOF, etc). */
                        if (datalen) {
                                x->data[datalen] = '\0';
                                if (x->xmldata && datalen)
                                        x->xmldata(x, x->data, datalen);
                                if (x->xmldataend)
                                        x->xmldataend(x);
                                datalen = 0;
                        }
#endif
                }
        }
}