xml.c - webdump - HTML to plain-text converter for webpages (HTM) git clone git://git.codemadness.org/webdump (DIR) Log (DIR) Files (DIR) Refs (DIR) README (DIR) LICENSE --- xml.c (11720B) --- 1 #include <errno.h> 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <string.h> 5 6 #include "xml.h" 7 8 /* ifdef for HTML mode. To differentiate xml.c and webdump HTML changes */ 9 #define HTML_MODE 10 11 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26) 12 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5)) 13 14 static void 15 xml_parseattrs(XMLParser *x) 16 { 17 size_t namelen = 0, valuelen; 18 int c, endsep, endname = 0, valuestart = 0; 19 20 while ((c = GETNEXT()) != EOF) { 21 if (ISSPACE(c)) { 22 if (namelen) 23 endname = 1; 24 continue; 25 } else if (c == '?') 26 ; /* ignore */ 27 else if (c == '=') { 28 x->name[namelen] = '\0'; 29 valuestart = 1; 30 endname = 1; 31 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) { 32 /* attribute without value */ 33 x->name[namelen] = '\0'; 34 if (x->xmlattrstart) 35 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); 36 if (x->xmlattr) 37 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0); 38 if (x->xmlattrend) 39 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); 40 endname = 0; 41 x->name[0] = c; 42 namelen = 1; 43 } else if (namelen && valuestart) { 44 /* attribute with value */ 45 if (x->xmlattrstart) 46 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); 47 48 valuelen = 0; 49 if (c == '\'' || c == '"') { 50 endsep = c; 51 } else { 52 endsep = ' '; /* ISSPACE() */ 53 goto startvalue; 54 } 55 56 while ((c = GETNEXT()) != EOF) { 57 startvalue: 58 if (c == '&') { /* entities */ 59 x->data[valuelen] = '\0'; 60 /* call data function with data before entity if there is data */ 61 if (valuelen && x->xmlattr) 62 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 63 x->data[0] = c; 64 valuelen = 1; 65 while ((c = GETNEXT()) != EOF) { 66 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) 67 break; 68 if (valuelen < sizeof(x->data) - 1) 69 x->data[valuelen++] = c; 70 else { 71 /* entity too long for buffer, handle as normal data */ 72 x->data[valuelen] = '\0'; 73 if (x->xmlattr) 74 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 75 x->data[0] = c; 76 valuelen = 1; 77 break; 78 } 79 if (c == ';') { 80 x->data[valuelen] = '\0'; 81 if (x->xmlattrentity) 82 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 83 valuelen = 0; 84 break; 85 } 86 } 87 } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) { 88 if (valuelen < sizeof(x->data) - 1) { 89 x->data[valuelen++] = c; 90 } else { 91 x->data[valuelen] = '\0'; 92 if (x->xmlattr) 93 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 94 x->data[0] = c; 95 valuelen = 1; 96 } 97 } 98 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) { 99 x->data[valuelen] = '\0'; 100 if (x->xmlattr) 101 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 102 if (x->xmlattrend) 103 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); 104 break; 105 } 106 } 107 namelen = endname = valuestart = 0; 108 } else if (namelen < sizeof(x->name) - 1) { 109 x->name[namelen++] = c; 110 } 111 if (c == '>') { 112 break; 113 } else if (c == '/') { 114 x->isshorttag = 1; 115 x->name[0] = '\0'; 116 namelen = 0; 117 } 118 } 119 } 120 121 static void 122 xml_parsecomment(XMLParser *x) 123 { 124 size_t datalen = 0, i = 0; 125 int c; 126 127 if (x->xmlcommentstart) 128 x->xmlcommentstart(x); 129 while ((c = GETNEXT()) != EOF) { 130 if (c == '-' || c == '>') { 131 if (x->xmlcomment && datalen) { 132 x->data[datalen] = '\0'; 133 x->xmlcomment(x, x->data, datalen); 134 datalen = 0; 135 } 136 } 137 138 if (c == '-') { 139 if (++i > 2) { 140 if (x->xmlcomment) 141 for (; i > 2; i--) 142 x->xmlcomment(x, "-", 1); 143 i = 2; 144 } 145 continue; 146 } else if (c == '>' && i == 2) { 147 if (x->xmlcommentend) 148 x->xmlcommentend(x); 149 return; 150 } else if (i) { 151 if (x->xmlcomment) { 152 for (; i > 0; i--) 153 x->xmlcomment(x, "-", 1); 154 } 155 i = 0; 156 } 157 158 if (datalen < sizeof(x->data) - 1) { 159 x->data[datalen++] = c; 160 } else { 161 x->data[datalen] = '\0'; 162 if (x->xmlcomment) 163 x->xmlcomment(x, x->data, datalen); 164 x->data[0] = c; 165 datalen = 1; 166 } 167 } 168 } 169 170 static void 171 xml_parsecdata(XMLParser *x) 172 { 173 size_t datalen = 0, i = 0; 174 int c; 175 176 if (x->xmlcdatastart) 177 x->xmlcdatastart(x); 178 while ((c = GETNEXT()) != EOF) { 179 if (c == ']' || c == '>') { 180 if (x->xmlcdata && datalen) { 181 x->data[datalen] = '\0'; 182 x->xmlcdata(x, x->data, datalen); 183 datalen = 0; 184 } 185 } 186 187 if (c == ']') { 188 if (++i > 2) { 189 if (x->xmlcdata) 190 for (; i > 2; i--) 191 x->xmlcdata(x, "]", 1); 192 i = 2; 193 } 194 continue; 195 } else if (c == '>' && i == 2) { 196 if (x->xmlcdataend) 197 x->xmlcdataend(x); 198 return; 199 } else if (i) { 200 if (x->xmlcdata) 201 for (; i > 0; i--) 202 x->xmlcdata(x, "]", 1); 203 i = 0; 204 } 205 206 if (datalen < sizeof(x->data) - 1) { 207 x->data[datalen++] = c; 208 } else { 209 x->data[datalen] = '\0'; 210 if (x->xmlcdata) 211 x->xmlcdata(x, x->data, datalen); 212 x->data[0] = c; 213 datalen = 1; 214 } 215 } 216 } 217 218 static int 219 codepointtoutf8(long r, char *s) 220 { 221 if (r == 0) { 222 return 0; /* NUL byte */ 223 } else if (r <= 0x7F) { 224 /* 1 byte: 0aaaaaaa */ 225 s[0] = r; 226 return 1; 227 } else if (r <= 0x07FF) { 228 /* 2 bytes: 00000aaa aabbbbbb */ 229 s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */ 230 s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */ 231 return 2; 232 } else if (r <= 0xFFFF) { 233 /* 3 bytes: aaaabbbb bbcccccc */ 234 s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */ 235 s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */ 236 s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */ 237 return 3; 238 } else { 239 /* 4 bytes: 000aaabb bbbbcccc ccdddddd */ 240 s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */ 241 s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */ 242 s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */ 243 s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */ 244 return 4; 245 } 246 } 247 248 struct namedentity { 249 const char *entity; 250 long cp; 251 }; 252 253 static int 254 namedentitycmp(const void *v1, const void *v2) 255 { 256 struct namedentity *n1 = (struct namedentity *)v1; 257 struct namedentity *n2 = (struct namedentity *)v2; 258 259 return strcmp(n1->entity, n2->entity); 260 } 261 262 static const struct namedentity entities[] = { 263 #include "namedentities.h" 264 }; 265 266 static int 267 namedentitytostr(const char *e, char *buf, size_t bufsiz) 268 { 269 struct namedentity find, *found; 270 size_t i; 271 272 /* buffer is too small */ 273 if (bufsiz < 5) 274 return -1; 275 276 find.entity = e; 277 found = bsearch(&find, entities, sizeof(entities) / sizeof(*entities), 278 sizeof(*entities), namedentitycmp); 279 if (found) { 280 i = codepointtoutf8(found->cp, buf); 281 buf[i] = '\0'; 282 return i; 283 } 284 return -1; 285 } 286 287 static int 288 numericentitytostr(const char *e, char *buf, size_t bufsiz) 289 { 290 long l; 291 int len; 292 char *end; 293 294 /* buffer is too small */ 295 if (bufsiz < 5) 296 return -1; 297 298 errno = 0; 299 /* hex (16) or decimal (10) */ 300 if (*e == 'x') 301 l = strtol(++e, &end, 16); 302 else 303 l = strtol(e, &end, 10); 304 /* invalid value or not a well-formed entity or invalid code point */ 305 if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff || 306 (l >= 0xd800 && l <= 0xdfff)) 307 return -1; 308 len = codepointtoutf8(l, buf); 309 buf[len] = '\0'; 310 311 return len; 312 } 313 314 /* convert named- or numeric entity string to buffer string 315 * returns byte-length of string or -1 on failure. */ 316 int 317 xml_entitytostr(const char *e, char *buf, size_t bufsiz) 318 { 319 /* doesn't start with & */ 320 if (e[0] != '&') 321 return -1; 322 /* numeric entity */ 323 if (e[1] == '#') 324 return numericentitytostr(e + 2, buf, bufsiz); 325 else /* named entity */ 326 return namedentitytostr(e + 1, buf, bufsiz); 327 } 328 329 void 330 xml_parse(XMLParser *x) 331 { 332 size_t datalen, tagdatalen; 333 int c, isend; 334 335 #ifdef HTML_MODE 336 goto read_data; 337 #else 338 /* HTML: process data before a tag occured aswell */ 339 while ((c = GETNEXT()) != EOF && c != '<') 340 ; /* skip until < */ 341 #endif 342 343 while (c != EOF) { 344 if (c == '<') { /* parse tag */ 345 if ((c = GETNEXT()) == EOF) 346 return; 347 348 if (c == '!') { /* CDATA and comments */ 349 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) { 350 /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */ 351 if (tagdatalen <= sizeof("[CDATA[") - 1) 352 x->data[tagdatalen++] = c; 353 if (c == '>') 354 break; 355 else if (c == '-' && tagdatalen == sizeof("--") - 1 && 356 (x->data[0] == '-')) { 357 xml_parsecomment(x); 358 break; 359 } else if (c == '[') { 360 if (tagdatalen == sizeof("[CDATA[") - 1 && 361 !strncmp(x->data, "[CDATA[", tagdatalen)) { 362 xml_parsecdata(x); 363 break; 364 } 365 } 366 } 367 } else { 368 /* normal tag (open, short open, close), processing instruction. */ 369 x->tag[0] = c; 370 x->taglen = 1; 371 x->isshorttag = isend = 0; 372 373 /* treat processing instruction as short tag, don't strip "?" prefix. */ 374 if (c == '?') { 375 x->isshorttag = 1; 376 } else if (c == '/') { 377 if ((c = GETNEXT()) == EOF) 378 return; 379 x->tag[0] = c; 380 isend = 1; 381 } 382 383 while ((c = GETNEXT()) != EOF) { 384 if (c == '/') 385 x->isshorttag = 1; /* short tag */ 386 else if (c == '>' || ISSPACE(c)) { 387 x->tag[x->taglen] = '\0'; 388 if (isend) { /* end tag, starts with </ */ 389 while (c != '>' && c != EOF) /* skip until > */ 390 c = GETNEXT(); 391 if (x->xmltagend) 392 x->xmltagend(x, x->tag, x->taglen, x->isshorttag); 393 x->tag[0] = '\0'; 394 x->taglen = 0; 395 } else { 396 /* start tag */ 397 if (x->xmltagstart) 398 x->xmltagstart(x, x->tag, x->taglen); 399 if (ISSPACE(c)) 400 xml_parseattrs(x); 401 if (x->xmltagstartparsed) 402 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag); 403 } 404 /* call tagend for short tag or processing instruction */ 405 if (x->isshorttag) { 406 if (x->xmltagend) 407 x->xmltagend(x, x->tag, x->taglen, x->isshorttag); 408 x->tag[0] = '\0'; 409 x->taglen = 0; 410 } 411 break; 412 } else if (x->taglen < sizeof(x->tag) - 1) 413 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */ 414 } 415 } 416 } else { 417 #ifdef HTML_MODE 418 read_data: 419 #endif 420 /* parse tag data */ 421 datalen = 0; 422 if (x->xmldatastart) 423 x->xmldatastart(x); 424 while ((c = GETNEXT()) != EOF) { 425 if (c == '&') { 426 if (datalen) { 427 x->data[datalen] = '\0'; 428 if (x->xmldata) 429 x->xmldata(x, x->data, datalen); 430 } 431 x->data[0] = c; 432 datalen = 1; 433 while ((c = GETNEXT()) != EOF) { 434 if (c == '<') 435 break; 436 if (datalen < sizeof(x->data) - 1) 437 x->data[datalen++] = c; 438 else { 439 /* entity too long for buffer, handle as normal data */ 440 x->data[datalen] = '\0'; 441 if (x->xmldata) 442 x->xmldata(x, x->data, datalen); 443 x->data[0] = c; 444 datalen = 1; 445 break; 446 } 447 if (c == ';') { 448 x->data[datalen] = '\0'; 449 if (x->xmldataentity) 450 x->xmldataentity(x, x->data, datalen); 451 datalen = 0; 452 break; 453 } 454 } 455 } else if (c != '<') { 456 if (datalen < sizeof(x->data) - 1) { 457 x->data[datalen++] = c; 458 } else { 459 x->data[datalen] = '\0'; 460 if (x->xmldata) 461 x->xmldata(x, x->data, datalen); 462 x->data[0] = c; 463 datalen = 1; 464 } 465 } 466 if (c == '<') { 467 x->data[datalen] = '\0'; 468 if (x->xmldata && datalen) 469 x->xmldata(x, x->data, datalen); 470 if (x->xmldataend) 471 x->xmldataend(x); 472 #ifdef HTML_MODE 473 datalen = 0; 474 #endif 475 break; 476 } 477 } 478 479 #ifdef HTML_MODE 480 /* pending data, even if a tag didn't close (EOF, etc). */ 481 if (datalen) { 482 x->data[datalen] = '\0'; 483 if (x->xmldata && datalen) 484 x->xmldata(x, x->data, datalen); 485 if (x->xmldataend) 486 x->xmldataend(x); 487 datalen = 0; 488 } 489 #endif 490 } 491 } 492 }