xml.c - frontends - front-ends for some sites (experiment) (DIR) Log (DIR) Files (DIR) Refs (DIR) README (DIR) LICENSE --- xml.c (11376B) --- 1 #include <errno.h> 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <string.h> 5 6 #include "xml.h" 7 8 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26) 9 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5)) 10 11 /* data buffers, size and offset used for parsing XML, see getnext() */ 12 static const unsigned char *xml_data_buf; 13 static size_t xml_data_size; 14 static size_t xml_data_off; 15 16 void 17 setxmldata(const char *s, size_t len) 18 { 19 xml_data_off = 0; 20 xml_data_size = len; 21 xml_data_buf = (unsigned char *)s; 22 } 23 24 static int 25 getnext(void) 26 { 27 if (xml_data_off >= xml_data_size) 28 return EOF; 29 return xml_data_buf[xml_data_off++]; 30 } 31 32 static void 33 xml_parseattrs(XMLParser *x) 34 { 35 size_t namelen = 0, valuelen; 36 int c, endsep, endname = 0, valuestart = 0; 37 38 while ((c = GETNEXT()) != EOF) { 39 if (ISSPACE(c)) { 40 if (namelen) 41 endname = 1; 42 continue; 43 } else if (c == '?') 44 ; /* ignore */ 45 else if (c == '=') { 46 x->name[namelen] = '\0'; 47 valuestart = 1; 48 endname = 1; 49 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) { 50 /* attribute without value */ 51 x->name[namelen] = '\0'; 52 if (x->xmlattrstart) 53 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); 54 if (x->xmlattr) 55 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0); 56 if (x->xmlattrend) 57 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); 58 endname = 0; 59 x->name[0] = c; 60 namelen = 1; 61 } else if (namelen && valuestart) { 62 /* attribute with value */ 63 if (x->xmlattrstart) 64 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); 65 66 valuelen = 0; 67 if (c == '\'' || c == '"') { 68 endsep = c; 69 } else { 70 endsep = ' '; /* ISSPACE() */ 71 goto startvalue; 72 } 73 74 while ((c = GETNEXT()) != EOF) { 75 startvalue: 76 if (c == '&') { /* entities */ 77 x->data[valuelen] = '\0'; 78 /* call data function with data before entity if there is data */ 79 if (valuelen && x->xmlattr) 80 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 81 x->data[0] = c; 82 valuelen = 1; 83 while ((c = GETNEXT()) != EOF) { 84 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) 85 break; 86 if (valuelen < sizeof(x->data) - 1) 87 x->data[valuelen++] = c; 88 else { 89 /* entity too long for buffer, handle as normal data */ 90 x->data[valuelen] = '\0'; 91 if (x->xmlattr) 92 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 93 x->data[0] = c; 94 valuelen = 1; 95 break; 96 } 97 if (c == ';') { 98 x->data[valuelen] = '\0'; 99 if (x->xmlattrentity) 100 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 101 valuelen = 0; 102 break; 103 } 104 } 105 } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) { 106 if (valuelen < sizeof(x->data) - 1) { 107 x->data[valuelen++] = c; 108 } else { 109 x->data[valuelen] = '\0'; 110 if (x->xmlattr) 111 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 112 x->data[0] = c; 113 valuelen = 1; 114 } 115 } 116 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) { 117 x->data[valuelen] = '\0'; 118 if (x->xmlattr) 119 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 120 if (x->xmlattrend) 121 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); 122 break; 123 } 124 } 125 namelen = endname = valuestart = 0; 126 } else if (namelen < sizeof(x->name) - 1) { 127 x->name[namelen++] = c; 128 } 129 if (c == '>') { 130 break; 131 } else if (c == '/') { 132 x->isshorttag = 1; 133 x->name[0] = '\0'; 134 namelen = 0; 135 } 136 } 137 } 138 139 static void 140 xml_parsecomment(XMLParser *x) 141 { 142 size_t datalen = 0, i = 0; 143 int c; 144 145 if (x->xmlcommentstart) 146 x->xmlcommentstart(x); 147 while ((c = GETNEXT()) != EOF) { 148 if (c == '-' || c == '>') { 149 if (x->xmlcomment && datalen) { 150 x->data[datalen] = '\0'; 151 x->xmlcomment(x, x->data, datalen); 152 datalen = 0; 153 } 154 } 155 156 if (c == '-') { 157 if (++i > 2) { 158 if (x->xmlcomment) 159 for (; i > 2; i--) 160 x->xmlcomment(x, "-", 1); 161 i = 2; 162 } 163 continue; 164 } else if (c == '>' && i == 2) { 165 if (x->xmlcommentend) 166 x->xmlcommentend(x); 167 return; 168 } else if (i) { 169 if (x->xmlcomment) { 170 for (; i > 0; i--) 171 x->xmlcomment(x, "-", 1); 172 } 173 i = 0; 174 } 175 176 if (datalen < sizeof(x->data) - 1) { 177 x->data[datalen++] = c; 178 } else { 179 x->data[datalen] = '\0'; 180 if (x->xmlcomment) 181 x->xmlcomment(x, x->data, datalen); 182 x->data[0] = c; 183 datalen = 1; 184 } 185 } 186 } 187 188 static void 189 xml_parsecdata(XMLParser *x) 190 { 191 size_t datalen = 0, i = 0; 192 int c; 193 194 if (x->xmlcdatastart) 195 x->xmlcdatastart(x); 196 while ((c = GETNEXT()) != EOF) { 197 if (c == ']' || c == '>') { 198 if (x->xmlcdata && datalen) { 199 x->data[datalen] = '\0'; 200 x->xmlcdata(x, x->data, datalen); 201 datalen = 0; 202 } 203 } 204 205 if (c == ']') { 206 if (++i > 2) { 207 if (x->xmlcdata) 208 for (; i > 2; i--) 209 x->xmlcdata(x, "]", 1); 210 i = 2; 211 } 212 continue; 213 } else if (c == '>' && i == 2) { 214 if (x->xmlcdataend) 215 x->xmlcdataend(x); 216 return; 217 } else if (i) { 218 if (x->xmlcdata) 219 for (; i > 0; i--) 220 x->xmlcdata(x, "]", 1); 221 i = 0; 222 } 223 224 if (datalen < sizeof(x->data) - 1) { 225 x->data[datalen++] = c; 226 } else { 227 x->data[datalen] = '\0'; 228 if (x->xmlcdata) 229 x->xmlcdata(x, x->data, datalen); 230 x->data[0] = c; 231 datalen = 1; 232 } 233 } 234 } 235 236 static int 237 codepointtoutf8(long r, char *s) 238 { 239 if (r == 0) { 240 return 0; /* NUL byte */ 241 } else if (r <= 0x7F) { 242 /* 1 byte: 0aaaaaaa */ 243 s[0] = r; 244 return 1; 245 } else if (r <= 0x07FF) { 246 /* 2 bytes: 00000aaa aabbbbbb */ 247 s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */ 248 s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */ 249 return 2; 250 } else if (r <= 0xFFFF) { 251 /* 3 bytes: aaaabbbb bbcccccc */ 252 s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */ 253 s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */ 254 s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */ 255 return 3; 256 } else { 257 /* 4 bytes: 000aaabb bbbbcccc ccdddddd */ 258 s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */ 259 s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */ 260 s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */ 261 s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */ 262 return 4; 263 } 264 } 265 266 static int 267 namedentitytostr(const char *e, char *buf, size_t bufsiz) 268 { 269 static const struct { 270 const char *entity; 271 int c; 272 } entities[] = { 273 { "amp;", '&' }, 274 { "lt;", '<' }, 275 { "gt;", '>' }, 276 { "apos;", '\'' }, 277 { "quot;", '"' }, 278 { "AMP;", '&' }, 279 { "LT;", '<' }, 280 { "GT;", '>' }, 281 { "APOS;", '\'' }, 282 { "QUOT;", '"' } 283 }; 284 size_t i; 285 286 /* buffer is too small */ 287 if (bufsiz < 2) 288 return -1; 289 290 for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) { 291 if (!strcmp(e, entities[i].entity)) { 292 buf[0] = entities[i].c; 293 buf[1] = '\0'; 294 return 1; 295 } 296 } 297 return -1; 298 } 299 300 static int 301 numericentitytostr(const char *e, char *buf, size_t bufsiz) 302 { 303 long l; 304 int len; 305 char *end; 306 307 /* buffer is too small */ 308 if (bufsiz < 5) 309 return -1; 310 311 errno = 0; 312 /* hex (16) or decimal (10) */ 313 if (*e == 'x') 314 l = strtol(++e, &end, 16); 315 else 316 l = strtol(e, &end, 10); 317 /* invalid value or not a well-formed entity or invalid code point */ 318 if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff || 319 (l >= 0xd800 && l <= 0xdfff)) 320 return -1; 321 len = codepointtoutf8(l, buf); 322 buf[len] = '\0'; 323 324 return len; 325 } 326 327 /* convert named- or numeric entity string to buffer string 328 * returns byte-length of string or -1 on failure. */ 329 int 330 xml_entitytostr(const char *e, char *buf, size_t bufsiz) 331 { 332 /* doesn't start with & */ 333 if (e[0] != '&') 334 return -1; 335 /* numeric entity */ 336 if (e[1] == '#') 337 return numericentitytostr(e + 2, buf, bufsiz); 338 else /* named entity */ 339 return namedentitytostr(e + 1, buf, bufsiz); 340 } 341 342 void 343 xml_parse(XMLParser *x) 344 { 345 size_t datalen, tagdatalen; 346 int c, isend; 347 348 while ((c = GETNEXT()) != EOF && c != '<') 349 ; /* skip until < */ 350 351 while (c != EOF) { 352 if (c == '<') { /* parse tag */ 353 if ((c = GETNEXT()) == EOF) 354 return; 355 356 if (c == '!') { /* cdata and comments */ 357 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) { 358 /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */ 359 if (tagdatalen <= sizeof("[CDATA[") - 1) 360 x->data[tagdatalen++] = c; 361 if (c == '>') 362 break; 363 else if (c == '-' && tagdatalen == sizeof("--") - 1 && 364 (x->data[0] == '-')) { 365 xml_parsecomment(x); 366 break; 367 } else if (c == '[') { 368 if (tagdatalen == sizeof("[CDATA[") - 1 && 369 !strncmp(x->data, "[CDATA[", tagdatalen)) { 370 xml_parsecdata(x); 371 break; 372 } 373 } 374 } 375 } else { 376 /* normal tag (open, short open, close), processing instruction. */ 377 x->tag[0] = c; 378 x->taglen = 1; 379 x->isshorttag = isend = 0; 380 381 /* treat processing instruction as shorttag, don't strip "?" prefix. */ 382 if (c == '?') { 383 x->isshorttag = 1; 384 } else if (c == '/') { 385 if ((c = GETNEXT()) == EOF) 386 return; 387 x->tag[0] = c; 388 isend = 1; 389 } 390 391 while ((c = GETNEXT()) != EOF) { 392 if (c == '/') 393 x->isshorttag = 1; /* short tag */ 394 else if (c == '>' || ISSPACE(c)) { 395 x->tag[x->taglen] = '\0'; 396 if (isend) { /* end tag, starts with </ */ 397 if (x->xmltagend) 398 x->xmltagend(x, x->tag, x->taglen, x->isshorttag); 399 x->tag[0] = '\0'; 400 x->taglen = 0; 401 } else { 402 /* start tag */ 403 if (x->xmltagstart) 404 x->xmltagstart(x, x->tag, x->taglen); 405 if (ISSPACE(c)) 406 xml_parseattrs(x); 407 if (x->xmltagstartparsed) 408 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag); 409 } 410 /* call tagend for shortform or processing instruction */ 411 if (x->isshorttag) { 412 if (x->xmltagend) 413 x->xmltagend(x, x->tag, x->taglen, x->isshorttag); 414 x->tag[0] = '\0'; 415 x->taglen = 0; 416 } 417 break; 418 } else if (x->taglen < sizeof(x->tag) - 1) 419 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */ 420 } 421 } 422 } else { 423 /* parse tag data */ 424 datalen = 0; 425 if (x->xmldatastart) 426 x->xmldatastart(x); 427 while ((c = GETNEXT()) != EOF) { 428 if (c == '&') { 429 if (datalen) { 430 x->data[datalen] = '\0'; 431 if (x->xmldata) 432 x->xmldata(x, x->data, datalen); 433 } 434 x->data[0] = c; 435 datalen = 1; 436 while ((c = GETNEXT()) != EOF) { 437 if (c == '<') 438 break; 439 if (datalen < sizeof(x->data) - 1) 440 x->data[datalen++] = c; 441 else { 442 /* entity too long for buffer, handle as normal data */ 443 x->data[datalen] = '\0'; 444 if (x->xmldata) 445 x->xmldata(x, x->data, datalen); 446 x->data[0] = c; 447 datalen = 1; 448 break; 449 } 450 if (c == ';') { 451 x->data[datalen] = '\0'; 452 if (x->xmldataentity) 453 x->xmldataentity(x, x->data, datalen); 454 datalen = 0; 455 break; 456 } 457 } 458 } else if (c != '<') { 459 if (datalen < sizeof(x->data) - 1) { 460 x->data[datalen++] = c; 461 } else { 462 x->data[datalen] = '\0'; 463 if (x->xmldata) 464 x->xmldata(x, x->data, datalen); 465 x->data[0] = c; 466 datalen = 1; 467 } 468 } 469 if (c == '<') { 470 x->data[datalen] = '\0'; 471 if (x->xmldata && datalen) 472 x->xmldata(x, x->data, datalen); 473 if (x->xmldataend) 474 x->xmldataend(x); 475 break; 476 } 477 } 478 } 479 } 480 }