/*
 * parse.c - bag - Dutch BAG Kadaster Extract parser (subset)
 *
 * git clone git://git.codemadness.org/bag
 * See the README and LICENSE files in the repository.
 */

/* putchar_unlocked() and MAP_FILE are not ISO C: request the default
 * (POSIX + misc) declarations explicitly so that builds using a strict
 * -std=c99/-std=c11 flag on systems honouring feature-test macros
 * (e.g. glibc) still see them. */
#ifndef _DEFAULT_SOURCE
#define _DEFAULT_SOURCE
#endif

#define USE_MMAP

#if WIN32
#include <io.h> /* for setmode() */
#endif

#ifdef USE_MMAP
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>

#include <err.h>
#include <fcntl.h>
#endif

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* ctype-like macros, but always compatible with ASCII / UTF-8 */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f)
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))

#define PUTCHAR putchar_unlocked
/*#define PUTCHAR putchar*/

/* one output record; every field is a NUL-terminated string filled by concat() */
struct address {
	char bagnr[64];
	char oppervlakte[256];
	char status[256];
	char gebruiksdoel[256];
	char huisnummer[32];
	char huisletter[32];
	char huisnummertoevoeging[32];
	char postcode[8];
};

typedef struct xmlparser {
	/* current tag */
	char tag[1024];
	size_t taglen;
	/* current tag is a short tag ? <tag /> */
	int isshorttag;
	/* current attribute name */
	char name[1024];
	/* data buffer used for tag data, CDATA and attribute data */
	char data[BUFSIZ];
} XMLParser;

int xml_entitytostr(const char *, char *, size_t);
void xml_parse(XMLParser *);

/* parser callbacks */
static void xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
	const char *v, size_t vl);
static void xmldata(XMLParser *x, const char *d, size_t dl);
static void xmltagend(XMLParser *x, const char *t, size_t tl, int isshort);
static void xmltagstart(XMLParser *x, const char *t, size_t tl);

static XMLParser x;
static struct address address;
/* state flags maintained by the callbacks */
static int inbagobject, innummeraanduiding, inhoofdadres;
static int isbagnrtype;
static int eindgeldig;

/* different readers, performance differs per platform */
#ifdef USE_MMAP

/* the input file mapped read-only: reg[0..len) with read position off.
 * file-local: nothing outside this file is expected to touch them */
static int fd;
static struct stat st;
static unsigned char *reg;
static size_t len, off;

/* next input byte as a value in 0..255, or EOF once the mapping is exhausted */
#define GETNEXT() (off >= len ? EOF : reg[off++])

#else

#if 1
#define GETNEXT getchar_unlocked
#else
static size_t roffset, rtotal;
/* unsigned char so bytes >= 0x80 are never returned as negative values
 * (a plain char 0xFF would be indistinguishable from EOF) */
static unsigned char rbuf[4096*4];

/* buffered reader on stdin: returns the next byte (0..255) or EOF at the
 * end of input; exits with status 1 on a read error. */
int
getnext(void)
{
	if (roffset >= rtotal) {
		roffset = 0;
		rtotal = fread(rbuf, 1, sizeof(rbuf), stdin);
		if (ferror(stdin)) {
			perror(NULL);
			exit(1);
		}
		/* only an empty read means end of input: the final fread() may
		 * return data and set the EOF indicator at the same time */
		if (rtotal == 0)
			return EOF;
	}
	return rbuf[roffset++];
}

#define GETNEXT getnext
#endif
#endif

/* non-zero when c terminates an attribute value delimited by endsep:
 * the closing quote itself or, for an unquoted value (endsep == ' '),
 * any whitespace or the '>' that closes the tag */
static int
attrvalueend(int c, int endsep)
{
	return c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)));
}

/*
 * Parse the attributes of the current start tag, reading up to and
 * including the closing '>' (or EOF).
 * xmlattr() is called for each attribute; a value that does not fit in
 * x->data is delivered in several calls. A '/' sets x->isshorttag.
 * Attribute names longer than sizeof(x->name) - 1 are truncated.
 * NOTE: an entity inside a value ("&...;") is collected in x->data and then
 * discarded when the ';' is seen: no callback receives it.
 */
static void
xml_parseattrs(XMLParser *x)
{
	size_t namelen = 0, valuelen;
	int c, endsep, endname = 0, valuestart = 0;

	while ((c = GETNEXT()) != EOF) {
		if (ISSPACE(c)) {
			/* whitespace after (part of) a name ends the name */
			if (namelen)
				endname = 1;
			continue;
		} else if (c == '?')
			; /* ignore */
		else if (c == '=') {
			x->name[namelen] = '\0';
			valuestart = 1;
			endname = 1;
		} else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
			/* attribute without value */
			x->name[namelen] = '\0';
			xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
			endname = 0;
			/* c already belongs to the next name (or is '>' / '/',
			 * which is handled at the bottom of the loop) */
			x->name[0] = c;
			namelen = 1;
		} else if (namelen && valuestart) {
			/* attribute with value */

			valuelen = 0;
			if (c == '\'' || c == '"') {
				endsep = c;
			} else {
				/* unquoted value: c is its first character */
				endsep = ' '; /* ISSPACE() */
				goto startvalue;
			}

			while ((c = GETNEXT()) != EOF) {
startvalue:
				if (c == '&') { /* entities */
					x->data[valuelen] = '\0';
					/* call data function with data before entity if there is data */
					if (valuelen)
						xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
					x->data[0] = c;
					valuelen = 1;
					while ((c = GETNEXT()) != EOF) {
						if (attrvalueend(c, endsep))
							break;
						if (valuelen < sizeof(x->data) - 1)
							x->data[valuelen++] = c;
						else {
							/* entity too long for buffer, handle as normal data */
							x->data[valuelen] = '\0';
							xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
							x->data[0] = c;
							valuelen = 1;
							break;
						}
						if (c == ';') {
							/* complete entity: dropped, see NOTE above */
							x->data[valuelen] = '\0';
							valuelen = 0;
							break;
						}
					}
				} else if (!attrvalueend(c, endsep)) {
					if (valuelen < sizeof(x->data) - 1) {
						x->data[valuelen++] = c;
					} else {
						/* buffer full: flush and start a new chunk */
						x->data[valuelen] = '\0';
						xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
						x->data[0] = c;
						valuelen = 1;
					}
				}
				if (attrvalueend(c, endsep)) {
					x->data[valuelen] = '\0';
					xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
					break;
				}
			}
			namelen = endname = valuestart = 0;
		} else if (namelen < sizeof(x->name) - 1) {
			x->name[namelen++] = c;
		}
		if (c == '>') {
			break;
		} else if (c == '/') {
			x->isshorttag = 1;
			x->name[0] = '\0';
			namelen = 0;
		}
	}
}

206 static void 207 xml_parsecomment(XMLParser *x) 208 { 209 size_t i = 0; 210 int c; 211 212 while ((c = GETNEXT()) != EOF) { 213 if (c == '-') { 214 if (++i > 2) { 215 i = 2; 216 } 217 continue; 218 } else if (c == '>' && i == 2) { 219 return; 220 } else if (i) { 221 i = 0; 222 } 223 } 224 } 225 226 static void 227 xml_parsecdata(XMLParser *x) 228 { 229 size_t datalen = 0, i = 0; 230 int c; 231 232 while ((c = GETNEXT()) != EOF) { 233 if (c == ']' || c == '>') { 234 if (datalen) { 235 x->data[datalen] = '\0'; 236 xmldata(x, x->data, datalen); 237 datalen = 0; 238 } 239 } 240 241 if (c == ']') { 242 if (++i > 2) { 243 for (; i > 2; i--) 244 xmldata(x, "]", 1); 245 i = 2; 246 } 247 continue; 248 } else if (c == '>' && i == 2) { 249 return; 250 } else if (i) { 251 for (; i > 0; i--) 252 xmldata(x, "]", 1); 253 i = 0; 254 } 255 256 if (datalen < sizeof(x->data) - 1) { 257 x->data[datalen++] = c; 258 } else { 259 x->data[datalen] = '\0'; 260 xmldata(x, x->data, datalen); 261 x->data[0] = c; 262 datalen = 1; 263 } 264 } 265 } 266 267 static int 268 codepointtoutf8(long r, char *s) 269 { 270 if (r == 0) { 271 return 0; /* NUL byte */ 272 } else if (r <= 0x7F) { 273 /* 1 byte: 0aaaaaaa */ 274 s[0] = r; 275 return 1; 276 } else if (r <= 0x07FF) { 277 /* 2 bytes: 00000aaa aabbbbbb */ 278 s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */ 279 s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */ 280 return 2; 281 } else if (r <= 0xFFFF) { 282 /* 3 bytes: aaaabbbb bbcccccc */ 283 s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */ 284 s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */ 285 s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */ 286 return 3; 287 } else { 288 /* 4 bytes: 000aaabb bbbbcccc ccdddddd */ 289 s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */ 290 s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */ 291 s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */ 292 s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */ 293 return 4; 294 } 295 } 296 297 static int 298 
namedentitytostr(const char *e, char *buf, size_t bufsiz) 299 { 300 static const struct { 301 const char *entity; 302 int c; 303 } entities[] = { 304 { "amp;", '&' }, 305 { "lt;", '<' }, 306 { "gt;", '>' }, 307 { "apos;", '\'' }, 308 { "quot;", '"' }, 309 }; 310 size_t i; 311 312 /* buffer is too small */ 313 if (bufsiz < 2) 314 return -1; 315 316 for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) { 317 if (!strcmp(e, entities[i].entity)) { 318 buf[0] = entities[i].c; 319 buf[1] = '\0'; 320 return 1; 321 } 322 } 323 return -1; 324 } 325 326 static int 327 numericentitytostr(const char *e, char *buf, size_t bufsiz) 328 { 329 long l; 330 int len; 331 char *end; 332 333 /* buffer is too small */ 334 if (bufsiz < 5) 335 return -1; 336 337 errno = 0; 338 /* hex (16) or decimal (10) */ 339 if (*e == 'x') 340 l = strtol(++e, &end, 16); 341 else 342 l = strtol(e, &end, 10); 343 /* invalid value or not a well-formed entity or invalid code point */ 344 if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff || 345 (l >= 0xd800 && l <= 0xdfff)) 346 return -1; 347 len = codepointtoutf8(l, buf); 348 buf[len] = '\0'; 349 350 return len; 351 } 352 353 /* convert named- or numeric entity string to buffer string 354 * returns byte-length of string or -1 on failure. 
*/ 355 int 356 xml_entitytostr(const char *e, char *buf, size_t bufsiz) 357 { 358 /* doesn't start with & */ 359 if (e[0] != '&') 360 return -1; 361 /* numeric entity */ 362 if (e[1] == '#') 363 return numericentitytostr(e + 2, buf, bufsiz); 364 else /* named entity */ 365 return namedentitytostr(e + 1, buf, bufsiz); 366 } 367 368 void 369 xml_parse(XMLParser *x) 370 { 371 size_t datalen, tagdatalen; 372 int c, isend; 373 374 while ((c = GETNEXT()) != EOF && c != '<') 375 ; /* skip until < */ 376 377 while (c != EOF) { 378 if (c == '<') { /* parse tag */ 379 if ((c = GETNEXT()) == EOF) 380 return; 381 382 if (c == '!') { /* CDATA and comments */ 383 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) { 384 /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */ 385 if (tagdatalen <= sizeof("[CDATA[") - 1) 386 x->data[tagdatalen++] = c; 387 if (c == '>') 388 break; 389 else if (c == '-' && tagdatalen == sizeof("--") - 1 && 390 (x->data[0] == '-')) { 391 xml_parsecomment(x); 392 break; 393 } else if (c == '[') { 394 if (tagdatalen == sizeof("[CDATA[") - 1 && 395 !strncmp(x->data, "[CDATA[", tagdatalen)) { 396 xml_parsecdata(x); 397 break; 398 } 399 } 400 } 401 } else { 402 /* normal tag (open, short open, close), processing instruction. */ 403 x->tag[0] = c; 404 x->taglen = 1; 405 x->isshorttag = isend = 0; 406 407 /* treat processing instruction as short tag, don't strip "?" prefix. 
*/ 408 if (c == '?') { 409 x->isshorttag = 1; 410 } else if (c == '/') { 411 if ((c = GETNEXT()) == EOF) 412 return; 413 x->tag[0] = c; 414 isend = 1; 415 } 416 417 while ((c = GETNEXT()) != EOF) { 418 if (c == '/') 419 x->isshorttag = 1; /* short tag */ 420 else if (c == '>' || ISSPACE(c)) { 421 x->tag[x->taglen] = '\0'; 422 if (isend) { /* end tag, starts with </ */ 423 xmltagend(x, x->tag, x->taglen, x->isshorttag); 424 x->tag[0] = '\0'; 425 x->taglen = 0; 426 } else { 427 /* start tag */ 428 xmltagstart(x, x->tag, x->taglen); 429 if (ISSPACE(c)) 430 xml_parseattrs(x); 431 } 432 /* call tagend for short tag or processing instruction */ 433 if (x->isshorttag) { 434 xmltagend(x, x->tag, x->taglen, x->isshorttag); 435 x->tag[0] = '\0'; 436 x->taglen = 0; 437 } 438 break; 439 } else if (x->taglen < sizeof(x->tag) - 1) 440 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */ 441 } 442 } 443 } else { 444 /* parse tag data */ 445 datalen = 0; 446 while ((c = GETNEXT()) != EOF) { 447 if (c == '&') { 448 if (datalen) { 449 x->data[datalen] = '\0'; 450 xmldata(x, x->data, datalen); 451 } 452 x->data[0] = c; 453 datalen = 1; 454 while ((c = GETNEXT()) != EOF) { 455 if (c == '<') 456 break; 457 if (datalen < sizeof(x->data) - 1) 458 x->data[datalen++] = c; 459 else { 460 /* entity too long for buffer, handle as normal data */ 461 x->data[datalen] = '\0'; 462 xmldata(x, x->data, datalen); 463 x->data[0] = c; 464 datalen = 1; 465 break; 466 } 467 if (c == ';') { 468 x->data[datalen] = '\0'; 469 datalen = 0; 470 break; 471 } 472 } 473 } else if (c != '<') { 474 if (datalen < sizeof(x->data) - 1) { 475 x->data[datalen++] = c; 476 } else { 477 x->data[datalen] = '\0'; 478 xmldata(x, x->data, datalen); 479 x->data[0] = c; 480 datalen = 1; 481 } 482 } 483 if (c == '<') { 484 x->data[datalen] = '\0'; 485 if (datalen) 486 xmldata(x, x->data, datalen); 487 break; 488 } 489 } 490 } 491 } 492 } 493 494 static void 495 clearaddress(struct address *a) 496 { 497 a->bagnr[0] = '\0'; 
498 a->oppervlakte[0] = '\0'; 499 a->status[0] = '\0'; 500 a->gebruiksdoel[0] = '\0'; 501 a->huisnummer[0] = '\0'; 502 a->huisletter[0] = '\0'; 503 a->huisnummertoevoeging[0] = '\0'; 504 a->postcode[0] = '\0'; 505 } 506 507 static char * 508 ltrim(const char *s) 509 { 510 for (; ISSPACE((unsigned char)*s); s++) 511 ; 512 return (char *)s; 513 } 514 515 /* changed version of strlcpy: copy all non-control characters */ 516 static size_t 517 concat(char *dst, const char *src, size_t dsize) 518 { 519 const char *odst = dst; 520 const char *osrc = src; 521 size_t n = dsize; 522 size_t dlen; 523 524 dst = ltrim(dst); 525 526 /* Find the end of dst and adjust bytes left but don't go past end. */ 527 while (n-- != 0 && *dst != '\0') 528 dst++; 529 dlen = dst - odst; 530 n = dsize - dlen; 531 532 if (n-- == 0) 533 return(dlen + strlen(src)); 534 while (*src != '\0') { 535 if (n != 0 && !ISCNTRL((unsigned char)*src)) { 536 *dst++ = *src; 537 n--; 538 } 539 src++; 540 } 541 *dst = '\0'; 542 543 return(dlen + (src - osrc)); /* count does not include NUL */ 544 } 545 546 static void 547 printfield(const char *s) 548 { 549 /* for (; *s; s++) 550 PUTCHAR(*s);*/ 551 fputs(s, stdout); 552 } 553 554 static void 555 printaddress(void) 556 { 557 if (!address.bagnr[0]) 558 return; 559 /* historical: ignore */ 560 if (eindgeldig) 561 return; 562 563 printfield(address.bagnr); 564 PUTCHAR('\t'); 565 /* NUM */ 566 printfield(address.postcode); 567 PUTCHAR('\t'); 568 printfield(address.huisnummer); 569 PUTCHAR('\t'); 570 printfield(address.huisletter); 571 PUTCHAR('\t'); 572 printfield(address.huisnummertoevoeging); 573 PUTCHAR('\t'); 574 /* VBO */ 575 printfield(address.status); 576 PUTCHAR('\t'); 577 printfield(address.oppervlakte); 578 PUTCHAR('\t'); 579 printfield(address.gebruiksdoel); 580 PUTCHAR('\n'); 581 } 582 583 static void 584 xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al, 585 const char *v, size_t vl) 586 { 587 if (a[0] != 'd' || t[0] != 'O') 588 
return; 589 if (!strcmp(t, "Objecten:identificatie") || !strcmp(t, "Objecten-ref:NummeraanduidingRef")) 590 if (!strcmp(a, "domein") && !strcmp(v, "NL.IMBAG.Nummeraanduiding")) { 591 isbagnrtype = 1; 592 } 593 } 594 595 static void 596 xmldata(XMLParser *x, const char *d, size_t dl) 597 { 598 if (x->tag[0] != 'O') 599 return; 600 601 if (!strcmp(x->tag, "Objecten:postcode")) { 602 concat(address.postcode, d, sizeof(address.postcode)); 603 } else if (!strcmp(x->tag, "Objecten:huisnummer")) { 604 concat(address.huisnummer, d, sizeof(address.huisnummer)); 605 } else if (!strcmp(x->tag, "Objecten:huisletter")) { 606 concat(address.huisletter, d, sizeof(address.huisletter)); 607 } else if (!strcmp(x->tag, "Objecten:huisnummertoevoeging")) { 608 concat(address.huisnummertoevoeging, d, sizeof(address.huisnummertoevoeging)); 609 } else if (isbagnrtype && !strcmp(x->tag, "Objecten:identificatie")) { 610 concat(address.bagnr, d, sizeof(address.bagnr)); 611 } else if (inhoofdadres && isbagnrtype && !strcmp(x->tag, "Objecten-ref:NummeraanduidingRef")) { 612 concat(address.bagnr, d, sizeof(address.bagnr)); 613 } else if (!strcmp(x->tag, "Objecten:oppervlakte")) { 614 concat(address.oppervlakte, d, sizeof(address.oppervlakte)); 615 } else if (!strcmp(x->tag, "Objecten:status")) { 616 concat(address.status, d, sizeof(address.status)); 617 } else if (!strcmp(x->tag, "Objecten:gebruiksdoel")) { 618 if (address.gebruiksdoel[0]) 619 concat(address.gebruiksdoel, ", ", sizeof(address.gebruiksdoel)); 620 concat(address.gebruiksdoel, d, sizeof(address.gebruiksdoel)); 621 } 622 } 623 624 static void 625 xmltagend(XMLParser *x, const char *t, size_t tl, int isshort) 626 { 627 if (t[0] != 's' && t[0] != 'O') 628 return; 629 if (inbagobject && !strcmp(t, "sl-bag-extract:bagObject")) { 630 printaddress(); 631 632 inbagobject = 0; 633 innummeraanduiding = 0; 634 inhoofdadres = 0; 635 eindgeldig = 0; 636 clearaddress(&address); 637 } else if (innummeraanduiding) { 638 if (!strcmp(t, 
"Objecten:Nummeraanduiding") || !strcmp(t, "Objecten-ref:NummeraanduidingRef")) { 639 innummeraanduiding = 0; 640 isbagnrtype = 0; 641 } 642 } else if (isbagnrtype && !strcmp(t, "Objecten:identificatie")) { 643 isbagnrtype = 0; 644 } else if (inhoofdadres && !strcmp(t, "Objecten:heeftAlsHoofdadres")) { 645 inhoofdadres = 0; 646 } 647 } 648 649 static void 650 xmltagstart(XMLParser *x, const char *t, size_t tl) 651 { 652 if (t[0] != 's' && t[0] != 'O' && t[0] != 'H') 653 return; 654 if (!inbagobject && !strcmp(t, "sl-bag-extract:bagObject")) { 655 inbagobject = 1; 656 eindgeldig = 0; 657 clearaddress(&address); 658 } else if (inbagobject) { 659 if (!innummeraanduiding && !strcmp(t, "Objecten:Nummeraanduiding")) 660 innummeraanduiding = 1; 661 662 if (!inhoofdadres && !strcmp(t, "Objecten:heeftAlsHoofdadres")) 663 inhoofdadres = 1; 664 665 if (isbagnrtype) { 666 if (!strcmp(x->tag, "Objecten:identificatie") || !strcmp(x->tag, "Objecten-ref:NummeraanduidingRef")) 667 isbagnrtype = 0; 668 } 669 /* historical document */ 670 if (!strcmp(x->tag, "Historie:eindGeldigheid")) { 671 eindgeldig = 1; 672 } 673 } 674 } 675 676 int 677 main(int argc, char *argv[]) 678 { 679 #ifdef USE_MMAP 680 if (argc < 2) { 681 fprintf(stderr, "usage: %s <file>\n", argv[0]); 682 return 1; 683 } 684 685 if ((fd = open(argv[1], O_RDONLY)) < 0) 686 err(1, "open"); 687 if (fstat(fd, &st) < 0) 688 err(1, "fstat"); 689 690 off = 0; 691 len = st.st_size; 692 /*posix_fadvise(fd, 0, len, POSIX_FADV_SEQUENTIAL);*/ /* Linux */ 693 if ((reg = mmap(0, len, PROT_READ, MAP_SHARED|MAP_FILE, fd, off)) == MAP_FAILED) 694 err(1, "mmap"); 695 696 xml_parse(&x); 697 698 /* progress meter */ 699 /*fprintf(stderr, "\rProgress: %.2f%%\n", 100.0);*/ 700 701 munmap(reg, len); 702 close(fd); 703 #else 704 /* required for Windows binary mode aka more retarded bullshit. 
*/ 705 #if WIN32 706 /* binary mode for stdin, stdout and stderr */ 707 _setmode(0, 0x8000); /* 0x8000 is O_BINARY */ 708 _setmode(1, 0x8000); 709 _setmode(2, 0x8000); 710 #endif 711 712 xml_parse(&x); 713 #endif 714 715 printaddress(); 716 717 return 0; 718 }