initial repo - bag - BAG Kadaster Extract parser (subset) (HTM) git clone git://git.codemadness.org/bag (DIR) Log (DIR) Files (DIR) Refs (DIR) README (DIR) LICENSE --- (DIR) commit bc7bd116af0cada05627c574f5b0f6c69a82da36 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org> Date: Sat, 18 Nov 2023 23:23:31 +0100 initial repo Diffstat: A LICENSE | 15 +++++++++++++++ A Makefile | 6 ++++++ A README | 21 +++++++++++++++++++++ A glue.awk | 39 +++++++++++++++++++++++++++++++ A glue.c | 146 +++++++++++++++++++++++++++++++ A parse.c | 718 +++++++++++++++++++++++++++++++ A process.sh | 71 +++++++++++++++++++++++++++++++ A xml.c | 480 +++++++++++++++++++++++++++++++ A xml.h | 44 +++++++++++++++++++++++++++++++ 9 files changed, 1540 insertions(+), 0 deletions(-) --- (DIR) diff --git a/LICENSE b/LICENSE @@ -0,0 +1,15 @@ +ISC License + +Copyright (c) 2023 Hiltjo Posthuma <hiltjo@codemadness.org> + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
(DIR) diff --git a/Makefile b/Makefile @@ -0,0 +1,6 @@ +build: + ${CC} -o parse parse.c -O3 -Wall + ${CC} -o glue glue.c -O3 -Wall + +clean: + rm -f glue parse (DIR) diff --git a/README b/README @@ -0,0 +1,21 @@ +BAG Kadaster extract parser + + +# Usage + +Download extract: +https://www.kadaster.nl/zakelijk/producten/adressen-en-gebouwen/bag-2.0-extract +Free version: https://www.kadaster.nl/-/kosteloze-download-bag-2-0-extract + +* unzip 9999VBO*.zip and 9999NUM*.zip files into the same directory. +* Edit parse.c if needed. +* Compile by running: + make +* Edit settings such as the files directory in process.sh if needed. +* Run: + ./process.sh + + +# Tested + +Tested on Linux, OpenBSD and Windows (mingw gcc and tcc). (DIR) diff --git a/glue.awk b/glue.awk @@ -0,0 +1,39 @@ +BEGIN { + FS = OFS = "\t"; +} +# fields: +# 1. bagnr +# 2. postcode +# 3. huisnummer +# 4. huisletter +# 5. huisnummertoevoeging +# 6. status +# 7. oppervlakte +# 8. gebruiksdoel +{ + if ($1 != prev) { + print v1 "\t" v2 "\t" v3 "\t" v4 "\t" v5 "\t" v6 "\t" v7 "\t" v8; + v1 = v2 = v3 = v4 = v5 = v6 = v7 = v8 = ""; + prev = $1; + } + + if ($1 != "") + v1 = $1; + if ($2 != "") + v2 = $2; + if ($3 != "") + v3 = $3; + if ($4 != "") + v4 = $4; + if ($5 != "") + v5 = $5; + if ($6 != "") + v6 = $6; + if ($7 != "") + v7 = $7; + if ($8 != "") + v8 = $8; +} +END { + print v1 "\t" v2 "\t" v3 "\t" v4 "\t" v5 "\t" v6 "\t" v7 "\t" v8; +} (DIR) diff --git a/glue.c b/glue.c @@ -0,0 +1,146 @@ +#if WIN32 +#include <io.h> +#endif + +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +#define PUTCHAR putchar +#define FPUTS fputs + +#define FieldLast 8 + +struct string { + char *data; + size_t len; + size_t cap; +}; + +static struct string mergedfields[FieldLast]; +static char *fields[FieldLast]; + +/* Splits fields in the line buffer by replacing TAB separators with NUL ('\0') +* terminators and assign these fields as pointers. 
If there are less fields +* than expected then the field is an empty string constant. */ +void +parseline(char *line, char *fields[FieldLast]) +{ + char *prev, *s; + size_t i; + + for (prev = line, i = 0; + (s = strchr(prev, '\t')) && i < FieldLast - 1; + ++i) { + *s = '\0'; + fields[i] = prev; + prev = s + 1; + } + fields[i++] = prev; + /* make non-parsed fields empty. */ + for (; i < FieldLast; i++) + fields[i] = ""; +} + +void +printfields(void) +{ + if (!mergedfields[0].len) + return; + + fputs(mergedfields[0].data, stdout); + fputs("\t", stdout); + fputs(mergedfields[1].data, stdout); + fputs("\t", stdout); + fputs(mergedfields[2].data, stdout); + fputs("\t", stdout); + fputs(mergedfields[3].data, stdout); + fputs("\t", stdout); + fputs(mergedfields[4].data, stdout); + fputs("\t", stdout); + fputs(mergedfields[5].data, stdout); + fputs("\t", stdout); + fputs(mergedfields[6].data, stdout); + fputs("\t", stdout); + fputs(mergedfields[7].data, stdout); + fputs("\n", stdout); +} + +void +string_reset(struct string *d) +{ + d->data[0] = '\0'; + d->len = 0; +} + +void +string_set(struct string *d, const char *data) +{ + size_t len; + + len = strlen(data); + if (len + 1 >= d->cap) { + d->cap = d->cap + len + 1; + if (!(d->data = realloc(d->data, d->cap))) { + perror(NULL); + exit(1); + } + } + memcpy(d->data, data, len+ 1 ); /* copy including NUL byte */ + //d->data[len] = '\0'; + d->len = len; +} + +int +main(void) +{ + char line[4096], *p; + size_t i; + + /* required for Windows binary mode aka more retarded bullshit. 
*/ +#if WIN32 + /* binary mode for stdin, stdout and stderr */ + _setmode(0, 0x8000); /* 0x8000 is O_BINARY */ + _setmode(1, 0x8000); + _setmode(2, 0x8000); +#endif + + for (i = 0; i < FieldLast; ++i) { + mergedfields[i].cap = 4096; + if (!(mergedfields[i].data = calloc(1, 4096))) { + perror(NULL); + exit(1); + } + mergedfields[i].len = 0; + } + + while (fgets(line, sizeof(line), stdin)) { + if ((p = strchr(line, '\n'))) + *p = '\0'; + + parseline(line, fields); + + /* primary key */ + if (strcmp(fields[0], mergedfields[0].data)) { + printfields(); + for (i = 0; i < FieldLast; ++i) + string_reset(&mergedfields[i]); + string_set(&mergedfields[0], fields[0]); + } + + for (i = 1; i < FieldLast; ++i) { + /* field is set: override with next */ + if (!fields[i][0]) + continue; + string_set(&mergedfields[i], fields[i]); + } + } + printfields(); + + if (ferror(stdin) || (fflush(stdout) && ferror(stdout))) { + perror(NULL); + exit(1); + } + + return 0; +} (DIR) diff --git a/parse.c b/parse.c @@ -0,0 +1,718 @@ +#define USE_MMAP + +#if WIN32 +#include <io.h> +#endif + +#ifdef USE_MMAP +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> + +#include <err.h> +#include <fcntl.h> +#endif + +#include <errno.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +/* ctype-like macros, but always compatible with ASCII / UTF-8 */ +#define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26) +#define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f) +#define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5)) + +#define PUTCHAR putchar_unlocked +/*#define PUTCHAR putchar*/ + +struct address { + char bagnr[64]; + char oppervlakte[256]; + char status[256]; + char gebruiksdoel[256]; + char huisnummer[32]; + char huisletter[32]; + char huisnummertoevoeging[32]; + char postcode[8]; +}; + +typedef struct xmlparser { + /* current tag */ + char tag[1024]; + size_t taglen; + /* current tag is a short tag ? 
<tag /> */ + int isshorttag; + /* current attribute name */ + char name[1024]; + /* data buffer used for tag data, CDATA and attribute data */ + char data[BUFSIZ]; +} XMLParser; + +int xml_entitytostr(const char *, char *, size_t); +void xml_parse(XMLParser *); + +static void xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al, + const char *v, size_t vl); +static void xmldata(XMLParser *x, const char *d, size_t dl); +static void xmltagend(XMLParser *x, const char *t, size_t tl, int isshort); +static void xmltagstart(XMLParser *x, const char *t, size_t tl); + +static XMLParser x; +static struct address address; +static int inbagobject, innummeraanduiding, inhoofdadres; +static int isbagnrtype; +static int eindgeldig; + +/* different readers, performance differs per platform */ +#ifdef USE_MMAP + +static int fd; +struct stat st; +unsigned char *reg; +size_t len, off; + +#define GETNEXT() (off >= len ? EOF : reg[off++]) + +#else + +#if 1 +#define GETNEXT getchar_unlocked +#else +static int roffset, rtotal; +static char rbuf[4096*4]; + +int +getnext(void) +{ + ssize_t n; + + if (roffset >= rtotal) { + n = fread(rbuf, 1, sizeof(rbuf), stdin); + if (ferror(stdin)) { + perror(NULL); + exit(1); + } + if (feof(stdin) || n == 0) { + roffset = 0; + rtotal = 0; + return EOF; + } + roffset = 0; + rtotal = n; + } + return rbuf[roffset++]; +} + +#define GETNEXT getnext +#endif +#endif + +static void +xml_parseattrs(XMLParser *x) +{ + size_t namelen = 0, valuelen; + int c, endsep, endname = 0, valuestart = 0; + + while ((c = GETNEXT()) != EOF) { + if (ISSPACE(c)) { + if (namelen) + endname = 1; + continue; + } else if (c == '?') + ; /* ignore */ + else if (c == '=') { + x->name[namelen] = '\0'; + valuestart = 1; + endname = 1; + } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) { + /* attribute without value */ + x->name[namelen] = '\0'; + xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0); + endname = 0; + 
x->name[0] = c; + namelen = 1; + } else if (namelen && valuestart) { + /* attribute with value */ + + valuelen = 0; + if (c == '\'' || c == '"') { + endsep = c; + } else { + endsep = ' '; /* ISSPACE() */ + goto startvalue; + } + + while ((c = GETNEXT()) != EOF) { +startvalue: + if (c == '&') { /* entities */ + x->data[valuelen] = '\0'; + /* call data function with data before entity if there is data */ + if (valuelen) + xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + while ((c = GETNEXT()) != EOF) { + if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) + break; + if (valuelen < sizeof(x->data) - 1) + x->data[valuelen++] = c; + else { + /* entity too long for buffer, handle as normal data */ + x->data[valuelen] = '\0'; + xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + break; + } + if (c == ';') { + x->data[valuelen] = '\0'; + valuelen = 0; + break; + } + } + } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) { + if (valuelen < sizeof(x->data) - 1) { + x->data[valuelen++] = c; + } else { + x->data[valuelen] = '\0'; + xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + } + } + if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) { + x->data[valuelen] = '\0'; + xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + break; + } + } + namelen = endname = valuestart = 0; + } else if (namelen < sizeof(x->name) - 1) { + x->name[namelen++] = c; + } + if (c == '>') { + break; + } else if (c == '/') { + x->isshorttag = 1; + x->name[0] = '\0'; + namelen = 0; + } + } +} + +static void +xml_parsecomment(XMLParser *x) +{ + size_t i = 0; + int c; + + while ((c = GETNEXT()) != EOF) { + if (c == '-') { + if (++i > 2) { + i = 2; + } + continue; + } else if (c == '>' && i == 2) { + return; + } else if (i) { + i = 0; + } + } +} + +static void +xml_parsecdata(XMLParser 
*x) +{ + size_t datalen = 0, i = 0; + int c; + + while ((c = GETNEXT()) != EOF) { + if (c == ']' || c == '>') { + if (datalen) { + x->data[datalen] = '\0'; + xmldata(x, x->data, datalen); + datalen = 0; + } + } + + if (c == ']') { + if (++i > 2) { + for (; i > 2; i--) + xmldata(x, "]", 1); + i = 2; + } + continue; + } else if (c == '>' && i == 2) { + return; + } else if (i) { + for (; i > 0; i--) + xmldata(x, "]", 1); + i = 0; + } + + if (datalen < sizeof(x->data) - 1) { + x->data[datalen++] = c; + } else { + x->data[datalen] = '\0'; + xmldata(x, x->data, datalen); + x->data[0] = c; + datalen = 1; + } + } +} + +static int +codepointtoutf8(long r, char *s) +{ + if (r == 0) { + return 0; /* NUL byte */ + } else if (r <= 0x7F) { + /* 1 byte: 0aaaaaaa */ + s[0] = r; + return 1; + } else if (r <= 0x07FF) { + /* 2 bytes: 00000aaa aabbbbbb */ + s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */ + s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */ + return 2; + } else if (r <= 0xFFFF) { + /* 3 bytes: aaaabbbb bbcccccc */ + s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */ + s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */ + s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */ + return 3; + } else { + /* 4 bytes: 000aaabb bbbbcccc ccdddddd */ + s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */ + s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */ + s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */ + s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */ + return 4; + } +} + +static int +namedentitytostr(const char *e, char *buf, size_t bufsiz) +{ + static const struct { + const char *entity; + int c; + } entities[] = { + { "amp;", '&' }, + { "lt;", '<' }, + { "gt;", '>' }, + { "apos;", '\'' }, + { "quot;", '"' }, + }; + size_t i; + + /* buffer is too small */ + if (bufsiz < 2) + return -1; + + for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) { + if (!strcmp(e, entities[i].entity)) { + buf[0] = entities[i].c; + buf[1] = '\0'; + return 1; + } + } + return -1; +} + +static int 
+numericentitytostr(const char *e, char *buf, size_t bufsiz) +{ + long l; + int len; + char *end; + + /* buffer is too small */ + if (bufsiz < 5) + return -1; + + errno = 0; + /* hex (16) or decimal (10) */ + if (*e == 'x') + l = strtol(++e, &end, 16); + else + l = strtol(e, &end, 10); + /* invalid value or not a well-formed entity or invalid code point */ + if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff || + (l >= 0xd800 && l <= 0xdfff)) + return -1; + len = codepointtoutf8(l, buf); + buf[len] = '\0'; + + return len; +} + +/* convert named- or numeric entity string to buffer string + * returns byte-length of string or -1 on failure. */ +int +xml_entitytostr(const char *e, char *buf, size_t bufsiz) +{ + /* doesn't start with & */ + if (e[0] != '&') + return -1; + /* numeric entity */ + if (e[1] == '#') + return numericentitytostr(e + 2, buf, bufsiz); + else /* named entity */ + return namedentitytostr(e + 1, buf, bufsiz); +} + +void +xml_parse(XMLParser *x) +{ + size_t datalen, tagdatalen; + int c, isend; + + while ((c = GETNEXT()) != EOF && c != '<') + ; /* skip until < */ + + while (c != EOF) { + if (c == '<') { /* parse tag */ + if ((c = GETNEXT()) == EOF) + return; + + if (c == '!') { /* CDATA and comments */ + for (tagdatalen = 0; (c = GETNEXT()) != EOF;) { + /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */ + if (tagdatalen <= sizeof("[CDATA[") - 1) + x->data[tagdatalen++] = c; + if (c == '>') + break; + else if (c == '-' && tagdatalen == sizeof("--") - 1 && + (x->data[0] == '-')) { + xml_parsecomment(x); + break; + } else if (c == '[') { + if (tagdatalen == sizeof("[CDATA[") - 1 && + !strncmp(x->data, "[CDATA[", tagdatalen)) { + xml_parsecdata(x); + break; + } + } + } + } else { + /* normal tag (open, short open, close), processing instruction. */ + x->tag[0] = c; + x->taglen = 1; + x->isshorttag = isend = 0; + + /* treat processing instruction as short tag, don't strip "?" prefix. 
*/ + if (c == '?') { + x->isshorttag = 1; + } else if (c == '/') { + if ((c = GETNEXT()) == EOF) + return; + x->tag[0] = c; + isend = 1; + } + + while ((c = GETNEXT()) != EOF) { + if (c == '/') + x->isshorttag = 1; /* short tag */ + else if (c == '>' || ISSPACE(c)) { + x->tag[x->taglen] = '\0'; + if (isend) { /* end tag, starts with </ */ + xmltagend(x, x->tag, x->taglen, x->isshorttag); + x->tag[0] = '\0'; + x->taglen = 0; + } else { + /* start tag */ + xmltagstart(x, x->tag, x->taglen); + if (ISSPACE(c)) + xml_parseattrs(x); + } + /* call tagend for short tag or processing instruction */ + if (x->isshorttag) { + xmltagend(x, x->tag, x->taglen, x->isshorttag); + x->tag[0] = '\0'; + x->taglen = 0; + } + break; + } else if (x->taglen < sizeof(x->tag) - 1) + x->tag[x->taglen++] = c; /* NOTE: tag name truncation */ + } + } + } else { + /* parse tag data */ + datalen = 0; + while ((c = GETNEXT()) != EOF) { + if (c == '&') { + if (datalen) { + x->data[datalen] = '\0'; + xmldata(x, x->data, datalen); + } + x->data[0] = c; + datalen = 1; + while ((c = GETNEXT()) != EOF) { + if (c == '<') + break; + if (datalen < sizeof(x->data) - 1) + x->data[datalen++] = c; + else { + /* entity too long for buffer, handle as normal data */ + x->data[datalen] = '\0'; + xmldata(x, x->data, datalen); + x->data[0] = c; + datalen = 1; + break; + } + if (c == ';') { + x->data[datalen] = '\0'; + datalen = 0; + break; + } + } + } else if (c != '<') { + if (datalen < sizeof(x->data) - 1) { + x->data[datalen++] = c; + } else { + x->data[datalen] = '\0'; + xmldata(x, x->data, datalen); + x->data[0] = c; + datalen = 1; + } + } + if (c == '<') { + x->data[datalen] = '\0'; + if (datalen) + xmldata(x, x->data, datalen); + break; + } + } + } + } +} + +static void +clearaddress(struct address *a) +{ + a->bagnr[0] = '\0'; + a->oppervlakte[0] = '\0'; + a->status[0] = '\0'; + a->gebruiksdoel[0] = '\0'; + a->huisnummer[0] = '\0'; + a->huisletter[0] = '\0'; + a->huisnummertoevoeging[0] = '\0'; + 
a->postcode[0] = '\0'; +} + +static char * +ltrim(const char *s) +{ + for (; ISSPACE((unsigned char)*s); s++) + ; + return (char *)s; +} + +/* changed version of strlcpy: copy all non-control characters */ +static size_t +concat(char *dst, const char *src, size_t dsize) +{ + const char *odst = dst; + const char *osrc = src; + size_t n = dsize; + size_t dlen; + + dst = ltrim(dst); + + /* Find the end of dst and adjust bytes left but don't go past end. */ + while (n-- != 0 && *dst != '\0') + dst++; + dlen = dst - odst; + n = dsize - dlen; + + if (n-- == 0) + return(dlen + strlen(src)); + while (*src != '\0') { + if (n != 0 && !ISCNTRL((unsigned char)*src)) { + *dst++ = *src; + n--; + } + src++; + } + *dst = '\0'; + + return(dlen + (src - osrc)); /* count does not include NUL */ +} + +static void +printfield(const char *s) +{ +/* for (; *s; s++) + PUTCHAR(*s);*/ + fputs(s, stdout); +} + +static void +printaddress(void) +{ + if (!address.bagnr[0]) + return; + /* historical: ignore */ + if (eindgeldig) + return; + + printfield(address.bagnr); + PUTCHAR('\t'); + /* NUM */ + printfield(address.postcode); + PUTCHAR('\t'); + printfield(address.huisnummer); + PUTCHAR('\t'); + printfield(address.huisletter); + PUTCHAR('\t'); + printfield(address.huisnummertoevoeging); + PUTCHAR('\t'); + /* VBO */ + printfield(address.status); + PUTCHAR('\t'); + printfield(address.oppervlakte); + PUTCHAR('\t'); + printfield(address.gebruiksdoel); + PUTCHAR('\n'); +} + +static void +xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al, + const char *v, size_t vl) +{ + if (a[0] != 'd' || t[0] != 'O') + return; + if (!strcmp(t, "Objecten:identificatie") || !strcmp(t, "Objecten-ref:NummeraanduidingRef")) + if (!strcmp(a, "domein") && !strcmp(v, "NL.IMBAG.Nummeraanduiding")) { + isbagnrtype = 1; + } +} + +static void +xmldata(XMLParser *x, const char *d, size_t dl) +{ + if (x->tag[0] != 'O') + return; + + if (!strcmp(x->tag, "Objecten:postcode")) { + concat(address.postcode, d, 
sizeof(address.postcode)); + } else if (!strcmp(x->tag, "Objecten:huisnummer")) { + concat(address.huisnummer, d, sizeof(address.huisnummer)); + } else if (!strcmp(x->tag, "Objecten:huisletter")) { + concat(address.huisletter, d, sizeof(address.huisletter)); + } else if (!strcmp(x->tag, "Objecten:huisnummertoevoeging")) { + concat(address.huisnummertoevoeging, d, sizeof(address.huisnummertoevoeging)); + } else if (isbagnrtype && !strcmp(x->tag, "Objecten:identificatie")) { + concat(address.bagnr, d, sizeof(address.bagnr)); + } else if (inhoofdadres && isbagnrtype && !strcmp(x->tag, "Objecten-ref:NummeraanduidingRef")) { + concat(address.bagnr, d, sizeof(address.bagnr)); + } else if (!strcmp(x->tag, "Objecten:oppervlakte")) { + concat(address.oppervlakte, d, sizeof(address.oppervlakte)); + } else if (!strcmp(x->tag, "Objecten:status")) { + concat(address.status, d, sizeof(address.status)); + } else if (!strcmp(x->tag, "Objecten:gebruiksdoel")) { + if (address.gebruiksdoel[0]) + concat(address.gebruiksdoel, ", ", sizeof(address.gebruiksdoel)); + concat(address.gebruiksdoel, d, sizeof(address.gebruiksdoel)); + } +} + +static void +xmltagend(XMLParser *x, const char *t, size_t tl, int isshort) +{ + if (t[0] != 's' && t[0] != 'O') + return; + if (inbagobject && !strcmp(t, "sl-bag-extract:bagObject")) { + printaddress(); + + inbagobject = 0; + innummeraanduiding = 0; + inhoofdadres = 0; + eindgeldig = 0; + clearaddress(&address); + } else if (innummeraanduiding) { + if (!strcmp(t, "Objecten:Nummeraanduiding") || !strcmp(t, "Objecten-ref:NummeraanduidingRef")) { + innummeraanduiding = 0; + isbagnrtype = 0; + } + } else if (isbagnrtype && !strcmp(t, "Objecten:identificatie")) { + isbagnrtype = 0; + } else if (inhoofdadres && !strcmp(t, "Objecten:heeftAlsHoofdadres")) { + inhoofdadres = 0; + } +} + +static void +xmltagstart(XMLParser *x, const char *t, size_t tl) +{ + if (t[0] != 's' && t[0] != 'O' && t[0] != 'H') + return; + if (!inbagobject && !strcmp(t, 
"sl-bag-extract:bagObject")) { + inbagobject = 1; + eindgeldig = 0; + clearaddress(&address); + } else if (inbagobject) { + if (!innummeraanduiding && !strcmp(t, "Objecten:Nummeraanduiding")) + innummeraanduiding = 1; + + if (!inhoofdadres && !strcmp(t, "Objecten:heeftAlsHoofdadres")) + inhoofdadres = 1; + + if (isbagnrtype) { + if (!strcmp(x->tag, "Objecten:identificatie") || !strcmp(x->tag, "Objecten-ref:NummeraanduidingRef")) + isbagnrtype = 0; + } + /* historical document */ + if (!strcmp(x->tag, "Historie:eindGeldigheid")) { + eindgeldig = 1; + } + } +} + +int +main(int argc, char *argv[]) +{ +#ifdef USE_MMAP + if (argc < 2) { + fprintf(stderr, "usage: %s <file>\n", argv[0]); + return 1; + } + + if ((fd = open(argv[1], O_RDONLY)) < 0) + err(1, "open"); + if (fstat(fd, &st) < 0) + err(1, "fstat"); + + off = 0; + len = st.st_size; + /*posix_fadvise(fd, 0, len, POSIX_FADV_SEQUENTIAL);*/ /* Linux */ + if ((reg = mmap(0, len, PROT_READ, MAP_SHARED|MAP_FILE, fd, off)) == MAP_FAILED) + err(1, "mmap"); + + xml_parse(&x); + + /* progress meter */ + /*fprintf(stderr, "\rProgress: %.2f%%\n", 100.0);*/ + + munmap(reg, len); + close(fd); +#else + /* required for Windows binary mode aka more retarded bullshit. */ +#if WIN32 + /* binary mode for stdin, stdout and stderr */ + _setmode(0, 0x8000); /* 0x8000 is O_BINARY */ + _setmode(1, 0x8000); + _setmode(2, 0x8000); +#endif + + xml_parse(&x); +#endif + + printaddress(); + + return 0; +} (DIR) diff --git a/process.sh b/process.sh @@ -0,0 +1,71 @@ +#!/bin/sh + +bin="./parse" +d="../data" +glue="./glue" + +maxjobs=64 + +log() { + echo "$1" >&2 +} + +# child process job: parse each file and process them to a file in parallel. +if test "$CHILD_PROC" = "1"; then + # arguments: count, name, infile, outfile + log "[$1] $2 started" + + # mmap version + "$bin" "$3" > "$4" + + # stdin version + #"$bin" < "$3" > "$4" + status="$?" + + log "[$1] $2 done" + exit "$status" +fi + +# generate a list of jobs for processing. 
+list() { + i=1 + for f in "$d"/*.xml; do + b="${f##*/}" + out="tmp/$b" + + printf '%s\0%s\0%s\0%s\0' "$i" "$b" "$f" "$out" + i=$((i+1)) + done +} + +# old awk version of glueing records, very slow on some platforms. +#awk_glue() { +# LC_ALL=C awk -f glue.awk +#} + +merge() { + log "Sorting data before merging records..." + LC_ALL=C sort -k1,1 -k8,8 results.csv > results_sorted.csv + + log "Merging records..." + "$glue" < results_sorted.csv > results2.csv + + log "Sorting resulting data by zipcode, address number, etc..." + # sort results by zipcode, address number, etc. + LC_ALL=C sort -k2,2 -k3,3n -k4,4 results2.csv > final.csv +} + +rm -rf tmp +mkdir -p tmp + +# parse in parallel. +list | CHILD_PROC="1" xargs -r -0 -P "${maxjobs}" -L 4 "$(readlink -f "$0")" + +# concat results to one file. +cat tmp/* > results.csv + +# merge results together. +merge + +# cleanup temp files. +rm -rf tmp (DIR) diff --git a/xml.c b/xml.c @@ -0,0 +1,480 @@ +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "xml.h" + +#define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26) +#define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5)) + +static int roffset, rtotal; +static char rbuf[4096*4]; + +int +getnext(void) +{ + ssize_t n; + + if (roffset >= rtotal) { + n = fread(rbuf, 1, sizeof(rbuf), stdin); + if (ferror(stdin)) + exit(1); + if (feof(stdin) || n == 0) { + roffset = 0; + rtotal = 0; + return EOF; + } + roffset = 0; + rtotal = n; + } + return rbuf[roffset++]; +} + +//#define GETNEXT getnext +#define GETNEXT getchar_unlocked + +static void +xml_parseattrs(XMLParser *x) +{ + size_t namelen = 0, valuelen; + int c, endsep, endname = 0, valuestart = 0; + + while ((c = GETNEXT()) != EOF) { + if (ISSPACE(c)) { + if (namelen) + endname = 1; + continue; + } else if (c == '?') + ; /* ignore */ + else if (c == '=') { + x->name[namelen] = '\0'; + valuestart = 1; + endname = 1; + } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == 
'>' || c == '/'))) { + /* attribute without value */ + x->name[namelen] = '\0'; + if (x->xmlattrstart) + x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); + if (x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0); + if (x->xmlattrend) + x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); + endname = 0; + x->name[0] = c; + namelen = 1; + } else if (namelen && valuestart) { + /* attribute with value */ + if (x->xmlattrstart) + x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); + + valuelen = 0; + if (c == '\'' || c == '"') { + endsep = c; + } else { + endsep = ' '; /* ISSPACE() */ + goto startvalue; + } + + while ((c = GETNEXT()) != EOF) { +startvalue: + if (c == '&') { /* entities */ + x->data[valuelen] = '\0'; + /* call data function with data before entity if there is data */ + if (valuelen && x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + while ((c = GETNEXT()) != EOF) { + if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) + break; + if (valuelen < sizeof(x->data) - 1) + x->data[valuelen++] = c; + else { + /* entity too long for buffer, handle as normal data */ + x->data[valuelen] = '\0'; + if (x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + break; + } + if (c == ';') { + x->data[valuelen] = '\0'; + if (x->xmlattrentity) + x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + valuelen = 0; + break; + } + } + } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) { + if (valuelen < sizeof(x->data) - 1) { + x->data[valuelen++] = c; + } else { + x->data[valuelen] = '\0'; + if (x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + } + } + if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) { + x->data[valuelen] = '\0'; + if (x->xmlattr) + x->xmlattr(x, 
x->tag, x->taglen, x->name, namelen, x->data, valuelen); + if (x->xmlattrend) + x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); + break; + } + } + namelen = endname = valuestart = 0; + } else if (namelen < sizeof(x->name) - 1) { + x->name[namelen++] = c; + } + if (c == '>') { + break; + } else if (c == '/') { + x->isshorttag = 1; + x->name[0] = '\0'; + namelen = 0; + } + } +} + +static void +xml_parsecomment(XMLParser *x) +{ + size_t datalen = 0, i = 0; + int c; + + if (x->xmlcommentstart) + x->xmlcommentstart(x); + while ((c = GETNEXT()) != EOF) { + if (c == '-' || c == '>') { + if (x->xmlcomment && datalen) { + x->data[datalen] = '\0'; + x->xmlcomment(x, x->data, datalen); + datalen = 0; + } + } + + if (c == '-') { + if (++i > 2) { + if (x->xmlcomment) + for (; i > 2; i--) + x->xmlcomment(x, "-", 1); + i = 2; + } + continue; + } else if (c == '>' && i == 2) { + if (x->xmlcommentend) + x->xmlcommentend(x); + return; + } else if (i) { + if (x->xmlcomment) { + for (; i > 0; i--) + x->xmlcomment(x, "-", 1); + } + i = 0; + } + + if (datalen < sizeof(x->data) - 1) { + x->data[datalen++] = c; + } else { + x->data[datalen] = '\0'; + if (x->xmlcomment) + x->xmlcomment(x, x->data, datalen); + x->data[0] = c; + datalen = 1; + } + } +} + +static void +xml_parsecdata(XMLParser *x) +{ + size_t datalen = 0, i = 0; + int c; + + if (x->xmlcdatastart) + x->xmlcdatastart(x); + while ((c = GETNEXT()) != EOF) { + if (c == ']' || c == '>') { + if (x->xmlcdata && datalen) { + x->data[datalen] = '\0'; + x->xmlcdata(x, x->data, datalen); + datalen = 0; + } + } + + if (c == ']') { + if (++i > 2) { + if (x->xmlcdata) + for (; i > 2; i--) + x->xmlcdata(x, "]", 1); + i = 2; + } + continue; + } else if (c == '>' && i == 2) { + if (x->xmlcdataend) + x->xmlcdataend(x); + return; + } else if (i) { + if (x->xmlcdata) + for (; i > 0; i--) + x->xmlcdata(x, "]", 1); + i = 0; + } + + if (datalen < sizeof(x->data) - 1) { + x->data[datalen++] = c; + } else { + x->data[datalen] = '\0'; + if 
(x->xmlcdata)
+				x->xmlcdata(x, x->data, datalen);
+			x->data[0] = c;
+			datalen = 1;
+		}
+	}
+}
+/*
+ * Encode code point r as UTF-8 into s and return the number of bytes
+ * written (1 to 4). Returns 0 for r == 0, in which case s is left untouched.
+ * s is not NUL-terminated here and r is not validated: every value above
+ * 0xFFFF takes the 4-byte branch. The caller visible in this file,
+ * numericentitytostr(), range-checks the value before calling.
+ */
+static int
+codepointtoutf8(long r, char *s)
+{
+	if (r == 0) {
+		return 0; /* NUL byte */
+	} else if (r <= 0x7F) {
+		/* 1 byte: 0aaaaaaa */
+		s[0] = r;
+		return 1;
+	} else if (r <= 0x07FF) {
+		/* 2 bytes: 00000aaa aabbbbbb */
+		s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */
+		s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */
+		return 2;
+	} else if (r <= 0xFFFF) {
+		/* 3 bytes: aaaabbbb bbcccccc */
+		s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
+		s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */
+		s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */
+		return 3;
+	} else {
+		/* 4 bytes: 000aaabb bbbbcccc ccdddddd */
+		s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
+		s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
+		s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */
+		s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */
+		return 4;
+	}
+}
+/*
+ * Decode one of the five named entities in the table below. e points just
+ * past the '&' and has to match a table entry exactly, trailing ';'
+ * included (compared with strcmp). On a match the character and a NUL
+ * terminator are stored in buf and 1 is returned. Returns -1 when bufsiz is
+ * smaller than 2 or when the name is not in the table.
+ */
+static int
+namedentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	static const struct {
+		const char *entity;
+		int c;
+	} entities[] = {
+		{ "amp;", '&' },
+		{ "lt;", '<' },
+		{ "gt;", '>' },
+		{ "apos;", '\'' },
+		{ "quot;", '"' },
+	};
+	size_t i;
+
+	/* buffer is too small: one character plus the NUL terminator are written */
+	if (bufsiz < 2)
+		return -1;
+
+	for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
+		if (!strcmp(e, entities[i].entity)) {
+			buf[0] = entities[i].c;
+			buf[1] = '\0';
+			return 1;
+		}
+	}
+	return -1;
+}
+/*
+ * Decode a numeric character reference. e points just past "&#": a leading
+ * 'x' selects hexadecimal, otherwise the number is read as decimal, and it
+ * has to be followed directly by ';'. The UTF-8 encoding plus a NUL
+ * terminator is written to buf and its byte length is returned (0 for code
+ * point 0). Returns -1 when bufsiz is smaller than 5, when the conversion
+ * fails, or when the value is negative, above 0x10ffff or in the surrogate
+ * range 0xd800-0xdfff.
+ * NOTE(review): strtol() itself skips leading whitespace and accepts a sign
+ * (and, in base 16, a "0x" prefix), so input such as "&#+65;" is not
+ * rejected here. Confirm whether this leniency is intended.
+ */
+static int
+numericentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	long l;
+	int len;
+	char *end;
+
+	/* buffer is too small: up to 4 UTF-8 bytes plus the NUL terminator */
+	if (bufsiz < 5)
+		return -1;
+
+	errno = 0;
+	/* hex (16) or decimal (10) */
+	if (*e == 'x')
+		l = strtol(++e, &end, 16);
+	else
+		l = strtol(e, &end, 10);
+	/* invalid value or not a well-formed entity or invalid code point */
+	if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
+	    (l >= 0xd800 && l <= 0xdfff))
+		return -1;
+	len = codepointtoutf8(l, buf);
+	buf[len] = '\0';
+
+	return len;
+}
+
+/*
+ * Convert a named or numeric entity string to its string form in buf.
+ * e has to start with '&': "&#..." is handled as a numeric entity and
+ * anything else as a named entity. buf is NUL-terminated on success.
+ * Returns the byte length of the string written or -1 on failure.
+ */
+int
+xml_entitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	/* doesn't start with & */
+	if (e[0] != '&')
+		return -1;
+	/* numeric entity */
+	if (e[1] == '#')
+		return numericentitytostr(e + 2, buf, bufsiz);
+	else /* named entity */
+		return namedentitytostr(e + 1, buf, bufsiz);
+}
+/*
+ * Parse the input read through GETNEXT() until EOF and report what is found
+ * through the handlers set in x. A handler is only called when it is
+ * non-NULL. Everything before the first '<' is discarded. "<!" constructs
+ * are handed to xml_parsecomment() or xml_parsecdata(). Any other tag name
+ * is collected in x->tag (characters that do not fit are dropped) and
+ * reported through xmltagstart, xmltagstartparsed and xmltagend. Text
+ * between tags is buffered in x->data and passed to xmldata whenever the
+ * buffer fills up or the text ends, while "&...;" sequences are passed
+ * separately to xmldataentity.
+ */
+void
+xml_parse(XMLParser *x)
+{
+	size_t datalen, tagdatalen;
+	int c, isend;
+
+	while ((c = GETNEXT()) != EOF && c != '<')
+		; /* skip until < */
+
+	while (c != EOF) {
+		if (c == '<') { /* parse tag */
+			if ((c = GETNEXT()) == EOF)
+				return;
+
+			if (c == '!') { /* CDATA and comments */
+				for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
+					/* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
+					if (tagdatalen <= sizeof("[CDATA[") - 1)
+						x->data[tagdatalen++] = c;
+					if (c == '>')
+						break; /* other "<!...>" markup: skipped, nothing is reported */
+					else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
+					         (x->data[0] == '-')) {
+						xml_parsecomment(x);
+						break;
+					} else if (c == '[') {
+						if (tagdatalen == sizeof("[CDATA[") - 1 &&
+						    !strncmp(x->data, "[CDATA[", tagdatalen)) {
+							xml_parsecdata(x);
+							break;
+						}
+					}
+				}
+			} else {
+				/* normal tag (open, short open, close), processing instruction. */
+				x->tag[0] = c;
+				x->taglen = 1;
+				x->isshorttag = isend = 0;
+
+				/* treat processing instruction as short tag, don't strip "?" prefix. */
+				if (c == '?') {
+					x->isshorttag = 1;
+				} else if (c == '/') {
+					if ((c = GETNEXT()) == EOF)
+						return;
+					x->tag[0] = c; /* overwrite the '/' stored above with the next character */
+					isend = 1;
+				}
+
+				while ((c = GETNEXT()) != EOF) {
+					if (c == '/')
+						x->isshorttag = 1; /* short tag */
+					else if (c == '>' || ISSPACE(c)) {
+						x->tag[x->taglen] = '\0';
+						if (isend) { /* end tag, starts with </ */
+							if (x->xmltagend)
+								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
+							x->tag[0] = '\0';
+							x->taglen = 0;
+						} else {
+							/* start tag */
+							if (x->xmltagstart)
+								x->xmltagstart(x, x->tag, x->taglen);
+							if (ISSPACE(c))
+								xml_parseattrs(x);
+							if (x->xmltagstartparsed)
+								x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
+						}
+						/* call tagend for short tag or processing instruction */
+						if (x->isshorttag) {
+							if (x->xmltagend)
+								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
+							x->tag[0] = '\0';
+							x->taglen = 0;
+						}
+						break;
+					} else if (x->taglen < sizeof(x->tag) - 1)
+						x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
+				}
+			}
+		} else {
+			/* parse tag data */
+			datalen = 0;
+			if (x->xmldatastart)
+				x->xmldatastart(x);
+			while ((c = GETNEXT()) != EOF) {
+				if (c == '&') {
+					/* flush the text buffered so far before collecting the entity */
+					if (datalen) {
+						x->data[datalen] = '\0';
+						if (x->xmldata)
+							x->xmldata(x, x->data, datalen);
+					}
+					x->data[0] = c;
+					datalen = 1; /* the '&' itself is kept as part of the entity string */
+					while ((c = GETNEXT()) != EOF) {
+						if (c == '<')
+							break; /* no ';' seen: the buffered text is flushed as data below */
+						if (datalen < sizeof(x->data) - 1)
+							x->data[datalen++] = c;
+						else {
+							/* entity too long for buffer, handle as normal data */
+							x->data[datalen] = '\0';
+							if (x->xmldata)
+								x->xmldata(x, x->data, datalen);
+							x->data[0] = c;
+							datalen = 1;
+							break;
+						}
+						if (c == ';') {
+							x->data[datalen] = '\0';
+							if (x->xmldataentity)
+								x->xmldataentity(x, x->data, datalen);
+							datalen = 0;
+							break;
+						}
+					}
+				} else if (c != '<') {
+					if (datalen < sizeof(x->data) - 1) {
+						x->data[datalen++] = c;
+					} else {
+						/* buffer full: pass it on and restart it with the current character */
+						x->data[datalen] = '\0';
+						if (x->xmldata)
+							x->xmldata(x, x->data, datalen);
+						x->data[0] = c;
+						datalen = 1;
+					}
+				}
+				if (c == '<') {
+					x->data[datalen] = '\0';
+					if (x->xmldata && datalen)
+						x->xmldata(x, x->data, datalen);
+					if (x->xmldataend)
+						x->xmldataend(x);
+					break;
+				}
+			}
+		}
+	}
+}
(DIR) diff --git a/xml.h b/xml.h
@@ -0,0 +1,44 @@
+#ifndef XML_H
+#define XML_H
+
+#include <stdio.h> /* BUFSIZ, used as the size of the data buffer below */
+/*
+ * XML parser context: the callbacks invoked while parsing plus the buffers
+ * the parser works in. xml_parse() tests each handler it calls against NULL
+ * first, so handlers that are not needed can be left unset.
+ */
+typedef struct xmlparser {
+	/* handlers: each one receives the parser itself as first argument */
+	void (*xmlattr)(struct xmlparser *, const char *, size_t,
+	      const char *, size_t, const char *, size_t);
+	void (*xmlattrend)(struct xmlparser *, const char *, size_t,
+	      const char *, size_t);
+	void (*xmlattrstart)(struct xmlparser *, const char *, size_t,
+	      const char *, size_t);
+	void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
+	      const char *, size_t, const char *, size_t);
+	void (*xmlcdatastart)(struct xmlparser *);
+	void (*xmlcdata)(struct xmlparser *, const char *, size_t);
+	void (*xmlcdataend)(struct xmlparser *);
+	void (*xmlcommentstart)(struct xmlparser *);
+	void (*xmlcomment)(struct xmlparser *, const char *, size_t);
+	void (*xmlcommentend)(struct xmlparser *);
+	void (*xmldata)(struct xmlparser *, const char *, size_t);
+	void (*xmldataend)(struct xmlparser *);
+	void (*xmldataentity)(struct xmlparser *, const char *, size_t);
+	void (*xmldatastart)(struct xmlparser *);
+	void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
+	void (*xmltagstart)(struct xmlparser *, const char *, size_t);
+	void (*xmltagstartparsed)(struct xmlparser *, const char *,
+	      size_t, int);
+
+	/* current tag: NUL-terminated by xml_parse() before the tag handlers run */
+	char tag[1024];
+	size_t taglen;
+	/* current tag is a short tag ? <tag /> */
+	int isshorttag;
+	/* current attribute name */
+	char name[1024];
+	/* data buffer used for tag data, CDATA and attribute data */
+	char data[BUFSIZ];
+} XMLParser;
+/* entity string starting with '&' -> string in the buffer; byte length or -1 */
+int xml_entitytostr(const char *, char *, size_t);
+void xml_parse(XMLParser *); /* parse the input until EOF, calling the handlers */
+#endif