various improvements - webdump - HTML to plain-text converter for webpages (HTM) git clone git://git.codemadness.org/webdump (DIR) Log (DIR) Files (DIR) Refs (DIR) README (DIR) LICENSE --- (DIR) commit 011b4885a533382d98f1aee6cb9619e280c99947 (DIR) parent 89c9108dc27fe27e0f028f67508a1156ed242d2a (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org> Date: Mon, 18 Sep 2023 19:06:03 +0200 various improvements Improve link references: - Add RB tree to lookup link references: this uses a stripped-down version of OpenBSD tree.h - Add 2 separate linked-lists for the order of visible and hidden links. - Hidden links are now also deduplicated. Improve nested nodes and max depth: - Rework and increase the allowed depth of nodes. Allocate them on the heap. Diffstat: M Makefile | 2 +- A tree.h | 483 +++++++++++++++++++++++++++++++ M webdump.c | 177 ++++++++++++++++++++----------- 3 files changed, 597 insertions(+), 65 deletions(-) --- (DIR) diff --git a/Makefile b/Makefile @@ -19,7 +19,7 @@ BIN = ${NAME} SCRIPTS = SRC = ${BIN:=.c} -HDR = arg.h namedentities.h xml.h +HDR = arg.h namedentities.h tree.h xml.h LIBXML = libxml.a LIBXMLSRC = \ (DIR) diff --git a/tree.h b/tree.h @@ -0,0 +1,483 @@ +/* $OpenBSD: tree.h,v 1.31 2023/03/08 04:43:09 guenther Exp $ */ +/* + * Copyright 2002 Niels Provos <provos@citi.umich.edu> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TREE_H +#define TREE_H + +/* + * This file defines a red-black tree structure. + * + * A red-black tree is a binary search tree with the node color as an + * extra attribute. It fulfills a set of conditions: + * - every search path from the root to a leaf consists of the + * same number of black nodes, + * - each red node (except for the root) has a black parent, + * - each leaf node is black. + * + * Every operation on a red-black tree is bounded as O(lg n). + * The maximum height of a red-black tree is 2lg (n+1). 
+ */ + +/* Macros that define a red-black tree */ +#define RB_HEAD(name, type) \ +struct name { \ + struct type *rbh_root; /* root of the tree */ \ +} + +#define RB_INITIALIZER(root) \ + { NULL } + +#define RB_INIT(root) do { \ + (root)->rbh_root = NULL; \ +} while (0) + +#define RB_BLACK 0 +#define RB_RED 1 +#define RB_ENTRY(type) \ +struct { \ + struct type *rbe_left; /* left element */ \ + struct type *rbe_right; /* right element */ \ + struct type *rbe_parent; /* parent element */ \ + int rbe_color; /* node color */ \ +} + +#define RB_LEFT(elm, field) (elm)->field.rbe_left +#define RB_RIGHT(elm, field) (elm)->field.rbe_right +#define RB_PARENT(elm, field) (elm)->field.rbe_parent +#define RB_COLOR(elm, field) (elm)->field.rbe_color +#define RB_ROOT(head) (head)->rbh_root +#define RB_EMPTY(head) (RB_ROOT(head) == NULL) + +#define RB_SET(elm, parent, field) do { \ + RB_PARENT(elm, field) = parent; \ + RB_LEFT(elm, field) = RB_RIGHT(elm, field) = NULL; \ + RB_COLOR(elm, field) = RB_RED; \ +} while (0) + +#define RB_SET_BLACKRED(black, red, field) do { \ + RB_COLOR(black, field) = RB_BLACK; \ + RB_COLOR(red, field) = RB_RED; \ +} while (0) + +#define RB_ROTATE_LEFT(head, elm, tmp, field) do { \ + (tmp) = RB_RIGHT(elm, field); \ + if ((RB_RIGHT(elm, field) = RB_LEFT(tmp, field))) { \ + RB_PARENT(RB_LEFT(tmp, field), field) = (elm); \ + } \ + if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field))) { \ + if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \ + RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ + else \ + RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \ + } else \ + (head)->rbh_root = (tmp); \ + RB_LEFT(tmp, field) = (elm); \ + RB_PARENT(elm, field) = (tmp); \ +} while (0) + +#define RB_ROTATE_RIGHT(head, elm, tmp, field) do { \ + (tmp) = RB_LEFT(elm, field); \ + if ((RB_LEFT(elm, field) = RB_RIGHT(tmp, field))) { \ + RB_PARENT(RB_RIGHT(tmp, field), field) = (elm); \ + } \ + if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field))) { \ + if ((elm) == 
RB_LEFT(RB_PARENT(elm, field), field)) \ + RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ + else \ + RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \ + } else \ + (head)->rbh_root = (tmp); \ + RB_RIGHT(tmp, field) = (elm); \ + RB_PARENT(elm, field) = (tmp); \ +} while (0) + +/* Generates prototypes and inline functions */ +#define RB_PROTOTYPE(name, type, field, cmp) \ + RB_PROTOTYPE_INTERNAL(name, type, field, cmp,) +#define RB_PROTOTYPE_STATIC(name, type, field, cmp) \ + RB_PROTOTYPE_INTERNAL(name, type, field, cmp, __attribute__((__unused__)) static) +#define RB_PROTOTYPE_INTERNAL(name, type, field, cmp, attr) \ +attr void name##_RB_INSERT_COLOR(struct name *, struct type *); \ +attr void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *);\ +attr struct type *name##_RB_REMOVE(struct name *, struct type *); \ +attr struct type *name##_RB_INSERT(struct name *, struct type *); \ +attr struct type *name##_RB_FIND(struct name *, struct type *); \ +attr struct type *name##_RB_NFIND(struct name *, struct type *); \ +attr struct type *name##_RB_NEXT(struct type *); \ +attr struct type *name##_RB_PREV(struct type *); \ +attr struct type *name##_RB_MINMAX(struct name *, int); \ + \ + +/* Main rb operation. 
+ * Moves node close to the key of elm to top + */ +#define RB_GENERATE(name, type, field, cmp) \ + RB_GENERATE_INTERNAL(name, type, field, cmp,) +#define RB_GENERATE_STATIC(name, type, field, cmp) \ + RB_GENERATE_INTERNAL(name, type, field, cmp, __attribute__((__unused__)) static) +#define RB_GENERATE_INTERNAL(name, type, field, cmp, attr) \ +attr void \ +name##_RB_INSERT_COLOR(struct name *head, struct type *elm) \ +{ \ + struct type *parent, *gparent, *tmp; \ + while ((parent = RB_PARENT(elm, field)) && \ + RB_COLOR(parent, field) == RB_RED) { \ + gparent = RB_PARENT(parent, field); \ + if (parent == RB_LEFT(gparent, field)) { \ + tmp = RB_RIGHT(gparent, field); \ + if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ + RB_COLOR(tmp, field) = RB_BLACK; \ + RB_SET_BLACKRED(parent, gparent, field);\ + elm = gparent; \ + continue; \ + } \ + if (RB_RIGHT(parent, field) == elm) { \ + RB_ROTATE_LEFT(head, parent, tmp, field);\ + tmp = parent; \ + parent = elm; \ + elm = tmp; \ + } \ + RB_SET_BLACKRED(parent, gparent, field); \ + RB_ROTATE_RIGHT(head, gparent, tmp, field); \ + } else { \ + tmp = RB_LEFT(gparent, field); \ + if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ + RB_COLOR(tmp, field) = RB_BLACK; \ + RB_SET_BLACKRED(parent, gparent, field);\ + elm = gparent; \ + continue; \ + } \ + if (RB_LEFT(parent, field) == elm) { \ + RB_ROTATE_RIGHT(head, parent, tmp, field);\ + tmp = parent; \ + parent = elm; \ + elm = tmp; \ + } \ + RB_SET_BLACKRED(parent, gparent, field); \ + RB_ROTATE_LEFT(head, gparent, tmp, field); \ + } \ + } \ + RB_COLOR(head->rbh_root, field) = RB_BLACK; \ +} \ + \ +attr void \ +name##_RB_REMOVE_COLOR(struct name *head, struct type *parent, struct type *elm) \ +{ \ + struct type *tmp; \ + while ((elm == NULL || RB_COLOR(elm, field) == RB_BLACK) && \ + elm != RB_ROOT(head)) { \ + if (RB_LEFT(parent, field) == elm) { \ + tmp = RB_RIGHT(parent, field); \ + if (RB_COLOR(tmp, field) == RB_RED) { \ + RB_SET_BLACKRED(tmp, parent, field); \ + 
RB_ROTATE_LEFT(head, parent, tmp, field);\ + tmp = RB_RIGHT(parent, field); \ + } \ + if ((RB_LEFT(tmp, field) == NULL || \ + RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\ + (RB_RIGHT(tmp, field) == NULL || \ + RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\ + RB_COLOR(tmp, field) = RB_RED; \ + elm = parent; \ + parent = RB_PARENT(elm, field); \ + } else { \ + if (RB_RIGHT(tmp, field) == NULL || \ + RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK) {\ + struct type *oleft; \ + if ((oleft = RB_LEFT(tmp, field)))\ + RB_COLOR(oleft, field) = RB_BLACK;\ + RB_COLOR(tmp, field) = RB_RED; \ + RB_ROTATE_RIGHT(head, tmp, oleft, field);\ + tmp = RB_RIGHT(parent, field); \ + } \ + RB_COLOR(tmp, field) = RB_COLOR(parent, field);\ + RB_COLOR(parent, field) = RB_BLACK; \ + if (RB_RIGHT(tmp, field)) \ + RB_COLOR(RB_RIGHT(tmp, field), field) = RB_BLACK;\ + RB_ROTATE_LEFT(head, parent, tmp, field);\ + elm = RB_ROOT(head); \ + break; \ + } \ + } else { \ + tmp = RB_LEFT(parent, field); \ + if (RB_COLOR(tmp, field) == RB_RED) { \ + RB_SET_BLACKRED(tmp, parent, field); \ + RB_ROTATE_RIGHT(head, parent, tmp, field);\ + tmp = RB_LEFT(parent, field); \ + } \ + if ((RB_LEFT(tmp, field) == NULL || \ + RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\ + (RB_RIGHT(tmp, field) == NULL || \ + RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\ + RB_COLOR(tmp, field) = RB_RED; \ + elm = parent; \ + parent = RB_PARENT(elm, field); \ + } else { \ + if (RB_LEFT(tmp, field) == NULL || \ + RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) {\ + struct type *oright; \ + if ((oright = RB_RIGHT(tmp, field)))\ + RB_COLOR(oright, field) = RB_BLACK;\ + RB_COLOR(tmp, field) = RB_RED; \ + RB_ROTATE_LEFT(head, tmp, oright, field);\ + tmp = RB_LEFT(parent, field); \ + } \ + RB_COLOR(tmp, field) = RB_COLOR(parent, field);\ + RB_COLOR(parent, field) = RB_BLACK; \ + if (RB_LEFT(tmp, field)) \ + RB_COLOR(RB_LEFT(tmp, field), field) = RB_BLACK;\ + RB_ROTATE_RIGHT(head, parent, tmp, field);\ + 
elm = RB_ROOT(head); \ + break; \ + } \ + } \ + } \ + if (elm) \ + RB_COLOR(elm, field) = RB_BLACK; \ +} \ + \ +attr struct type * \ +name##_RB_REMOVE(struct name *head, struct type *elm) \ +{ \ + struct type *child, *parent, *old = elm; \ + int color; \ + if (RB_LEFT(elm, field) == NULL) \ + child = RB_RIGHT(elm, field); \ + else if (RB_RIGHT(elm, field) == NULL) \ + child = RB_LEFT(elm, field); \ + else { \ + struct type *left; \ + elm = RB_RIGHT(elm, field); \ + while ((left = RB_LEFT(elm, field))) \ + elm = left; \ + child = RB_RIGHT(elm, field); \ + parent = RB_PARENT(elm, field); \ + color = RB_COLOR(elm, field); \ + if (child) \ + RB_PARENT(child, field) = parent; \ + if (parent) { \ + if (RB_LEFT(parent, field) == elm) \ + RB_LEFT(parent, field) = child; \ + else \ + RB_RIGHT(parent, field) = child; \ + } else \ + RB_ROOT(head) = child; \ + if (RB_PARENT(elm, field) == old) \ + parent = elm; \ + (elm)->field = (old)->field; \ + if (RB_PARENT(old, field)) { \ + if (RB_LEFT(RB_PARENT(old, field), field) == old)\ + RB_LEFT(RB_PARENT(old, field), field) = elm;\ + else \ + RB_RIGHT(RB_PARENT(old, field), field) = elm;\ + } else \ + RB_ROOT(head) = elm; \ + RB_PARENT(RB_LEFT(old, field), field) = elm; \ + if (RB_RIGHT(old, field)) \ + RB_PARENT(RB_RIGHT(old, field), field) = elm; \ + if (parent) { \ + left = parent; \ + do { \ + } while ((left = RB_PARENT(left, field))); \ + } \ + goto color; \ + } \ + parent = RB_PARENT(elm, field); \ + color = RB_COLOR(elm, field); \ + if (child) \ + RB_PARENT(child, field) = parent; \ + if (parent) { \ + if (RB_LEFT(parent, field) == elm) \ + RB_LEFT(parent, field) = child; \ + else \ + RB_RIGHT(parent, field) = child; \ + } else \ + RB_ROOT(head) = child; \ +color: \ + if (color == RB_BLACK) \ + name##_RB_REMOVE_COLOR(head, parent, child); \ + return (old); \ +} \ + \ +/* Inserts a node into the RB tree */ \ +attr struct type * \ +name##_RB_INSERT(struct name *head, struct type *elm) \ +{ \ + struct type *tmp; \ + struct type 
*parent = NULL; \ + int comp = 0; \ + tmp = RB_ROOT(head); \ + while (tmp) { \ + parent = tmp; \ + comp = (cmp)(elm, parent); \ + if (comp < 0) \ + tmp = RB_LEFT(tmp, field); \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + RB_SET(elm, parent, field); \ + if (parent != NULL) { \ + if (comp < 0) \ + RB_LEFT(parent, field) = elm; \ + else \ + RB_RIGHT(parent, field) = elm; \ + } else \ + RB_ROOT(head) = elm; \ + name##_RB_INSERT_COLOR(head, elm); \ + return (NULL); \ +} \ + \ +/* Finds the node with the same key as elm */ \ +attr struct type * \ +name##_RB_FIND(struct name *head, struct type *elm) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + int comp; \ + while (tmp) { \ + comp = cmp(elm, tmp); \ + if (comp < 0) \ + tmp = RB_LEFT(tmp, field); \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + return (NULL); \ +} \ + \ +/* Finds the first node greater than or equal to the search key */ \ +attr struct type * \ +name##_RB_NFIND(struct name *head, struct type *elm) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + struct type *res = NULL; \ + int comp; \ + while (tmp) { \ + comp = cmp(elm, tmp); \ + if (comp < 0) { \ + res = tmp; \ + tmp = RB_LEFT(tmp, field); \ + } \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + return (res); \ +} \ + \ +attr struct type * \ +name##_RB_NEXT(struct type *elm) \ +{ \ + if (RB_RIGHT(elm, field)) { \ + elm = RB_RIGHT(elm, field); \ + while (RB_LEFT(elm, field)) \ + elm = RB_LEFT(elm, field); \ + } else { \ + if (RB_PARENT(elm, field) && \ + (elm == RB_LEFT(RB_PARENT(elm, field), field))) \ + elm = RB_PARENT(elm, field); \ + else { \ + while (RB_PARENT(elm, field) && \ + (elm == RB_RIGHT(RB_PARENT(elm, field), field)))\ + elm = RB_PARENT(elm, field); \ + elm = RB_PARENT(elm, field); \ + } \ + } \ + return (elm); \ +} \ + \ +attr struct type * \ +name##_RB_PREV(struct type *elm) \ +{ \ + if (RB_LEFT(elm, field)) { \ + 
elm = RB_LEFT(elm, field); \ + while (RB_RIGHT(elm, field)) \ + elm = RB_RIGHT(elm, field); \ + } else { \ + if (RB_PARENT(elm, field) && \ + (elm == RB_RIGHT(RB_PARENT(elm, field), field))) \ + elm = RB_PARENT(elm, field); \ + else { \ + while (RB_PARENT(elm, field) && \ + (elm == RB_LEFT(RB_PARENT(elm, field), field)))\ + elm = RB_PARENT(elm, field); \ + elm = RB_PARENT(elm, field); \ + } \ + } \ + return (elm); \ +} \ + \ +attr struct type * \ +name##_RB_MINMAX(struct name *head, int val) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + struct type *parent = NULL; \ + while (tmp) { \ + parent = tmp; \ + if (val < 0) \ + tmp = RB_LEFT(tmp, field); \ + else \ + tmp = RB_RIGHT(tmp, field); \ + } \ + return (parent); \ +} + +#define RB_NEGINF -1 +#define RB_INF 1 + +#define RB_INSERT(name, x, y) name##_RB_INSERT(x, y) +#define RB_REMOVE(name, x, y) name##_RB_REMOVE(x, y) +#define RB_FIND(name, x, y) name##_RB_FIND(x, y) +#define RB_NFIND(name, x, y) name##_RB_NFIND(x, y) +#define RB_NEXT(name, x, y) name##_RB_NEXT(y) +#define RB_PREV(name, x, y) name##_RB_PREV(y) +#define RB_MIN(name, x) name##_RB_MINMAX(x, RB_NEGINF) +#define RB_MAX(name, x) name##_RB_MINMAX(x, RB_INF) + +#define RB_FOREACH(x, name, head) \ + for ((x) = RB_MIN(name, head); \ + (x) != NULL; \ + (x) = name##_RB_NEXT(x)) + +#define RB_FOREACH_SAFE(x, name, head, y) \ + for ((x) = RB_MIN(name, head); \ + ((x) != NULL) && ((y) = name##_RB_NEXT(x), 1); \ + (x) = (y)) + +#define RB_FOREACH_REVERSE(x, name, head) \ + for ((x) = RB_MAX(name, head); \ + (x) != NULL; \ + (x) = name##_RB_PREV(x)) + +#define RB_FOREACH_REVERSE_SAFE(x, name, head, y) \ + for ((x) = RB_MAX(name, head); \ + ((x) != NULL) && ((y) = name##_RB_PREV(x), 1); \ + (x) = (y)) + +#endif /* TREE_H */ (DIR) diff --git a/webdump.c b/webdump.c @@ -10,6 +10,7 @@ #include "arg.h" char *argv0; +#include "tree.h" #include "xml.h" static XMLParser parser; @@ -151,19 +152,31 @@ struct selectors { size_t count; }; -/* linked-list of link references */ 
+/* RB tree of link references */ struct linkref { char *type; enum TagId tagid; char *url; int ishidden; size_t linknr; - struct linkref *next; + RB_ENTRY(linkref) entry; }; -static struct linkref *links_head; -static struct linkref *links_cur; -static int linkcount; /* visible link count */ +/* link references and hidden link references */ +struct linkref **visrefs; +static size_t nvisrefs, ncapvisrefs; /* visible link count / capacity */ +struct linkref **hiddenrefs; +static size_t nhiddenrefs, ncaphiddenrefs; /* hidden link count / capacity */ + +int +linkrefcmp(struct linkref *r1, struct linkref *r2) +{ + return strcmp(r1->url, r2->url); +} + +RB_HEAD(linkreftree, linkref) linkrefhead = RB_INITIALIZER(&linkrefhead); +RB_PROTOTYPE(linkreftree, linkref, entry, linkrefcmp) +RB_GENERATE(linkreftree, linkref, entry, linkrefcmp) static const char *str_bullet_item = "* "; static const char *str_checkbox_checked = "x"; @@ -213,10 +226,11 @@ static char rbuf[1024]; static int rbuflen; static int rnbufcells = 0; /* pending cell count to add */ -#define MAX_DEPTH 256 -static struct node nodes[MAX_DEPTH]; -static String nodes_links[MAX_DEPTH]; /* keep track of links per node */ -static int curnode; +#define MAX_NODE_DEPTH 65535 /* absolute maximum node depth */ +static struct node *nodes; +static String *nodes_links; /* keep track of links per node */ +static size_t ncapnodes; +static int curnode; /* current node depth */ /* reader / selector mode */ static int reader_mode = 0; @@ -1378,39 +1392,59 @@ handleinlinealt(void) } } -/* slow linear lookup of link references - TODO: optimize it, maybe using tree.h RB_TREE? 
*/ +/* lookup a link reference by url in the red-black tree */ static struct linkref * findlinkref(const char *url) { - struct linkref *cur; + struct linkref find; - for (cur = links_head; cur; cur = cur->next) { - if (!strcmp(url, cur->url)) - return cur; - } - return NULL; + find.url = (char *)url; + + return RB_FIND(linkreftree, &linkrefhead, &find); } +/* add a link reference. Returns the added link reference, or the existing link + reference if links are deduplicated */ static struct linkref * -addlinkref(const char *url, const char *_type, enum TagId tagid, int ishidden, - int linknr) +addlinkref(const char *url, const char *_type, enum TagId tagid, int ishidden) { + struct linkref *link; + size_t linknr; + + /* if links are deduplicates return the existing link */ + if (uniqrefs && (link = findlinkref(url))) + return link; + if (tagid == TagA) _type = "link"; - /* add to linked list */ - if (!links_head) - links_cur = links_head = ecalloc(1, sizeof(*links_head)); - else - links_cur = links_cur->next = ecalloc(1, sizeof(*links_head)); - links_cur->url = estrdup(url); - links_cur->type = estrdup(_type); - links_cur->tagid = tagid; - links_cur->ishidden = ishidden; - links_cur->linknr = linknr; + link = ecalloc(1, sizeof(*link)); + + if (!ishidden) { + linknr = ++nvisrefs; + if (nvisrefs >= ncapvisrefs) + ncapvisrefs += 256; /* greedy alloc */ + visrefs = erealloc(visrefs, sizeof(*visrefs) * ncapvisrefs); + visrefs[linknr - 1] = link; /* add pointer to list */ + } else { + linknr = ++nhiddenrefs; + if (nhiddenrefs >= ncaphiddenrefs) + ncaphiddenrefs += 256; /* greedy alloc */ + hiddenrefs = erealloc(hiddenrefs, sizeof(*hiddenrefs) * ncaphiddenrefs); + hiddenrefs[linknr - 1] = link; /* add pointer to list */ + } - return links_cur; + link->url = estrdup(url); + link->type = estrdup(_type); + link->tagid = tagid; + link->ishidden = ishidden; + link->linknr = linknr; + + /* add to tree: the tree is only used for checking unique link references */ + if (uniqrefs) + 
RB_INSERT(linkreftree, &linkrefhead, link); + + return link; } static void @@ -1462,43 +1496,65 @@ handleinlinelink(void) /* add hidden links directly to the reference, the order doesn't matter */ if (cur->tag.displaytype & DisplayNone) - addlinkref(url, cur->tag.name, cur->tag.id, 1, 0); + addlinkref(url, cur->tag.name, cur->tag.id, 1); } void printlinkrefs(void) { + struct linkref *ref; size_t i; - int hashiddenrefs = 0; - if (!links_head) + if (!nvisrefs && !nhiddenrefs) return; if (resources) { - for (i = 1, links_cur = links_head; links_cur; links_cur = links_cur->next, i++) - dprintf(3, "%s\t%s\n", links_cur->type, links_cur->url); + for (i = 0; i < nvisrefs; i++) { + ref = visrefs[i]; + dprintf(3, "%s\t%s\n", ref->type, ref->url); + } + for (i = 0; i < nhiddenrefs; i++) { + ref = hiddenrefs[i]; + dprintf(3, "%s\t%s\n", ref->type, ref->url); + } } printf("\nReferences\n\n"); - i = 1; - for (links_cur = links_head; links_cur; links_cur = links_cur->next) { - if (links_cur->ishidden) { - hashiddenrefs = 1; - continue; - } - printf(" %zu. %s (%s)\n", links_cur->linknr, links_cur->url, links_cur->type); - i++; + for (i = 0; i < nvisrefs; i++) { + ref = visrefs[i]; + printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->type); } - if (hashiddenrefs) + if (nhiddenrefs > 0) printf("\n\nHidden references\n\n"); /* hidden links don't have a link number, just count them */ - for (links_cur = links_head; links_cur; links_cur = links_cur->next) { - if (!links_cur->ishidden) - continue; - printf(" %zu. %s (%s)\n", i, links_cur->url, links_cur->type); - i++; + for (i = 0; i < nhiddenrefs; i++) { + ref = hiddenrefs[i]; + printf(" %zu. 
%s (%s)\n", ref->linknr, ref->url, ref->type); + } +} + +#define NODE_CAP_INC 256 + +/* increase node depth, allocate space for nodes if needed */ +static void +incnode(void) +{ + curnode++; + + if (curnode >= MAX_NODE_DEPTH) + errx(1, "max node depth reached: %d", curnode); + + if (curnode >= ncapnodes) { + nodes = erealloc(nodes, sizeof(*nodes) * (ncapnodes + NODE_CAP_INC)); + nodes_links = erealloc(nodes_links, sizeof(*nodes_links) * (ncapnodes + NODE_CAP_INC)); + + /* clear new region */ + memset(&nodes[ncapnodes], 0, sizeof(*nodes) * NODE_CAP_INC); + memset(&nodes_links[ncapnodes], 0, sizeof(*nodes_links) * NODE_CAP_INC); + + ncapnodes += NODE_CAP_INC; /* greedy alloc */ } } @@ -1670,17 +1726,8 @@ endnode(struct node *cur) /* add link and show the link number in the visible order */ if (!ishidden && nodes_links[curnode].len > 0) { - if (uniqrefs) - ref = findlinkref(nodes_links[curnode].data); - else - ref = NULL; - - /* new link: add it */ - if (!ref) { - linkcount++; - ref = addlinkref(nodes_links[curnode].data, - cur->tag.name, cur->tag.id, ishidden, linkcount); - } + ref = addlinkref(nodes_links[curnode].data, + cur->tag.name, cur->tag.id, ishidden); if (showrefinline || showurlinline) { hflush(); @@ -1825,9 +1872,6 @@ xmltagstart(XMLParser *p, const char *t, size_t tl) char *s; int i, j, k, nchildfound, parenttype; - if (curnode >= MAX_DEPTH - 2) - errx(1, "max tag depth reached: %d\n", curnode); - cur = &nodes[curnode]; string_clear(&attr_alt); @@ -1920,7 +1964,7 @@ xmltagstart(XMLParser *p, const char *t, size_t tl) } } - curnode++; + incnode(); string_clear(&nodes_links[curnode]); /* clear possible link reference for this node */ cur = &nodes[curnode]; memset(cur, 0, sizeof(*cur)); /* clear / reset node */ @@ -2333,6 +2377,11 @@ main(int argc, char **argv) usage(); } ARGEND + /* initial nodes */ + ncapnodes = NODE_CAP_INC; + nodes = ecalloc(ncapnodes, sizeof(*nodes)); + nodes_links = ecalloc(ncapnodes, sizeof(*nodes_links)); + /* top-most document root 
needs initialization */ nodes[0].tag.name = "";