add option for unique link references (-d) - webdump - HTML to plain-text converter for webpages
 (HTM) git clone git://git.codemadness.org/webdump
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 91d236dab89449465eb123d756a450a17eb4195a
 (DIR) parent 790402682bab675461f2a12879408dd5ad30c90f
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Tue, 12 Sep 2023 20:02:57 +0200
       
       add option for unique link references (-d)
       
       ... also make link type "a" consistently "link" (also in the references
       at the bottom).
       
       ... also flush inline link only if needed
       
       Diffstat:
         M webdump.1                           |       7 +++++--
         M webdump.c                           |      87 ++++++++++++++++++++++---------
       
       2 files changed, 67 insertions(+), 27 deletions(-)
       ---
 (DIR) diff --git a/webdump.1 b/webdump.1
       @@ -1,4 +1,4 @@
       -.Dd September 8, 2023
       +.Dd September 12, 2023
        .Dt WEBDUMP 1
        .Os
        .Sh NAME
       @@ -6,7 +6,7 @@
        .Nd convert HTML to plain-text
        .Sh SYNOPSIS
        .Nm
       -.Op Fl 8aiIlrx
       +.Op Fl 8adiIlrx
        .Op Fl b Ar baseurl
        .Op Fl s Ar selector
        .Op Fl u Ar selector
       @@ -28,6 +28,9 @@ Toggle ANSI escape codes usage, by default it is not enabled.
        Base URL of links.
        This is used to make links absolute.
        The specified URL is always preferred over the value in a <base/> tag.
       +.It Fl d
       +Deduplicate link references.
       +When a duplicate link reference is found, reuse the same link reference number.
        .It Fl i
        Toggle if link reference numbers are displayed inline or not, by default it is
        not enabled.
 (DIR) diff --git a/webdump.c b/webdump.c
       @@ -51,12 +51,14 @@ static int showurlinline = 0;  /* show full link reference inline */
        static int linewrap      = 0;  /* line-wrapping */
        static int termwidth     = 77; /* terminal width */
        static int resources     = 0;  /* write resources line-by-line to fd 3? */
       +static int uniqrefs      = 0;  /* number unique references */
        
        /* linked-list of link references */
        struct linkref {
                char *type;
                char *url;
                int ishidden;
       +        size_t linknr; /* number printed for this reference; hidden links get 0 */
                struct linkref *next;
        };
        
       @@ -628,6 +630,20 @@ uri_format(char *buf, size_t bufsiz, struct uri *u)
                        u->fragment);
        }
        
       +/* compare tag names case-insensitively: returns the strcasecmp() result, 0 if equal */
       +int
       +tagcmp(const char *s1, const char *s2)
       +{
       +        return strcasecmp(s1, s2);
       +}
       +
       +/* compare attribute names case-insensitively: returns the strcasecmp() result, 0 if equal */
       +int
       +attrcmp(const char *s1, const char *s2)
       +{
       +        return strcasecmp(s1, s2);
       +}
       +
        static void
        rindent(void)
        {
       @@ -1325,9 +1341,26 @@ handleinlinealt(void)
                }
        }
        
       -static void
       -addlinkref(const char *url, const char *_type, int ishidden)
       +/* return the first link reference whose url equals `url` exactly, or NULL if
       +   none (slow linear lookup). TODO: optimize it, maybe using tree.h RB_TREE? */
       +static struct linkref *
       +findlinkref(const char *url)
        {
       +        struct linkref *cur;
       +
       +        for (cur = links_head; cur; cur = cur->next) {
       +                if (!strcmp(url, cur->url))
       +                        return cur;
       +        }
       +        return NULL;
       +}
       +
       +static struct linkref *
       +addlinkref(const char *url, const char *_type, int ishidden, int linknr)
       +{
       +        if (!tagcmp(_type, "a"))
       +                _type = "link";
       +
                /* add to linked list */
                if (!links_head)
                        links_cur = links_head = ecalloc(1, sizeof(*links_head));
       @@ -1336,6 +1369,9 @@ addlinkref(const char *url, const char *_type, int ishidden)
                links_cur->url = estrdup(url);
                links_cur->type = estrdup(_type);
                links_cur->ishidden = ishidden;
       +        links_cur->linknr = linknr;
       +
       +        return links_cur;
        }
        
        static void
       @@ -1382,7 +1418,7 @@ handleinlinelink(void)
                /* add hidden links directly to the reference,
                   the order doesn't matter */
                if (cur->tag.displaytype & DisplayNone)
       -                addlinkref(url, cur->tag.name, 1);
       +                addlinkref(url, cur->tag.name, 1, 0);
        }
        
        void
       @@ -1407,12 +1443,13 @@ printlinkrefs(void)
                                hashiddenrefs = 1;
                                continue;
                        }
       -                printf(" %zu. %s (%s)\n", i, links_cur->url, links_cur->type);
       +                printf(" %zu. %s (%s)\n", links_cur->linknr, links_cur->url, links_cur->type);
                        i++;
                }
        
                if (hashiddenrefs)
                        printf("\n\nHidden references\n\n");
       +        /* hidden links don't have a link number, just count them */
                for (links_cur = links_head; links_cur; links_cur = links_cur->next) {
                        if (!links_cur->ishidden)
                                continue;
       @@ -1507,20 +1544,6 @@ xmlcdata(XMLParser *p, const char *data, size_t datalen)
                xmldata(p, data, datalen); /* treat CDATA as data */
        }
        
       -/* compare tag name (case-insensitive) */
       -int
       -tagcmp(const char *s1, const char *s2)
       -{
       -        return strcasecmp(s1, s2);
       -}
       -
       -/* compare attribute name (case-insensitive) */
       -int
       -attrcmp(const char *s1, const char *s2)
       -{
       -        return strcasecmp(s1, s2);
       -}
       -
        /* lookup function to compare tag name (case-insensitive) for sort functions */
        int
        findtagcmp(const void *v1, const void *v2)
       @@ -1582,6 +1605,7 @@ handleendtag(struct tag *tag)
        static void
        endnode(struct node *cur)
        {
       +        struct linkref *ref;
                int i, ishidden;
        
                /* set a flag indicating the element and its parent containers have data.
       @@ -1597,14 +1621,24 @@ endnode(struct node *cur)
        
                /* add link and show the link number in the visible order */
                if (!ishidden && nodes_links[curnode].len > 0) {
       -                addlinkref(nodes_links[curnode].data, cur->tag.name, ishidden);
       +                if (uniqrefs)
       +                        ref = findlinkref(nodes_links[curnode].data);
       +                else
       +                        ref = NULL;
       +
       +                /* new link: add it */
       +                if (!ref) {
       +                        linkcount++;
       +                        ref = addlinkref(nodes_links[curnode].data,
       +                                cur->tag.name, ishidden, linkcount);
       +                }
       +
                        if (showrefinline)
       -                        hprintf("[%zu]", ++linkcount);
       +                        hprintf("[%zu]", ref->linknr);
                        if (showurlinline)
       -                        hprintf(" [%s: %s]",
       -                                !tagcmp(cur->tag.name, "a") ? "link" : cur->tag.name,
       -                                nodes_links[curnode].data);
       -                hflush();
       +                        hprintf(" [%s: %s]", ref->type, ref->url);
       +                if (showrefinline || showurlinline)
       +                        hflush();
                }
        
                handleendtag(&(cur->tag));
       @@ -2110,7 +2144,7 @@ xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n,
        void
        usage(void)
        {
       -        fprintf(stderr, "%s [-8aiIlrx] [-b basehref] [-s selector] [-u selector] [-w termwidth]\n", argv0);
       +        fprintf(stderr, "%s [-8adiIlrx] [-b basehref] [-s selector] [-u selector] [-w termwidth]\n", argv0);
                exit(1);
        }
        
       @@ -2134,6 +2168,9 @@ main(int argc, char **argv)
                                usage();
                        basehrefset = 1;
                        break;
       +        case 'd':
       +                uniqrefs = !uniqrefs;
       +                break;
                case 'i':
                        showrefinline = !showrefinline;
                        break;