improve link references, add option to show full URL inline - webdump - HTML to plain-text converter for webpages
 (HTM) git clone git://git.codemadness.org/webdump
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 6365a78f6c050106e64b281d29d8ef550f131bf1
 (DIR) parent 56ec7ea6c49d79cc3aaf301d2e6040e15d17785a
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Fri,  8 Sep 2023 11:25:13 +0200
       
       improve link references, add option to show full URL inline
       
       - fix URL references not being visible when only the -l option is specified
         (without -i).  Now each option can be specified separately.
       - add -I option to show the full URL inline.
       
       Diffstat:
         M webdump.1                           |       5 ++++-
         M webdump.c                           |      90 +++++++++++++++++--------------
       
       2 files changed, 53 insertions(+), 42 deletions(-)
       ---
 (DIR) diff --git a/webdump.1 b/webdump.1
       @@ -6,7 +6,7 @@
        .Nd convert HTML to plain-text
        .Sh SYNOPSIS
        .Nm
       -.Op Fl 8ailrx
       +.Op Fl 8aiIlrx
        .Op Fl b Ar baseurl
        .Op Fl s Ar selector
        .Op Fl u Ar selector
       @@ -30,6 +30,9 @@ This is used to make links absolute.
        .It Fl i
        Toggle if link reference numbers are displayed inline or not, by default it is
        not enabled.
       +.It Fl I
       +Toggle if URLs for link references are displayed inline or not, by default it is
       +not enabled.
        .It Fl l
        Toggle if link references are displayed at the bottom or not, by default it is
        not enabled.
 (DIR) diff --git a/webdump.c b/webdump.c
       @@ -47,6 +47,7 @@ struct uri {
        static int allowansi     = 0;  /* allow ANSI escape codes */
        static int showrefbottom = 0;  /* show link references at the bottom */
        static int showrefinline = 0;  /* show link reference number inline */
       +static int showurlinline = 0;  /* show full link reference inline */
        static int linewrap      = 0;  /* line-wrapping */
        static int termwidth     = 77; /* terminal width */
        static int resources     = 0;  /* write resources line-by-line to fd 3? */
       @@ -1319,46 +1320,49 @@ handleinlinelink(void)
                char buf[4096], *url;
                int b, r;
        
       -        /* show links as reference at the bottom */
       -        if ((showrefbottom || resources) && (attr_src.len || attr_href.len)) {
       -                /* by default use the original URL */
       -                if (attr_src.len)
       -                        url = attr_src.data;
       -                else
       -                        url = attr_href.data;
       -
       -                b = -1;
       -                if (uri_hasscheme(url))
       -                        ; /* already absolute: nothing to do */
       -                else if (basehref[0]) /* prefer -b option over <base> */
       -                        b = uri_parse(basehref, &base);
       -                else if (basehrefdoc[0])
       -                        b = uri_parse(basehrefdoc, &base);
       -
       -                if (b != -1 &&
       -                    uri_parse(url, &olduri) != -1 &&
       -                    uri_makeabs(&newuri, &olduri, &base) != -1 &&
       -                    newuri.proto[0]) {
       -                        r = uri_format(buf, sizeof(buf), &newuri);
       -                        if (r >= 0 && (size_t)r < sizeof(buf))
       -                                url = buf;
       -                }
       +        if (!showrefbottom && !showrefinline && !showurlinline && !resources)
       +                return; /* there is no need to collect the reference */
        
       -                if (!url[0])
       -                        return;
       +        if (!attr_src.len && !attr_href.len)
       +                return; /* there is no reference */
        
       -                cur = &nodes[curnode];
       +        /* by default use the original URL */
       +        if (attr_src.len)
       +                url = attr_src.data;
       +        else
       +                url = attr_href.data;
       +
       +        b = -1;
       +        if (uri_hasscheme(url))
       +                ; /* already absolute: nothing to do */
       +        else if (basehref[0]) /* prefer -b option over <base> */
       +                b = uri_parse(basehref, &base);
       +        else if (basehrefdoc[0])
       +                b = uri_parse(basehrefdoc, &base);
       +
       +        if (b != -1 &&
       +            uri_parse(url, &olduri) != -1 &&
       +            uri_makeabs(&newuri, &olduri, &base) != -1 &&
       +            newuri.proto[0]) {
       +                r = uri_format(buf, sizeof(buf), &newuri);
       +                if (r >= 0 && (size_t)r < sizeof(buf))
       +                        url = buf;
       +        }
        
       -                if (showrefinline && !(cur->tag.displaytype & DisplayNone)) {
       -                        string_clear(&nodes_links[curnode]);
       -                        string_append(&nodes_links[curnode], url, strlen(url));
       -                }
       +        if (!url[0])
       +                return;
       +
       +        cur = &nodes[curnode];
        
       -                /* add hidden links directly to the reference,
       -                   the order doesn't matter */
       -                if (cur->tag.displaytype & DisplayNone)
       -                        addlinkref(url, cur->tag.name, 1);
       +        if (!(cur->tag.displaytype & DisplayNone)) {
       +                string_clear(&nodes_links[curnode]);
       +                string_append(&nodes_links[curnode], url, strlen(url));
                }
       +
       +        /* add hidden links directly to the reference,
       +           the order doesn't matter */
       +        if (cur->tag.displaytype & DisplayNone)
       +                addlinkref(url, cur->tag.name, 1);
        }
        
        void
       @@ -1574,11 +1578,12 @@ endnode(struct node *cur)
                /* add link and show the link number in the visible order */
                if (!ishidden && nodes_links[curnode].len > 0) {
                        addlinkref(nodes_links[curnode].data, cur->tag.name, ishidden);
       -#if 1
       -                hprintf("[%zu]", ++linkcount);
       -#else
       -                hprintf("[%s: %s]", cur->tag.name, nodes_links[curnode].data);
       -#endif
       +                if (showrefinline)
       +                        hprintf("[%zu]", ++linkcount);
       +                if (showurlinline)
       +                        hprintf(" [%s: %s]",
       +                                !tagcmp(cur->tag.name, "a") ? "link" : cur->tag.name,
       +                                nodes_links[curnode].data);
                }
        
                handleendtag(&(cur->tag));
       @@ -2014,7 +2019,7 @@ xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n,
        void
        usage(void)
        {
       -        fprintf(stderr, "%s [-8ailrx] [-b basehref] [-s selector] [-u selector] [-w termwidth]\n", argv0);
       +        fprintf(stderr, "%s [-8aiIlrx] [-b basehref] [-s selector] [-u selector] [-w termwidth]\n", argv0);
                exit(1);
        }
        
       @@ -2038,6 +2043,9 @@ main(int argc, char **argv)
                case 'i':
                        showrefinline = !showrefinline;
                        break;
       +        case 'I':
       +                showurlinline = !showurlinline;
       +                break;
                case 'l':
                        showrefbottom = !showrefbottom;
                        break;