improve base URL and <base href /> handling - webdump - HTML to plain-text converter for webpages (HTM) git clone git://git.codemadness.org/webdump (DIR) Log (DIR) Files (DIR) Refs (DIR) README (DIR) LICENSE --- (DIR) commit 0705fb754f00c7866b2cc8cee0739a88a584a2e1 (DIR) parent 7d4723febabeb679e1980c12b5dfd3b656475b4f (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org> Date: Fri, 8 Sep 2023 13:09:37 +0200 improve base URL and <base href /> handling - Parse the base URI once and reuse the structure (optimization). - Once it is parsed it cannot be overwritten again. This matches the browser more closely. Diffstat: M webdump.1 | 1 + M webdump.c | 30 ++++++++++++++++-------------- 2 files changed, 17 insertions(+), 14 deletions(-) --- (DIR) diff --git a/webdump.1 b/webdump.1 @@ -27,6 +27,7 @@ Toggle ANSI escape codes usage, by default it is not enabled. .It Fl b Ar baseurl Base URL of links. This is used to make links absolute. +The specified URL is always preferred over the value in a <base/> tag. .It Fl i Toggle if link reference numbers are displayed inline or not, by default it is not enabled. (DIR) diff --git a/webdump.c b/webdump.c @@ -148,6 +148,8 @@ static const char *str_ruler = "-"; /* base href, to make URLs absolute */ static char *basehref = ""; static char basehrefdoc[4096]; /* base href in document, if any */ +static int basehrefset = 0; /* base href set and can be used? */ +static struct uri base; /* buffers for some attributes of the current tag */ String attr_alt; /* alt attribute */ @@ -1311,14 +1313,13 @@ addlinkref(const char *url, const char *_type, int ishidden) links_cur->ishidden = ishidden; } -/* TODO: make parsed base URL global and overwrite it once. */ static void handleinlinelink(void) { - struct uri base, newuri, olduri; + struct uri newuri, olduri; struct node *cur; char buf[4096], *url; - int b, r; + int r; if (!showrefbottom && !showrefinline && !showurlinline && !resources) return; /* there is no need to collect the reference */ @@ -1332,15 +1333,9 @@ handleinlinelink(void) else url = attr_href.data; - b = -1; - if (uri_hasscheme(url)) - ; /* already absolute: nothing to do */ - else if (basehref[0]) /* prefer -b option over <base> */ - b = uri_parse(basehref, &base); - else if (basehrefdoc[0]) - b = uri_parse(basehrefdoc, &base); - - if (b != -1 && + /* Not an absolute URL yet: try to make it absolute. + If it is not possible use the relative URL */ + if (!uri_hasscheme(url) && basehrefset && uri_parse(url, &olduri) != -1 && uri_makeabs(&newuri, &olduri, &base) != -1 && newuri.proto[0]) { @@ -1948,7 +1943,7 @@ xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name, string_append(&attr_id, value, valuelen); /* <base href="..." /> */ - if (!attrcmp(name, "href") && !tagcmp(tag, "base")) + if (!basehrefset && !attrcmp(name, "href") && !tagcmp(tag, "base")) strlcat(basehrefdoc, value, sizeof(basehrefdoc)); /* hide tags with attribute aria-hidden or hidden */ @@ -1992,6 +1987,10 @@ static void xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl) { + /* set base URL, if it is set it cannot be overwritten again */ + if (!basehrefset && basehrefdoc[0] && + !attrcmp(n, "href") && !tagcmp(t, "base")) + basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : 0; } static void @@ -2013,7 +2012,7 @@ xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, else if (!attrcmp(n, "value")) string_clear(&attr_value); - if (!attrcmp(n, "href") && !tagcmp(t, "base")) + if (basehrefdoc[0] && !attrcmp(n, "href") && !tagcmp(t, "base")) basehrefdoc[0] = '\0'; } @@ -2040,6 +2039,9 @@ main(int argc, char **argv) break; case 'b': basehref = EARGF(usage()); + if (uri_parse(basehref, &base) == -1) + usage(); + basehrefset = 1; break; case 'i': showrefinline = !showrefinline;