for the class and id attribute use the first value set - webdump - HTML to plain-text converter for webpages (HTM) git clone git://git.codemadness.org/webdump (DIR) Log (DIR) Files (DIR) Refs (DIR) README (DIR) LICENSE --- (DIR) commit ae36c548e48ddea692a87557938441bb7cd54994 (DIR) parent 4793272ce07153284318336426796cb7e3c93af4 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org> Date: Wed, 20 Sep 2023 18:51:10 +0200 for the class and id attribute use the first value set + small code-style tweaks. Diffstat: M webdump.c | 35 +++++++++++++++++++------------ 1 file changed, 22 insertions(+), 13 deletions(-) --- (DIR) diff --git a/webdump.c b/webdump.c @@ -191,15 +191,17 @@ static int basehrefset; /* base href set and can be used? */ static struct uri base; /* parsed current base href */ /* buffers for some attributes of the current tag */ -String attr_alt; /* alt attribute */ -String attr_checked; /* checked attribute */ -String attr_class; /* class attribute */ -String attr_data; /* data attribute */ -String attr_href; /* href attribute */ -String attr_id; /* id attribute */ -String attr_src; /* src attribute */ -String attr_type; /* type attribute */ -String attr_value; /* value attribute */ +static String attr_alt; /* alt attribute */ +static String attr_checked; /* checked attribute */ +static String attr_class; /* class attribute */ +static int attr_class_set; /* class attribute is set already */ +static String attr_data; /* data attribute */ +static String attr_href; /* href attribute */ +static String attr_id; /* id attribute */ +static int attr_id_set; /* id attribute is set already */ +static String attr_src; /* src attribute */ +static String attr_type; /* type attribute */ +static String attr_value; /* value attribute */ static String htmldata; /* buffered HTML data near the current tag */ @@ -1870,9 +1872,11 @@ xmltagstart(XMLParser *p, const char *t, size_t tl) string_clear(&attr_alt); string_clear(&attr_checked); string_clear(&attr_class); + 
attr_class_set = 0; string_clear(&attr_data); string_clear(&attr_href); string_clear(&attr_id); + attr_id_set = 0; string_clear(&attr_src); string_clear(&attr_type); string_clear(&attr_value); @@ -2191,9 +2195,9 @@ xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, if (!attrcmp(n, "aria-hidden") || !attrcmp(n, "hidden")) cur->tag.displaytype |= DisplayNone; - if (!attrcmp(n, "class")) + if (!attr_class_set && !attrcmp(n, "class")) /* use the first set attribute */ string_append(&attr_class, v, vl); - else if (!attrcmp(n, "id")) + else if (!attr_id_set && !attrcmp(n, "id")) /* use the first set attribute */ string_append(&attr_id, v, vl); else if (!attrcmp(n, "type")) string_append(&attr_type, v, vl); @@ -2262,6 +2266,11 @@ xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, cur = &nodes[curnode]; tagid = cur->tag.id; + if (!attr_class_set && !attrcmp(n, "class")) + attr_class_set = 1; + else if (!attr_id_set && !attrcmp(n, "id")) + attr_id_set = 1; + /* set base URL, if it is set it cannot be overwritten again */ if (!basehrefset && basehrefdoc[0] && tagid == TagBase && !attrcmp(n, "href")) @@ -2286,13 +2295,13 @@ xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, string_clear(&attr_alt); else if (!attrcmp(n, "checked")) string_clear(&attr_checked); - else if (!attrcmp(n, "class")) + else if (!attr_class_set && !attrcmp(n, "class")) string_clear(&attr_class); else if (!attrcmp(n, "data")) string_clear(&attr_data); else if (!attrcmp(n, "href")) string_clear(&attr_href); - else if (!attrcmp(n, "id")) + else if (!attr_id_set && !attrcmp(n, "id")) string_clear(&attr_id); else if (!attrcmp(n, "src")) string_clear(&attr_src);