optional tag handling improvements - webdump - HTML to plain-text converter for webpages (HTM) git clone git://git.codemadness.org/webdump (DIR) Log (DIR) Files (DIR) Refs (DIR) README (DIR) LICENSE --- (DIR) commit 2e32abeb2743e5fce55bdfc1591bb66eedd63a45 (DIR) parent 9f4c3a0a47eb2bb127db5a270dfa27ad368deb6a (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org> Date: Mon, 11 Sep 2023 19:03:25 +0200 optional tag handling improvements Much better handling for the optional tags: <p>, <dd>, <dt>, <dl>. An example page: https://www.openbsd.org/policy.html Some tags to add: - aside - menu - address - details Maybe: - search - hgroup Diffstat: M webdump.c | 105 +++++++++++++++++++++++-------- 1 file changed, 78 insertions(+), 27 deletions(-) --- (DIR) diff --git a/webdump.c b/webdump.c @@ -78,7 +78,8 @@ enum DisplayType { DisplayTable = 1 << 9, DisplayTableRow = 1 << 10, DisplayTableCell = 1 << 11, - DisplayHeader = 1 << 12 + DisplayHeader = 1 << 12, + DisplayDl = 1 << 13 }; /* ANSI markup */ @@ -222,7 +223,7 @@ static struct tag tags[] = { { "dd", DisplayBlock, 0, 0, 0, 1, 0, 0, 4 }, { "del", DisplayInline, MarkupStrike, 0, 0, 0, 0, 0, 0 }, { "div", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 }, -{ "dl", DisplayInline, 0, 0, 0, 0, 0, 0, 0 }, +{ "dl", DisplayBlock|DisplayDl, 0, 0, 0, 0, 0, 0, 0 }, { "dt", DisplayBlock, MarkupBold, 0, 0, 1, 0, 0, 0 }, { "em", DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 }, { "embed", DisplayInline, 0, 0, 1, 0, 0, 0, 0 }, @@ -1600,8 +1601,9 @@ static void xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) { struct tag *found, *tag; - const char *child; - int i, j, parenttype; + char *child, *childs[16]; + size_t nchilds; + int i, j, k, nchildfound, parenttype; /* ignore closing of void elements, like </br>, which is not allowed */ if ((found = findtag(t))) { @@ -1614,31 +1616,48 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */ child = NULL; + nchilds = 0; + nchildfound = 0; parenttype = 0; if (found && found->displaytype & DisplayPre) { skipinitialws = 0; /* do not skip white-space, for margins */ } else if (found && found->displaytype & DisplayList) { - child = "li"; + childs[0] = "li"; + nchilds = 1; parenttype = DisplayList; } else if (found && found->displaytype & DisplayTableRow) { - child = "td"; + childs[0] = "td"; + nchilds = 1; parenttype = DisplayTableRow; } else if (found && found->displaytype & DisplayTable) { - child = "td"; + childs[0] = "td"; + nchilds = 1; parenttype = DisplayTable; + } else if (found && found->displaytype & DisplayDl) { + childs[0] = "p"; + childs[1] = "dd"; + childs[2] = "dt"; + nchilds = 3; + parenttype = DisplayDl; } - if (child && parenttype) { + if (nchilds > 0) { for (i = curnode; i >= 0; i--) { - if ((nodes[i].tag.displaytype & parenttype)) + if (nchildfound) break; - if (!tagcmp(nodes[i].tag.name, child)) { - /* fake closing the previous tags */ - for (j = curnode; j >= i; j--) - endnode(&nodes[j]); - curnode = j; + if ((nodes[i].tag.displaytype & parenttype)) break; + for (j = 0; j < nchilds; j++) { + child = childs[j]; + if (!tagcmp(nodes[i].tag.name, child)) { + /* fake closing the previous tags */ + for (k = curnode; k >= i; k--) + endnode(&nodes[k]); + curnode = k; + nchildfound = 1; + break; + } } } } @@ -1685,9 +1704,10 @@ xmltagstart(XMLParser *p, const char *t, size_t tl) { struct tag *found; struct node *cur; - const char *child; + char *child, *childs[16]; + size_t nchilds; char *s; - int i, j, parenttype; + int i, j, k, nchildfound, parenttype; if (curnode >= MAX_DEPTH - 2) errx(1, "max tag depth reached: %d\n", curnode); @@ -1711,38 +1731,69 @@ xmltagstart(XMLParser *p, const char *t, size_t tl) https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */ child = NULL; + nchilds = 0; + nchildfound = 0; parenttype = 0; - /* if optional tag <p> is open and a block element is found, close </p>. */ + /* if optional tag <p> is open and a list element is found, close </p>. */ if (found && found->displaytype & DisplayList) { /* not inside a list */ - child = "p"; + childs[0] = "p"; + nchilds = 1; parenttype = DisplayList; } else if (found && found->isoptional) { if (!tagcmp(t, "li")) { - child = "li"; + childs[0] = "li"; + nchilds = 1; parenttype = DisplayList; } else if (!tagcmp(t, "td")) { - child = "td"; + childs[0] = "td"; + nchilds = 1; parenttype = DisplayTableRow; } else if (!tagcmp(t, "tr")) { - child = "tr"; + childs[0] = "tr"; + nchilds = 1; parenttype = DisplayTable; + } else if (!tagcmp(t, "p")) { + childs[0] = "p"; + nchilds = 1; + parenttype = 0; /* seek until the root */ + } else if (!tagcmp(t, "dt")) { + childs[0] = "dd"; + nchilds = 1; + parenttype = 0; /* seek until the root */ + } else if (!tagcmp(t, "dd")) { + childs[0] = "dd"; + childs[1] = "dt"; + nchilds = 2; + parenttype = 0; /* seek until the root */ } else if (!tagcmp(t, cur->tag.name)) { /* fake closing the previous tag if it is the same and repeated */ xmltagend(p, t, tl, 0); } + } else if (found && found->displaytype & DisplayBlock) { + /* check if we have an open "<p>" tag */ + childs[0] = "p"; + childs[1] = "dl"; + nchilds = 2; + parenttype = 0; /* seek until the root */ } - if (child && parenttype) { + if (nchilds > 0) { for (i = curnode; i >= 0; i--) { - if ((nodes[i].tag.displaytype & parenttype)) + if (nchildfound) break; - if (!tagcmp(nodes[i].tag.name, child)) { - /* fake closing the previous tags */ - for (j = curnode; j >= i; j--) - xmltagend(p, nodes[j].tag.name, strlen(nodes[j].tag.name), 0); + if ((nodes[i].tag.displaytype & parenttype)) break; + for (j = 0; j < nchilds; j++) { + child = childs[j]; + if (!tagcmp(nodes[i].tag.name, child)) { + /* fake closing the previous tags */ + for (k = curnode; k >= i; k--) + xmltagend(p, nodes[k].tag.name, strlen(nodes[k].tag.name), 0); + nchildfound = 1; + break; + } } } }