optional tag handling improvements - webdump - HTML to plain-text converter for webpages
 (HTM) git clone git://git.codemadness.org/webdump
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 2e32abeb2743e5fce55bdfc1591bb66eedd63a45
 (DIR) parent 9f4c3a0a47eb2bb127db5a270dfa27ad368deb6a
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Mon, 11 Sep 2023 19:03:25 +0200
       
       optional tag handling improvements
       
       Much better handling of elements whose end tags are optional (<p>, <dd>, <dt>), including inside their <dl> container.
       
       An example page:
       
       https://www.openbsd.org/policy.html
       
       Some tags to add:
       
       - aside
       - menu
       - address
       - details
       
       Maybe:
       - search
       - hgroup
       
       Diffstat:
         M webdump.c                           |     105 +++++++++++++++++++++++--------
       
       1 file changed, 78 insertions(+), 27 deletions(-)
       ---
 (DIR) diff --git a/webdump.c b/webdump.c
       @@ -78,7 +78,8 @@ enum DisplayType {
                DisplayTable       = 1 << 9,
                DisplayTableRow    = 1 << 10,
                DisplayTableCell   = 1 << 11,
       -        DisplayHeader      = 1 << 12
       +        DisplayHeader      = 1 << 12,
       +        DisplayDl          = 1 << 13
        };
        
        /* ANSI markup */
       @@ -222,7 +223,7 @@ static struct tag tags[] = {
        { "dd",         DisplayBlock,                     0,               0,               0, 1, 0, 0, 4 },
        { "del",        DisplayInline,                    MarkupStrike,    0,               0, 0, 0, 0, 0 },
        { "div",        DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "dl",         DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
       +{ "dl",         DisplayBlock|DisplayDl,           0,               0,               0, 0, 0, 0, 0 },
        { "dt",         DisplayBlock,                     MarkupBold,      0,               0, 1, 0, 0, 0 },
        { "em",         DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
        { "embed",      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       @@ -1600,8 +1601,9 @@ static void
        xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
        {
                struct tag *found, *tag;
       -        const char *child;
       -        int i, j, parenttype;
       +        char *child, *childs[16];
       +        size_t nchilds;
       +        int i, j, k, nchildfound, parenttype;
        
                /* ignore closing of void elements, like </br>, which is not allowed */
                if ((found = findtag(t))) {
       @@ -1614,31 +1616,48 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
                   https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */
        
                child = NULL;
       +        nchilds = 0;
       +        nchildfound = 0;
                parenttype = 0;
        
                if (found && found->displaytype & DisplayPre) {
                        skipinitialws = 0; /* do not skip white-space, for margins */
                } else if (found && found->displaytype & DisplayList) {
       -                child = "li";
       +                childs[0] = "li";
       +                nchilds = 1;
                        parenttype = DisplayList;
                } else if (found && found->displaytype & DisplayTableRow) {
       -                child = "td";
       +                childs[0] = "td";
       +                nchilds = 1;
                        parenttype = DisplayTableRow;
                } else if (found && found->displaytype & DisplayTable) {
       -                child = "td";
       +                childs[0] = "td";
       +                nchilds = 1;
                        parenttype = DisplayTable;
       +        } else if (found && found->displaytype & DisplayDl) {
       +                childs[0] = "p";
       +                childs[1] = "dd";
       +                childs[2] = "dt";
       +                nchilds = 3;
       +                parenttype = DisplayDl;
                }
        
       -        if (child && parenttype) {
       +        if (nchilds > 0) {
                        for (i = curnode; i >= 0; i--) {
       -                        if ((nodes[i].tag.displaytype & parenttype))
       +                        if (nchildfound)
                                        break;
       -                        if (!tagcmp(nodes[i].tag.name, child)) {
       -                                /* fake closing the previous tags */
       -                                for (j = curnode; j >= i; j--)
       -                                        endnode(&nodes[j]);
       -                                curnode = j;
       +                        if ((nodes[i].tag.displaytype & parenttype))
                                        break;
       +                        for (j = 0; j < nchilds; j++) {
       +                                child = childs[j];
       +                                if (!tagcmp(nodes[i].tag.name, child)) {
       +                                        /* fake closing the previous tags */
       +                                        for (k = curnode; k >= i; k--)
       +                                                endnode(&nodes[k]);
       +                                        curnode = k;
       +                                        nchildfound = 1;
       +                                        break;
       +                                }
                                }
                        }
                }
       @@ -1685,9 +1704,10 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
        {
                struct tag *found;
                struct node *cur;
       -        const char *child;
       +        char *child, *childs[16];
       +        size_t nchilds;
                char *s;
       -        int i, j, parenttype;
       +        int i, j, k, nchildfound, parenttype;
        
                if (curnode >= MAX_DEPTH - 2)
                        errx(1, "max tag depth reached: %d\n", curnode);
       @@ -1711,38 +1731,69 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
                   https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */
        
                child = NULL;
       +        nchilds = 0;
       +        nchildfound = 0;
                parenttype = 0;
        
       -        /* if optional tag <p> is open and a block element is found, close </p>. */
       +        /* if optional tag <p> is open and a list element is found, close </p>. */
                if (found && found->displaytype & DisplayList) {
                        /* not inside a list */
       -                child = "p";
       +                childs[0] = "p";
       +                nchilds = 1;
                        parenttype = DisplayList;
                } else if (found && found->isoptional) {
                        if (!tagcmp(t, "li")) {
       -                        child = "li";
       +                        childs[0] = "li";
       +                        nchilds = 1;
                                parenttype = DisplayList;
                        } else if (!tagcmp(t, "td")) {
       -                        child = "td";
       +                        childs[0] = "td";
       +                        nchilds = 1;
                                parenttype = DisplayTableRow;
                        } else if (!tagcmp(t, "tr")) {
       -                        child = "tr";
       +                        childs[0] = "tr";
       +                        nchilds = 1;
                                parenttype = DisplayTable;
       +                } else if (!tagcmp(t, "p")) {
       +                        childs[0] = "p";
       +                        nchilds = 1;
       +                        parenttype = 0; /* seek until the root */
       +                } else if (!tagcmp(t, "dt")) {
       +                        childs[0] = "dd";
       +                        nchilds = 1;
       +                        parenttype = 0; /* seek until the root */
       +                } else if (!tagcmp(t, "dd")) {
       +                        childs[0] = "dd";
       +                        childs[1] = "dt";
       +                        nchilds = 2;
       +                        parenttype = 0; /* seek until the root */
                        } else if (!tagcmp(t, cur->tag.name)) {
                                /* fake closing the previous tag if it is the same and repeated */
                                xmltagend(p, t, tl, 0);
                        }
       +        } else if (found && found->displaytype & DisplayBlock) {
       +                /* check if we have an open "<p>" tag */
       +                childs[0] = "p";
       +                childs[1] = "dl";
       +                nchilds = 2;
       +                parenttype = 0; /* seek until the root */
                }
        
       -        if (child && parenttype) {
       +        if (nchilds > 0) {
                        for (i = curnode; i >= 0; i--) {
       -                        if ((nodes[i].tag.displaytype & parenttype))
       +                        if (nchildfound)
                                        break;
       -                        if (!tagcmp(nodes[i].tag.name, child)) {
       -                                /* fake closing the previous tags */
       -                                for (j = curnode; j >= i; j--)
       -                                        xmltagend(p, nodes[j].tag.name, strlen(nodes[j].tag.name), 0);
       +                        if ((nodes[i].tag.displaytype & parenttype))
                                        break;
       +                        for (j = 0; j < nchilds; j++) {
       +                                child = childs[j];
       +                                if (!tagcmp(nodes[i].tag.name, child)) {
       +                                        /* fake closing the previous tags */
       +                                        for (k = curnode; k >= i; k--)
       +                                                xmltagend(p, nodes[k].tag.name, strlen(nodes[k].tag.name), 0);
       +                                        nchildfound = 1;
       +                                        break;
       +                                }
                                }
                        }
                }