improve forms a bit - webdump - HTML to plain-text converter for webpages
 (HTM) git clone git://git.codemadness.org/webdump
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 630f76162a192327a3eecd4fc0adcb9b31cd4504
 (DIR) parent 0705fb754f00c7866b2cc8cee0739a88a584a2e1
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Fri,  8 Sep 2023 15:05:38 +0200
       
       improve forms a bit
       
       - Treat fieldset and legend as block elements.
       - Support more input types; a missing or unsupported type is treated as "text".
       - Show the default checked state for radio buttons and checkboxes.
       - Don't show hidden input types.
       - Add a DisplayType DisplayInput to check the tag faster.
       
       Diffstat:
         M webdump.c                           |      64 +++++++++++++++++++++----------
       
       1 file changed, 44 insertions(+), 20 deletions(-)
       ---
 (DIR) diff --git a/webdump.c b/webdump.c
       @@ -68,16 +68,17 @@ enum DisplayType {
                DisplayUnknown     = 0,
                DisplayInline      = 1 << 0,
                DisplayInlineBlock = 1 << 1, /* unused for now */
       -        DisplayBlock       = 1 << 2,
       -        DisplayNone        = 1 << 3,
       -        DisplayPre         = 1 << 4,
       -        DisplayList        = 1 << 5,
       -        DisplayListOrdered = 1 << 6,
       -        DisplayListItem    = 1 << 7,
       -        DisplayTable       = 1 << 8,
       -        DisplayTableRow    = 1 << 9,
       -        DisplayTableCell   = 1 << 10,
       -        DisplayHeader      = 1 << 11
       +        DisplayInput       = 1 << 2,
       +        DisplayBlock       = 1 << 3,
       +        DisplayNone        = 1 << 4,
       +        DisplayPre         = 1 << 5,
       +        DisplayList        = 1 << 6,
       +        DisplayListOrdered = 1 << 7,
       +        DisplayListItem    = 1 << 8,
       +        DisplayTable       = 1 << 9,
       +        DisplayTableRow    = 1 << 10,
       +        DisplayTableCell   = 1 << 11,
       +        DisplayHeader      = 1 << 12
        };
        
        /* ANSI markup */
       @@ -143,7 +144,9 @@ struct selectors {
        };
        
        static const char *str_bullet_item = "* ";
       +static const char *str_checkbox_checked = "x";
        static const char *str_ruler = "-";
       +static const char *str_radio_checked = "*";
        
        /* base href, to make URLs absolute */
        static char *basehref = "";
       @@ -153,6 +156,7 @@ static struct uri base;
        
        /* buffers for some attributes of the current tag */
        String attr_alt; /* alt attribute */
       +String attr_checked; /* checked attribute */
        String attr_class; /* class attribute */
        String attr_href; /* href attribute */
        String attr_id; /* id attribute */
       @@ -221,6 +225,7 @@ static struct tag tags[] = {
        { "dt",         DisplayBlock,                     MarkupBold,      0,               0, 1, 0, 0, 0 },
        { "em",         DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
        { "embed",      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       +{ "fieldset",   DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
        { "figcaption", DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
        { "figure",     DisplayBlock,                     0,               0,               0, 0, 1, 1, 4 },
        { "footer",     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       @@ -236,8 +241,9 @@ static struct tag tags[] = {
        { "html",       DisplayBlock,                     0,               0,               0, 1, 0, 0, 0 },
        { "i",          DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
        { "img",        DisplayInline,                    MarkupUnderline, 0,               1, 0, 0, 0, 0 },
       -{ "input",      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       +{ "input",      DisplayInput,                     0,               0,               1, 0, 0, 0, 0 },
        { "label",      DisplayInline,                    MarkupBold,      0,               0, 0, 0, 0, 0 },
       +{ "legend",     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
        { "li",         DisplayListItem,                  0,               DisplayList,     0, 1, 0, 0, 0 },
        { "link",       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
        { "main",       DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       @@ -1684,6 +1690,7 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
                cur = &nodes[curnode];
        
                string_clear(&attr_alt);
       +        string_clear(&attr_checked);
                string_clear(&attr_class);
                string_clear(&attr_href);
                string_clear(&attr_id);
       @@ -1891,18 +1898,23 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
                if (!tagcmp(cur->tag.name, "input")) {
                        if (!attr_type.len) {
                                hprintf("[%-15s]", attr_value.len ? attr_value.data : ""); /* default: text */
       -                } else if (!strcasecmp(attr_type.data, "text")) {
       -                        hprintf("[%-15s]", attr_value.len ? attr_value.data : ""); /* text */
       -                } else if (!strcasecmp(attr_type.data, "search")) {
       -                        hprintf("[%-15s]", attr_value.len ? attr_value.data : "");
       -                } else if (!strcasecmp(attr_type.data, "button")) {
       -                        hprintf("[%s]", attr_value.len ? attr_value.data : "");
       -                } else if (!strcasecmp(attr_type.data, "submit")) {
       +                } else if (!strcasecmp(attr_type.data, "button") ||
       +                           !strcasecmp(attr_type.data, "submit") ||
       +                           !strcasecmp(attr_type.data, "reset")) {
                                hprintf("[%s]", attr_value.len ? attr_value.data : "");
                        } else if (!strcasecmp(attr_type.data, "checkbox")) {
       -                        hprint("[ ]"); /* TODO: show x or unicode checkmark when selected? */
       +                        hprintf("[%s]",
       +                                attr_checked.len &&
       +                                !strcasecmp(attr_checked.data, "checked") ? str_checkbox_checked : " ");
                        } else if (!strcasecmp(attr_type.data, "radio")) {
       -                        hprint("( )"); /* TODO: show x or unicode checkmark when selected? */
       +                        hprintf("[%s]",
       +                                attr_checked.len &&
       +                                !strcasecmp(attr_checked.data, "checked") ? str_radio_checked : " ");
       +                } else if (!strcasecmp(attr_type.data, "hidden")) {
       +                        cur->tag.displaytype |= DisplayNone;
       +                } else {
       +                        /* unrecognized / default case is text */
       +                        hprintf("[%-15s]", attr_value.len ? attr_value.data : "");
                        }
                }
        
       @@ -1963,6 +1975,8 @@ xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
                if (!tagcmp(tag, "img") && !attrcmp(name, "alt"))
                        string_append(&attr_alt, value, valuelen);
        
       +        if (!attrcmp(name, "checked"))
       +                string_append(&attr_checked, value, valuelen);
                if (!attrcmp(name, "type"))
                        string_append(&attr_type, value, valuelen);
                if (!attrcmp(name, "value"))
       @@ -1987,10 +2001,18 @@ static void
        xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n,
                size_t nl)
        {
       +        struct node *cur;
       +
       +        cur = &nodes[curnode];
       +
                /* set base URL, if it is set it cannot be overwritten again */
                if (!basehrefset && basehrefdoc[0] &&
                    !attrcmp(n, "href") && !tagcmp(t, "base"))
                        basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : 0;
       +
       +        /* if attribute checked is set but it has no value then set it to "checked" */
       +        if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked") && !attr_checked.len)
       +                string_append(&attr_checked, "checked", sizeof("checked") - 1);
        }
        
        static void
       @@ -1999,6 +2021,8 @@ xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n,
        {
                if (!attrcmp(n, "alt"))
                        string_clear(&attr_alt);
       +        else if (!attrcmp(n, "checked"))
       +                string_clear(&attr_checked);
                else if (!attrcmp(n, "class"))
                        string_clear(&attr_class);
                else if (!attrcmp(n, "href"))