various improvements - webdump - HTML to plain-text converter for webpages
 (HTM) git clone git://git.codemadness.org/webdump
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 89c9108dc27fe27e0f028f67508a1156ed242d2a
 (DIR) parent 62884d7b5684e791bb0cd6466f74367d6d71618d
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Thu, 14 Sep 2023 22:31:03 +0200
       
       various improvements
       
       - add a unique tagid number per tag. This allows checking by tag number.
       - add support for the link reference <frame>, <iframe>, <embed src>.
       - improve checking for open optional <p> tags when a block element (such as
         <section>) is open.
       - check if the base URI using the -b option is absolute.
       
       Diffstat:
         M webdump.1                           |       3 ++-
         M webdump.c                           |     430 +++++++++++++++++--------------
       
       2 files changed, 245 insertions(+), 188 deletions(-)
       ---
 (DIR) diff --git a/webdump.1 b/webdump.1
       @@ -1,4 +1,4 @@
       -.Dd September 12, 2023
       +.Dd September 14, 2023
        .Dt WEBDUMP 1
        .Os
        .Sh NAME
       @@ -18,6 +18,7 @@ It converts and writes the output as plain-text to stdout.
        A
        .Ar baseurl
        can be specified if the links in the feed are relative URLs.
       +This must be an absolute URI.
        .Bl -tag -width Ds
        .It Fl 8
        Use UTF-8 symbols for certain items like bullet items and rulers to make the
 (DIR) diff --git a/webdump.c b/webdump.c
       @@ -53,19 +53,6 @@ static int termwidth     = 77; /* terminal width */
        static int resources     = 0;  /* write resources line-by-line to fd 3? */
        static int uniqrefs      = 0;  /* number unique references */
        
       -/* linked-list of link references */
       -struct linkref {
       -        char *type;
       -        char *url;
       -        int ishidden;
       -        size_t linknr;
       -        struct linkref *next;
       -};
       -
       -static struct linkref *links_head;
       -static struct linkref *links_cur;
       -static int linkcount; /* visible link count */
       -
        enum DisplayType {
                DisplayUnknown     = 0,
                DisplayInline      = 1 << 0,
       @@ -106,8 +93,22 @@ typedef struct string {
                size_t  bufsiz; /* allocated size */
        } String;
        
       +enum TagId { TagA = 1, TagAddress, TagArea, TagArticle, TagAside, TagAudio,
       +TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButton, TagCite,
       +TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDetails, TagDfn, TagDir,
       +TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFieldset, TagFigcaption, TagFigure,
       +TagFooter, TagForm, TagFrame, TagH1, TagH2, TagH3, TagH4, TagH5, TagH6,
       +TagHead, TagHeader, TagHr, TagHtml, TagI, TagIframe, TagImg, TagInput, TagIns,
       +TagLabel, TagLegend, TagLi, TagLink, TagMain, TagMark, TagMenu, TagMeta,
       +TagNav, TagObject, TagOl, TagOption, TagP, TagParam, TagPre, TagS, TagScript,
       +TagSearch, TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle,
       +TagSummary, TagTable, TagTbody, TagTd, TagTemplate, TagTextarea, TagTfoot,
       +TagTh, TagThead, TagTitle, TagTr, TagTrack, TagU, TagUl, TagVar, TagVideo,
       +TagWbr, TagXmp };
       +
        struct tag {
                const char *name;
       +        enum TagId id;
                enum DisplayType displaytype;
                enum MarkupType markuptype; /* ANSI markup */
                enum DisplayType parenttype; /* display type belonging to element */
       @@ -150,6 +151,20 @@ struct selectors {
                size_t count;
        };
        
       +/* linked-list of link references */
       +struct linkref {
       +        char *type;
       +        enum TagId tagid;
       +        char *url;
       +        int ishidden;
       +        size_t linknr;
       +        struct linkref *next;
       +};
       +
       +static struct linkref *links_head;
       +static struct linkref *links_cur;
       +static int linkcount; /* visible link count */
       +
        static const char *str_bullet_item = "* ";
        static const char *str_checkbox_checked = "x";
        static const char *str_ruler = "-";
       @@ -212,96 +227,100 @@ static enum MarkupType curmarkup;
        /* selector to match */
        static struct selectors *sel_hide, *sel_show;
        
       -/* tag          displaytype                       markup           parent           v  o  b  a  i */
       +/* tags table: needs to be sorted like tagcmp(), alphabetically */
       +
       +/* tag          id             displaytype                       markup           parent           v  o  b  a  i */
        static struct tag tags[] = {
       -{ "a",          DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
       -{ "address",    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "area",       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       -{ "article",    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "aside",      DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "audio",      DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
       -{ "b",          DisplayInline,                    MarkupBold,      0,               0, 0, 0, 0, 0 },
       -{ "base",       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       -{ "blink",      DisplayInline,                    MarkupBlink,     0,               0, 0, 0, 0, 0 },
       -{ "blockquote", DisplayBlock,                     0,               0,               0, 0, 0, 0, 2 },
       -{ "body",       DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "br",         0,                                0,               0,               1, 0, 0, 0, 0 },
       -{ "button",     DisplayInline | DisplayButton,    0,               0,               0, 0, 0, 0, 0 },
       -{ "cite",       DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
       -{ "col",        DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       -{ "colgroup",   DisplayInline,                    0,               0,               0, 1, 0, 0, 0 },
       -{ "datalist",   DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
       -{ "dd",         DisplayBlock,                     0,               0,               0, 1, 0, 0, 4 },
       -{ "del",        DisplayInline,                    MarkupStrike,    0,               0, 0, 0, 0, 0 },
       -{ "details",    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "dfn",        DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
       -{ "dir",        DisplayList,                      0,               0,               0, 0, 1, 1, 2 },
       -{ "div",        DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "dl",         DisplayBlock | DisplayDl,         0,               0,               0, 0, 0, 0, 0 },
       -{ "dt",         DisplayBlock,                     MarkupBold,      0,               0, 1, 0, 0, 0 },
       -{ "em",         DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
       -{ "embed",      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       -{ "fieldset",   DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "figcaption", DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "figure",     DisplayBlock,                     0,               0,               0, 0, 1, 1, 4 },
       -{ "footer",     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "form",       DisplayBlock,                     0,               0,               0, 0, 0, 1, 0 },
       -{ "h1",         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
       -{ "h2",         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
       -{ "h3",         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
       -{ "h4",         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
       -{ "h5",         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
       -{ "h6",         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
       -{ "head",       DisplayBlock,                     0,               0,               0, 1, 0, 0, 0 },
       -{ "header",     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "hr",         DisplayBlock,                     0,               0,               1, 0, 0, 0, 0 },
       -{ "html",       DisplayBlock,                     0,               0,               0, 1, 0, 0, 0 },
       -{ "i",          DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
       -{ "img",        DisplayInline,                    MarkupUnderline, 0,               1, 0, 0, 0, 0 },
       -{ "input",      DisplayInput,                     0,               0,               1, 0, 0, 0, 0 },
       -{ "ins",        DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
       -{ "label",      DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
       -{ "legend",     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "li",         DisplayListItem,                  0,               DisplayList,     0, 1, 0, 0, 0 },
       -{ "link",       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       -{ "main",       DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "mark",       DisplayInline,                    MarkupReverse,   0,               0, 0, 0, 0, 0 },
       -{ "menu",       DisplayList,                      0,               0,               0, 0, 1, 1, 2 },
       -{ "meta",       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       -{ "nav",        DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "object",     DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
       -{ "ol",         DisplayList | DisplayListOrdered, 0,               0,               0, 0, 1, 1, 0 },
       -{ "option",     DisplayInline | DisplayOption,    0,               0,               0, 1, 0, 0, 0 },
       -{ "p",          DisplayBlock,                     0,               0,               0, 1, 1, 1, 0 },
       -{ "param",      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       -{ "pre",        DisplayPre,                       0,               0,               0, 0, 1, 1, 4 },
       -{ "s",          DisplayInline,                    MarkupStrike,    0,               0, 0, 0, 0, 0 },
       -{ "search",     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "script",     DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
       -{ "section",    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "select",     DisplayInline | DisplaySelect,    0,               0,               0, 0, 0, 0, 0 },
       -{ "source",     DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       -{ "strike",     DisplayInline,                    MarkupStrike,    0,               0, 0, 0, 0, 0 },
       -{ "strong",     DisplayInline,                    MarkupBold,      0,               0, 0, 0, 0, 0 },
       -{ "style",      DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
       -{ "summary",    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "table",      DisplayTable,                     0,               0,               0, 0, 0, 0, 0 },
       -{ "tbody",      DisplayInline,                    0,               DisplayTable,    0, 1, 0, 0, 0 },
       -{ "td",         DisplayTableCell,                 0,               DisplayTableRow, 0, 1, 0, 0, 0 },
       -{ "template",   DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
       -{ "textarea",   DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
       -{ "tfoot",      DisplayInline,                    0,               DisplayTable,    0, 1, 0, 0, 0 },
       -{ "th",         DisplayTableCell,                 MarkupBold,      DisplayTableRow, 0, 1, 0, 0, 0 },
       -{ "thead",      DisplayInline,                    0,               DisplayTable,    0, 1, 0, 0, 0 },
       -{ "title",      DisplayBlock,                     0,               0,               0, 0, 0, 1, -DEFAULT_INDENT },
       -{ "tr",         DisplayTableRow,                  0,               DisplayTable,    0, 1, 0, 0, 0 },
       -{ "track",      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       -{ "u",          DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
       -{ "ul",         DisplayList,                      0,               0,               0, 0, 1, 1, 2 },
       -{ "var",        DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
       -{ "video",      DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
       -{ "wbr",        DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       -{ "xmp",        DisplayPre,                       0,               0,               0, 0, 1, 1, 4 }
       +{ "a",          TagA,          DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
       +{ "address",    TagAddress,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       +{ "area",       TagArea,       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       +{ "article",    TagArticle,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       +{ "aside",      TagAside,      DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       +{ "audio",      TagAudio,      DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
       +{ "b",          TagB,          DisplayInline,                    MarkupBold,      0,               0, 0, 0, 0, 0 },
       +{ "base",       TagBase,       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       +{ "blink",      TagBlink,      DisplayInline,                    MarkupBlink,     0,               0, 0, 0, 0, 0 },
       +{ "blockquote", TagBlockquote, DisplayBlock,                     0,               0,               0, 0, 0, 0, 2 },
       +{ "body",       TagBody,       DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       +{ "br",         TagBr,         0,                                0,               0,               1, 0, 0, 0, 0 },
       +{ "button",     TagButton,     DisplayInline | DisplayButton,    0,               0,               0, 0, 0, 0, 0 },
       +{ "cite",       TagCite,       DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
       +{ "col",        TagCol,        DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       +{ "colgroup",   TagColgroup,   DisplayInline,                    0,               0,               0, 1, 0, 0, 0 },
       +{ "datalist",   TagDatalist,   DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
       +{ "dd",         TagDd,         DisplayBlock,                     0,               0,               0, 1, 0, 0, 4 },
       +{ "del",        TagDel,        DisplayInline,                    MarkupStrike,    0,               0, 0, 0, 0, 0 },
       +{ "details",    TagDetails,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       +{ "dfn",        TagDfn,        DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
       +{ "dir",        TagDir,        DisplayList,                      0,               0,               0, 0, 1, 1, 2 },
       +{ "div",        TagDiv,        DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       +{ "dl",         TagDl,         DisplayBlock | DisplayDl,         0,               0,               0, 0, 0, 0, 0 },
       +{ "dt",         TagDt,         DisplayBlock,                     MarkupBold,      0,               0, 1, 0, 0, 0 },
       +{ "em",         TagEm,         DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
       +{ "embed",      TagEmbed,      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       +{ "fieldset",   TagFieldset,   DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       +{ "figcaption", TagFigcaption, DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       +{ "figure",     TagFigure,     DisplayBlock,                     0,               0,               0, 0, 1, 1, 4 },
       +{ "footer",     TagFooter,     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       +{ "form",       TagForm,       DisplayBlock,                     0,               0,               0, 0, 0, 1, 0 },
       +{ "frame",      TagFrame,      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       +{ "h1",         TagH1,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
       +{ "h2",         TagH2,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
       +{ "h3",         TagH3,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
       +{ "h4",         TagH4,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
       +{ "h5",         TagH5,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
       +{ "h6",         TagH6,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
       +{ "head",       TagHead,       DisplayBlock,                     0,               0,               0, 1, 0, 0, 0 },
       +{ "header",     TagHeader,     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       +{ "hr",         TagHr,         DisplayBlock,                     0,               0,               1, 0, 0, 0, 0 },
       +{ "html",       TagHtml,       DisplayBlock,                     0,               0,               0, 1, 0, 0, 0 },
       +{ "i",          TagI,          DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
       +{ "iframe",     TagIframe,     DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
       +{ "img",        TagImg,        DisplayInline,                    MarkupUnderline, 0,               1, 0, 0, 0, 0 },
       +{ "input",      TagInput,      DisplayInput,                     0,               0,               1, 0, 0, 0, 0 },
       +{ "ins",        TagIns,        DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
       +{ "label",      TagLabel,      DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
       +{ "legend",     TagLegend,     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       +{ "li",         TagLi,         DisplayListItem,                  0,               DisplayList,     0, 1, 0, 0, 0 },
       +{ "link",       TagLink,       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       +{ "main",       TagMain,       DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       +{ "mark",       TagMark,       DisplayInline,                    MarkupReverse,   0,               0, 0, 0, 0, 0 },
       +{ "menu",       TagMenu,       DisplayList,                      0,               0,               0, 0, 1, 1, 2 },
       +{ "meta",       TagMeta,       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       +{ "nav",        TagNav,        DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       +{ "object",     TagObject,     DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
       +{ "ol",         TagOl,         DisplayList | DisplayListOrdered, 0,               0,               0, 0, 1, 1, 0 },
       +{ "option",     TagOption,     DisplayInline | DisplayOption,    0,               0,               0, 1, 0, 0, 0 },
       +{ "p",          TagP,          DisplayBlock,                     0,               0,               0, 1, 1, 1, 0 },
       +{ "param",      TagParam,      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       +{ "pre",        TagPre,        DisplayPre,                       0,               0,               0, 0, 1, 1, 4 },
       +{ "s",          TagS,          DisplayInline,                    MarkupStrike,    0,               0, 0, 0, 0, 0 },
       +{ "script",     TagScript,     DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
       +{ "search",     TagSearch,     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       +{ "section",    TagSection,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       +{ "select",     TagSelect,     DisplayInline | DisplaySelect,    0,               0,               0, 0, 0, 0, 0 },
       +{ "source",     TagSource,     DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       +{ "strike",     TagStrike,     DisplayInline,                    MarkupStrike,    0,               0, 0, 0, 0, 0 },
       +{ "strong",     TagStrong,     DisplayInline,                    MarkupBold,      0,               0, 0, 0, 0, 0 },
       +{ "style",      TagStyle,      DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
       +{ "summary",    TagSummary,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
       +{ "table",      TagTable,      DisplayTable,                     0,               0,               0, 0, 0, 0, 0 },
       +{ "tbody",      TagTbody,      DisplayInline,                    0,               DisplayTable,    0, 1, 0, 0, 0 },
       +{ "td",         TagTd,         DisplayTableCell,                 0,               DisplayTableRow, 0, 1, 0, 0, 0 },
       +{ "template",   TagTemplate,   DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
       +{ "textarea",   TagTextarea,   DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
       +{ "tfoot",      TagTfoot,      DisplayInline,                    0,               DisplayTable,    0, 1, 0, 0, 0 },
       +{ "th",         TagTh,         DisplayTableCell,                 MarkupBold,      DisplayTableRow, 0, 1, 0, 0, 0 },
       +{ "thead",      TagThead,      DisplayInline,                    0,               DisplayTable,    0, 1, 0, 0, 0 },
       +{ "title",      TagTitle,      DisplayBlock,                     0,               0,               0, 0, 0, 1, -DEFAULT_INDENT },
       +{ "tr",         TagTr,         DisplayTableRow,                  0,               DisplayTable,    0, 1, 0, 0, 0 },
       +{ "track",      TagTrack,      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       +{ "u",          TagU,          DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
       +{ "ul",         TagUl,         DisplayList,                      0,               0,               0, 0, 1, 1, 2 },
       +{ "var",        TagVar,        DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
       +{ "video",      TagVideo,      DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
       +{ "wbr",        TagWbr,        DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
       +{ "xmp",        TagXmp,        DisplayPre,                       0,               0,               0, 0, 1, 1, 4 }
        };
        
        /* hint for compilers and static analyzers that a function exits */
       @@ -1374,9 +1393,10 @@ findlinkref(const char *url)
        }
        
        static struct linkref *
       -addlinkref(const char *url, const char *_type, int ishidden, int linknr)
       +addlinkref(const char *url, const char *_type, enum TagId tagid, int ishidden,
       +        int linknr)
        {
       -        if (!tagcmp(_type, "a"))
       +        if (tagid == TagA)
                        _type = "link";
        
                /* add to linked list */
       @@ -1386,6 +1406,7 @@ addlinkref(const char *url, const char *_type, int ishidden, int linknr)
                        links_cur = links_cur->next = ecalloc(1, sizeof(*links_head));
                links_cur->url = estrdup(url);
                links_cur->type = estrdup(_type);
       +        links_cur->tagid = tagid;
                links_cur->ishidden = ishidden;
                links_cur->linknr = linknr;
        
       @@ -1441,7 +1462,7 @@ handleinlinelink(void)
                /* add hidden links directly to the reference,
                   the order doesn't matter */
                if (cur->tag.displaytype & DisplayNone)
       -                addlinkref(url, cur->tag.name, 1, 0);
       +                addlinkref(url, cur->tag.name, cur->tag.id, 1, 0);
        }
        
        void
       @@ -1658,7 +1679,7 @@ endnode(struct node *cur)
                        if (!ref) {
                                linkcount++;
                                ref = addlinkref(nodes_links[curnode].data,
       -                                cur->tag.name, ishidden, linkcount);
       +                                cur->tag.name, cur->tag.id, ishidden, linkcount);
                        }
        
                        if (showrefinline || showurlinline) {
       @@ -1669,7 +1690,7 @@ endnode(struct node *cur)
                        if (showrefinline)
                                hprintf("[%zu]", ref->linknr);
                        if (showurlinline) {
       -                        if (!tagcmp("link", ref->type))
       +                        if (ref->tagid == TagA)
                                        hprintf("[%s]", ref->url);
                                else
                                        hprintf("[%s: %s]", ref->type, ref->url);
       @@ -1687,7 +1708,7 @@ static void
        xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
        {
                struct tag *found, *tag;
       -        char *child, *childs[16];
       +        enum TagId child, childs[16];
                size_t nchilds;
                int i, j, k, nchildfound, parenttype;
        
       @@ -1701,35 +1722,39 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
                   in reality the optional tag rules are more complex, see:
                   https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */
        
       -        child = NULL;
       +        child = 0;
                nchilds = 0;
                nchildfound = 0;
       -        parenttype = 0;
       +        parenttype = 0; /* by default, seek until the root */
        
                if (found && found->displaytype & DisplayPre) {
                        skipinitialws = 0; /* do not skip white-space, for margins */
                } else if (found && found->displaytype & DisplayList) {
       -                childs[0] = "li";
       +                childs[0] = TagLi;
                        nchilds = 1;
                        parenttype = DisplayList;
                } else if (found && found->displaytype & DisplayTableRow) {
       -                childs[0] = "td";
       +                childs[0] = TagTd;
                        nchilds = 1;
                        parenttype = DisplayTableRow;
                } else if (found && found->displaytype & DisplayTable) {
       -                childs[0] = "td";
       +                childs[0] = TagTd;
                        nchilds = 1;
                        parenttype = DisplayTable;
                } else if (found && found->displaytype & DisplaySelect) {
       -                childs[0] = "option";
       +                childs[0] = TagOption;
                        nchilds = 1;
                        parenttype = DisplaySelect;
                } else if (found && found->displaytype & DisplayDl) {
       -                childs[0] = "p";
       -                childs[1] = "dd";
       -                childs[2] = "dt";
       +                childs[0] = TagP;
       +                childs[1] = TagDd;
       +                childs[2] = TagDt;
                        nchilds = 3;
                        parenttype = DisplayDl;
       +        } else if (found && found->displaytype & DisplayBlock) {
       +                childs[0] = TagP;
       +                nchilds = 1;
       +                parenttype = 0; /* seek until the root */
                }
        
                if (nchilds > 0) {
       @@ -1740,7 +1765,7 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
                                        break;
                                for (j = 0; j < nchilds; j++) {
                                        child = childs[j];
       -                                if (!tagcmp(nodes[i].tag.name, child)) {
       +                                if (nodes[i].tag.id == child) {
                                                /* fake closing the previous tags */
                                                for (k = curnode; k >= i; k--)
                                                        endnode(&nodes[k]);
       @@ -1794,7 +1819,8 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
        {
                struct tag *found;
                struct node *cur;
       -        char *child, *childs[16];
       +        enum TagId tagid;
       +        enum TagId child, childs[16];
                size_t nchilds;
                char *s;
                int i, j, k, nchildfound, parenttype;
       @@ -1821,55 +1847,56 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
                   in reality the optional tag rules are more complex, see:
                   https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */
        
       -        child = NULL;
       +        child = 0;
                nchilds = 0;
                nchildfound = 0;
       -        parenttype = 0;
       +        parenttype = 0; /* by default, seek until the root */
        
                /* if optional tag <p> is open and a list element is found, close </p>. */
                if (found && found->displaytype & DisplayList) {
                        /* not inside a list */
       -                childs[0] = "p";
       +                childs[0] = TagP;
                        nchilds = 1;
                        parenttype = DisplayList;
                } else if (found && found->isoptional) {
       -                if (!tagcmp(t, "li")) {
       -                        childs[0] = "li";
       +                tagid = found->id;
       +                if (tagid == TagLi) {
       +                        childs[0] = TagLi;
                                nchilds = 1;
                                parenttype = DisplayList;
       -                } else if (!tagcmp(t, "td")) {
       -                        childs[0] = "td";
       +                } else if (tagid == TagTd) {
       +                        childs[0] = TagTd;
                                nchilds = 1;
                                parenttype = DisplayTableRow;
       -                } else if (!tagcmp(t, "tr")) {
       -                        childs[0] = "tr";
       +                } else if (tagid == TagTr) {
       +                        childs[0] = TagTr;
                                nchilds = 1;
                                parenttype = DisplayTable;
       -                } else if (!tagcmp(t, "p")) {
       -                        childs[0] = "p";
       +                } else if (tagid == TagP) {
       +                        childs[0] = TagP;
                                nchilds = 1;
                                parenttype = 0; /* seek until the root */
       -                } else if (!tagcmp(t, "option")) {
       -                        childs[0] = "option";
       +                } else if (tagid == TagOption) {
       +                        childs[0] = TagOption;
                                nchilds = 1;
                                parenttype = DisplaySelect;
       -                } else if (!tagcmp(t, "dt")) {
       -                        childs[0] = "dd";
       +                } else if (tagid == TagDt) {
       +                        childs[0] = TagDd;
                                nchilds = 1;
                                parenttype = DisplayDl;
       -                } else if (!tagcmp(t, "dd")) {
       -                        childs[0] = "dd";
       -                        childs[1] = "dt";
       +                } else if (tagid == TagDd) {
       +                        childs[0] = TagDd;
       +                        childs[1] = TagDt;
                                nchilds = 2;
                                parenttype = DisplayDl;
       -                } else if (!tagcmp(t, cur->tag.name)) {
       +                } else if (tagid == cur->tag.id) {
                                /* fake closing the previous tag if it is the same and repeated */
                                xmltagend(p, t, tl, 0);
                        }
                } else if (found && found->displaytype & DisplayBlock) {
                        /* check if we have an open "<p>" tag */
       -                childs[0] = "p";
       -                childs[1] = "dl";
       +                childs[0] = TagP;
       +                childs[1] = TagDl;
                        nchilds = 2;
                        parenttype = DisplayDl;
                }
       @@ -1882,7 +1909,7 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
                                        break;
                                for (j = 0; j < nchilds; j++) {
                                        child = childs[j];
       -                                if (!tagcmp(nodes[i].tag.name, child)) {
       +                                if (nodes[i].tag.id == child) {
                                                /* fake closing the previous tags */
                                                for (k = curnode; k >= i; k--)
                                                        xmltagend(p, nodes[k].tag.name, strlen(nodes[k].tag.name), 0);
       @@ -1917,19 +1944,26 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
        static void
        xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
        {
       +        struct tag *found;
       +        enum TagId tagid;
                struct node *cur, *parent;
                int i, margintop;
        
       +        /* match tag */
       +        tagid = 0;
       +        if ((found = findtag(t)))
       +                tagid = found->id;
       +
                /* temporary replace the callback except the reader and end of tag
                   restore the context once we receive the same ignored tag in the
                   end tag handler */
       -        if (!tagcmp(t, "script")) {
       +        if (tagid == TagScript) {
                        ignorestate = endtag = "</script>";
                        getnext = p->getnext; /* for restore */
                        p->getnext = getnext_ignore;
                        xmltagend(p, t, tl, 0); /* fake the call the tag was ended */
                        return;
       -        } else if (!tagcmp(t, "style")) {
       +        } else if (tagid == TagStyle) {
                        ignorestate = endtag = "</style>";
                        getnext = p->getnext; /* for restore */
                        p->getnext = getnext_ignore;
       @@ -2089,12 +2123,12 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
                   the node */
                cur->hasdata = 0;
        
       -        if (!tagcmp(t, "hr")) { /* ruler */
       +        if (tagid == TagHr) { /* ruler */
                        i = termwidth - indent - defaultindent;
                        for (; i > 0; i--)
                                hprint(str_ruler);
                        cur->hasdata = 1; /* treat <hr/> as data */
       -        } else if (!tagcmp(t, "br")) {
       +        } else if (tagid == TagBr) {
                        hflush();
                        hadnewline = 0; /* forced newline */
                        hputchar('\n');
       @@ -2107,65 +2141,78 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
        }
        
        static void
       -xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
       -        size_t namelen, const char *value, size_t valuelen)
       +xmlattr(XMLParser *p, const char *t, size_t tl, const char *n,
       +        size_t nl, const char *v, size_t vl)
        {
                struct node *cur;
       +        enum TagId tagid;
        
                cur = &nodes[curnode];
       -
       -        if (!attrcmp(name, "class"))
       -                string_append(&attr_class, value, valuelen);
       -        else if (!attrcmp(name, "id"))
       -                string_append(&attr_id, value, valuelen);
       -
       -        /* <base href="..." /> */
       -        if (!basehrefset && !attrcmp(name, "href") && !tagcmp(tag, "base"))
       -                strlcat(basehrefdoc, value, sizeof(basehrefdoc));
       +        tagid = cur->tag.id;
        
                /* hide tags with attribute aria-hidden or hidden */
       -        if (!attrcmp(name, "aria-hidden") || !attrcmp(name, "hidden"))
       +        if (!attrcmp(n, "aria-hidden") || !attrcmp(n, "hidden"))
                        cur->tag.displaytype |= DisplayNone;
        
       -        if (!tagcmp(tag, "select") && !attrcmp(name, "multiple"))
       -                cur->tag.displaytype |= DisplaySelectMulti;
       +        if (!attrcmp(n, "class"))
       +                string_append(&attr_class, v, vl);
       +        else if (!attrcmp(n, "id"))
       +                string_append(&attr_id, v, vl);
       +        else if (!attrcmp(n, "type"))
       +                string_append(&attr_type, v, vl);
       +        else if (!attrcmp(n, "value"))
       +                string_append(&attr_value, v, vl);
        
       -        if (!tagcmp(tag, "a") && !attrcmp(name, "href"))
       -                string_append(&attr_href, value, valuelen);
       +        /* <base href="..." /> */
       +        if (!basehrefset && tagid == TagBase && !attrcmp(n, "href"))
       +                strlcat(basehrefdoc, v, sizeof(basehrefdoc));
        
       -        if (!tagcmp(tag, "object") && !attrcmp(name, "data"))
       -                string_append(&attr_data, value, valuelen);
       +        if (tagid == TagA && !attrcmp(n, "href"))
       +                string_append(&attr_href, v, vl);
        
       -        if ((!tagcmp(tag, "img") || !tagcmp(tag, "video") ||
       -             !tagcmp(tag, "source") || !tagcmp(tag, "track") ||
       -             !tagcmp(tag, "audio")) &&
       -             !attrcmp(name, "src") && valuelen)
       -                string_append(&attr_src, value, valuelen);
       +        if (tagid == TagSelect && !attrcmp(n, "multiple"))
       +                cur->tag.displaytype |= DisplaySelectMulti;
        
       -        /* show img alt attribute as text. */
       -        if (!tagcmp(tag, "img") && !attrcmp(name, "alt"))
       -                string_append(&attr_alt, value, valuelen);
       +        if (tagid == TagObject && !attrcmp(n, "data"))
       +                string_append(&attr_data, v, vl);
        
       -        if (!attrcmp(name, "checked"))
       -                string_append(&attr_checked, value, valuelen);
       -        else if (!attrcmp(name, "type"))
       -                string_append(&attr_type, value, valuelen);
       -        else if (!attrcmp(name, "value"))
       -                string_append(&attr_value, value, valuelen);
       +        /* show img alt attribute as text. */
       +        if (tagid == TagImg && !attrcmp(n, "alt"))
       +                string_append(&attr_alt, v, vl);
       +
       +        if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked"))
       +                string_append(&attr_checked, v, vl);
       +
       +        /* src attribute */
       +        switch (tagid) {
       +        case TagAudio:
       +        case TagEmbed:
       +        case TagFrame:
       +        case TagIframe:
       +        case TagImg:
       +        case TagSource:
       +        case TagTrack:
       +        case TagVideo:
       +                if (!attrcmp(n, "src"))
       +                        string_append(&attr_src, v, vl);
       +                break;
       +        default:
       +                break;
       +        }
        }
        
        static void
       -xmlattrentity(XMLParser *p, const char *tag, size_t taglen, const char *name,
       -        size_t namelen, const char *value, size_t valuelen)
       +xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n,
       +        size_t nl, const char *v, size_t vl)
        {
                char buf[16];
       -        int n;
       +        int len;
        
       -        n = xml_entitytostr(value, buf, sizeof(buf));
       -        if (n > 0)
       -                xmlattr(p, tag, taglen, name, namelen, buf, (size_t)n);
       +        len = xml_entitytostr(v, buf, sizeof(buf));
       +        if (len > 0)
       +                xmlattr(p, t, tl, n, nl, buf, (size_t)len);
                else
       -                xmlattr(p, tag, taglen, name, namelen, value, valuelen);
       +                xmlattr(p, t, tl, n, nl, v, vl);
        }
        
        static void
       @@ -2173,12 +2220,14 @@ xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n,
                size_t nl)
        {
                struct node *cur;
       +        enum TagId tagid;
        
                cur = &nodes[curnode];
       +        tagid = cur->tag.id;
        
                /* set base URL, if it is set it cannot be overwritten again */
                if (!basehrefset && basehrefdoc[0] &&
       -            !attrcmp(n, "href") && !tagcmp(t, "base"))
       +            tagid == TagBase && !attrcmp(n, "href"))
                        basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : 0;
        
                /* if attribute checked is set but it has no value then set it to "checked" */
       @@ -2190,6 +2239,12 @@ static void
        xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n,
                size_t nl)
        {
       +        struct node *cur;
       +        enum TagId tagid;
       +
       +        cur = &nodes[curnode];
       +        tagid = cur->tag.id;
       +
                if (!attrcmp(n, "alt"))
                        string_clear(&attr_alt);
                else if (!attrcmp(n, "checked"))
       @@ -2209,7 +2264,7 @@ xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n,
                else if (!attrcmp(n, "value"))
                        string_clear(&attr_value);
        
       -        if (basehrefdoc[0] && !attrcmp(n, "href") && !tagcmp(t, "base"))
       +        if (basehrefdoc[0] && tagid == TagBase && !attrcmp(n, "href"))
                        basehrefdoc[0] = '\0';
        }
        
       @@ -2236,7 +2291,8 @@ main(int argc, char **argv)
                        break;
                case 'b':
                        basehref = EARGF(usage());
       -                if (uri_parse(basehref, &base) == -1)
       +                if (uri_parse(basehref, &base) == -1 ||
       +                    !base.proto[0])
                                usage();
                        basehrefset = 1;
                        break;