codemadness.org

       webdump.c - webdump - HTML to plain-text converter for webpages
 (HTM) git clone git://git.codemadness.org/webdump
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       webdump.c (66805B)
       ---
            1 #include <errno.h>
            2 #include <limits.h>
            3 #include <stdio.h>
            4 #include <stdarg.h>
            5 #include <stdlib.h>
            6 #include <string.h>
            7 #include <strings.h>
            8 #include <unistd.h>
            9 
           10 #include "arg.h"
           11 char *argv0;
           12 
           13 #include "tree.h"
           14 #include "xml.h"
           15 
           16 static XMLParser parser;
           17 
           18 #ifndef __OpenBSD__
           19 #define pledge(p1,p2) 0
           20 #endif
           21 
           22 #undef strlcat
           23 size_t strlcat(char *, const char *, size_t);
           24 #undef strlcpy
           25 size_t strlcpy(char *, const char *, size_t);
           26 
           /* ctype-like macros, but always compatible with ASCII / UTF-8.
              The argument is parenthesized as a whole before it is cast, so that
              expression arguments (for example a conditional expression) are
              converted correctly.
              ISCNTRL, ISSPACE and TOLOWER evaluate their argument more than once:
              do not pass expressions with side effects. */
           #define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
           #define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f)
           #define ISDIGIT(c) (((unsigned)(c)) - '0' < 10)
           #define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
           #define TOLOWER(c) ((((unsigned)(c)) - 'A' < 26) ? ((c) | 32) : (c))

           /* number of elements of an array (must be a real array, not a pointer) */
           #define LEN(x) (sizeof(x) / sizeof((x)[0]))
           35 
           /* URI, split into its components. Each component is kept as a string in
              a fixed-size buffer. */
           struct uri {
                   char proto[48];     /* scheme including ":" or "://" */
                   char userinfo[256]; /* username [:password] */
                   char host[256];
                   char port[6];       /* numeric port */
                   char path[1024];
                   char query[1024];
                   char fragment[1024];
           };
           46 
           /* options: the command-line flag belonging to each setting is noted in
              parentheses, the initializer is the default value */
           static int allowansi     = 0;  /* (-a) allow ANSI escape codes */
           static int uniqrefs      = 0;  /* (-d) number unique references */
           static int showrefinline = 0;  /* (-i) show link reference number inline */
           static int showurlinline = 0;  /* (-I) show full link reference inline */
           static int showrefbottom = 0;  /* (-l) show link references at the bottom */
           static int allowlinewrap = 0;  /* (-r) line-wrapping */
           static int termwidth     = 77; /* (-w) terminal width */
           static int resources     = 0;  /* (-x) write resources line-by-line to fd 3? */
           /* display type of an element. Every value is a separate bit, so several
              types can be combined with a bitwise OR. */
           enum DisplayType {
                   DisplayUnknown     = 0,
                   DisplayInline      = 1 << 0,
                   DisplayInlineBlock = 1 << 1, /* unused for now */
                   DisplayBlock       = 1 << 2,
                   DisplayNone        = 1 << 3,
                   DisplayPre         = 1 << 4,
                   DisplayList        = 1 << 5,
                   DisplayListOrdered = 1 << 6,
                   DisplayListItem    = 1 << 7,
                   DisplayTable       = 1 << 8,
                   DisplayTableRow    = 1 << 9,
                   DisplayTableCell   = 1 << 10,
                   DisplayHeader      = 1 << 11,
                   DisplayDl          = 1 << 12,
                   DisplayInput       = 1 << 13,
                   DisplayButton      = 1 << 14,
                   DisplaySelect      = 1 << 15,
                   DisplaySelectMulti = 1 << 16,
                   DisplayOption      = 1 << 17
           };
           78 
           /* ANSI markup. Every value is a separate bit, so several kinds of markup
              can be combined with a bitwise OR. */
           enum MarkupType {
                   MarkupNone        = 0,
                   MarkupBold        = 1 << 0,
                   MarkupItalic      = 1 << 1,
                   MarkupUnderline   = 1 << 2,
                   MarkupBlink       = 1 << 3, /* lol */
                   MarkupReverse     = 1 << 4,
                   MarkupStrike      = 1 << 5
           };
           89 
           /* String data / memory pool: a growable, NUL-terminated byte buffer */
           typedef struct string {
                   char   *data;   /* data */
                   size_t  len;    /* string length */
                   size_t  bufsiz; /* allocated size */
           } String;
           96 
           /* numeric id per known tag. Ids start at 1, so 0 never refers to a known
              tag. */
           enum TagId { TagA = 1, TagAddress, TagArea, TagArticle, TagAside, TagAudio,
                   TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButton,
                   TagCite, TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDetails,
                   TagDfn, TagDir, TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFieldset,
                   TagFigcaption, TagFigure, TagFooter, TagForm, TagFrame, TagH1, TagH2,
                   TagH3, TagH4, TagH5, TagH6, TagHead, TagHeader, TagHr, TagHtml, TagI,
                   TagIframe, TagImg, TagInput, TagIns, TagLabel, TagLegend, TagLi,
                   TagLink, TagMain, TagMark, TagMenu, TagMeta, TagNav, TagObject, TagOl,
                   TagOption, TagP, TagParam, TagPre, TagS, TagScript, TagSearch,
                   TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle,
                   TagSummary, TagSvg, TagTable, TagTbody, TagTd, TagTemplate,
                   TagTextarea, TagTfoot, TagTh, TagThead, TagTitle, TagTr, TagTrack,
                   TagU, TagUl, TagVar, TagVideo, TagWbr, TagXmp };
          110 
          111 struct tag {
          112         const char *name;
          113         enum TagId id;
          114         enum DisplayType displaytype;
          115         enum MarkupType markuptype; /* ANSI markup */
          116         enum DisplayType parenttype; /* display type belonging to element */
          117         int isvoid; /* "void" element */
          118         int isoptional; /* optional to close tag */
          119         int margintop; /* newlines when the tag starts */
          120         int marginbottom; /* newlines after the tag ends */
          121         int indent; /* indent in cells */
          122 };
          123 
          124 struct node {
          125         char tagname[256];
          126         struct tag tag;
          127         size_t nchildren; /* child node count */
          128         size_t visnchildren; /* child node count which are visible */
          129         /* attributes */
          130         char id[256];
          131         char classnames[1024];
          132         int indent; /* indent per node, for formatting */
          133         int hasdata; /* tag contains some data, for formatting */
          134 };
          135 
          /* one step of a selector: what a single node has to match on */
          struct selectornode {
                  char tagname[256];
                  long index; /* index of node to match on: -1 if not matching on index */
                  /* attributes */
                  char id[256];
                  char classnames[1024];
          };

          /* a selector: its original text and the steps it consists of (at most 32) */
          struct selector {
                  const char *text;
                  struct selectornode nodes[32];
                  int depth;
          };

          /* list of selectors */
          struct selectors {
                  struct selector **selectors;
                  size_t count;
          };
          155 
          156 /* RB tree of link references */
          157 struct linkref {
          158         char *type;
          159         enum TagId tagid;
          160         char *url;
          161         int ishidden;
          162         size_t linknr;
          163         RB_ENTRY(linkref) entry;
          164 };
          165 
          166 /* link references and hidden link references */
          167 static struct linkref **visrefs;
          168 static size_t nvisrefs, ncapvisrefs; /* visible link count / capacity */
          169 static struct linkref **hiddenrefs;
          170 static size_t nhiddenrefs, ncaphiddenrefs; /* hidden link count / capacity */
          171 
          172 /* compare link by URL for link references RB-tree */
          173 static int
          174 linkrefcmp(struct linkref *r1, struct linkref *r2)
          175 {
          176         return strcmp(r1->url, r2->url);
          177 }
          178 
          179 RB_HEAD(linkreftree, linkref) linkrefhead = RB_INITIALIZER(&linkrefhead);
          180 RB_GENERATE(linkreftree, linkref, entry, linkrefcmp)
          181 
          182 static const char *str_bullet_item = "* ";
          183 static const char *str_checkbox_checked = "x";
          184 static const char *str_ruler = "-";
          185 static const char *str_radio_checked = "*";
          186 
          187 /* base href, to make URLs absolute */
          188 static char basehrefdoc[4096]; /* buffer for base href in document, if any */
          189 static int basehrefset; /* base href set and can be used? */
          190 static struct uri base; /* parsed current base href */
          191 
          192 /* buffers for some attributes of the current tag */
          193 static String attr_alt; /* alt attribute */
          194 static String attr_checked; /* checked attribute */
          195 static String attr_class; /* class attribute */
          196 static int attr_class_set; /* class attribute is set already */
          197 static String attr_data; /* data attribute */
          198 static String attr_href; /* href attribute */
          199 static String attr_id; /* id attribute */
          200 static int attr_id_set; /* class attribute is set already */
          201 static String attr_src; /* src attribute */
          202 static String attr_type; /* type attribute */
          203 static String attr_value; /* value attribute */
          204 
          205 static String htmldata; /* buffered HTML data near the current tag */
          206 
          /* for white-space output handling:
             1 = whitespace emitted (suppress repeated), 2 = other characters on this line
             Behaviour:
             * White-space data before non-whitespace data in tags are ignored on a line.
             * Repeated white-space are ignored: a single space (' ') is emitted.
          */
          static int whitespace_mode;
          static int nbytesline; /* bytes on this line */
          static int ncells; /* current cell/column count */
          static int hadnewline; /* count for repeated newlines */
          /* flag for skipping initial white-space in tag: for HTML white-space handling */
          static int skipinitialws = 1;
          /* a macro as well as a variable: the macro can be used in static
             initializers */
          #define DEFAULT_INDENT 2
          static const int defaultindent = DEFAULT_INDENT; /* default indent / margin */
          static int indent; /* indent for the current line, in columns */
          /* previous output sequential newlines, used for calculating margins between
             elements and reducing excessive newlines */
          static int currentnewlines;

          /* buffers for line-wrapping (buffer per word boundary) */
          static char rbuf[1024];
          static int rbuflen;
          static int rnbufcells; /* pending cell count to add */
          230 
          231 #define MAX_NODE_DEPTH 4096 /* absolute maximum node depth */
          232 static struct node *nodes; /* node tree (one per level is remembered) */
          233 static String *nodes_links; /* keep track of links per node */
          234 static size_t ncapnodes; /* current allocated node capacity */
          235 static int curnode; /* current node depth */
          236 
          237 /* reader / selector mode (-s) */
          238 static int reader_mode;
          239 /* flag if the tags and their children should be ignored in the current context */
          240 static int reader_ignore;
          241 
          242 static enum MarkupType curmarkup; /* current markup state (bold, underline, etc) */
          243 static int linewrap; /* allow linewrap in this context */
          244 
          245 /* selector to match (for -s and -u) */
          246 static struct selectors *sel_hide, *sel_show;
          247 
          248 /* tags table: needs to be sorted like tagcmp(), alphabetically */
          249 
          250 /* tag          id             displaytype                       markup           parent           v  o  b  a  i */
          251 static struct tag tags[] = {
          252 { "a",          TagA,          DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
          253 { "address",    TagAddress,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          254 { "area",       TagArea,       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          255 { "article",    TagArticle,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          256 { "aside",      TagAside,      DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          257 { "audio",      TagAudio,      DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
          258 { "b",          TagB,          DisplayInline,                    MarkupBold,      0,               0, 0, 0, 0, 0 },
          259 { "base",       TagBase,       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          260 { "blink",      TagBlink,      DisplayInline,                    MarkupBlink,     0,               0, 0, 0, 0, 0 },
          261 { "blockquote", TagBlockquote, DisplayBlock,                     0,               0,               0, 0, 0, 0, 2 },
          262 { "body",       TagBody,       DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          263 { "br",         TagBr,         0,                                0,               0,               1, 0, 0, 0, 0 },
          264 { "button",     TagButton,     DisplayInline | DisplayButton,    0,               0,               0, 0, 0, 0, 0 },
          265 { "cite",       TagCite,       DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
          266 { "col",        TagCol,        DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          267 { "colgroup",   TagColgroup,   DisplayInline,                    0,               0,               0, 1, 0, 0, 0 },
          268 { "datalist",   TagDatalist,   DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
          269 { "dd",         TagDd,         DisplayBlock,                     0,               0,               0, 1, 0, 0, 4 },
          270 { "del",        TagDel,        DisplayInline,                    MarkupStrike,    0,               0, 0, 0, 0, 0 },
          271 { "details",    TagDetails,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          272 { "dfn",        TagDfn,        DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
          273 { "dir",        TagDir,        DisplayList,                      0,               0,               0, 0, 1, 1, 2 },
          274 { "div",        TagDiv,        DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          275 { "dl",         TagDl,         DisplayBlock | DisplayDl,         0,               0,               0, 0, 0, 0, 0 },
          276 { "dt",         TagDt,         DisplayBlock,                     MarkupBold,      0,               0, 1, 0, 0, 0 },
          277 { "em",         TagEm,         DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
          278 { "embed",      TagEmbed,      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          279 { "fieldset",   TagFieldset,   DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          280 { "figcaption", TagFigcaption, DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          281 { "figure",     TagFigure,     DisplayBlock,                     0,               0,               0, 0, 1, 1, 4 },
          282 { "footer",     TagFooter,     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          283 { "form",       TagForm,       DisplayBlock,                     0,               0,               0, 0, 0, 1, 0 },
          284 { "frame",      TagFrame,      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          285 { "h1",         TagH1,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
          286 { "h2",         TagH2,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
          287 { "h3",         TagH3,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
          288 { "h4",         TagH4,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
          289 { "h5",         TagH5,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
          290 { "h6",         TagH6,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
          291 { "head",       TagHead,       DisplayBlock,                     0,               0,               0, 1, 0, 0, 0 },
          292 { "header",     TagHeader,     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          293 { "hr",         TagHr,         DisplayBlock,                     0,               0,               1, 0, 0, 0, 0 },
          294 { "html",       TagHtml,       DisplayBlock,                     0,               0,               0, 1, 0, 0, 0 },
          295 { "i",          TagI,          DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
          296 { "iframe",     TagIframe,     DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
          297 { "img",        TagImg,        DisplayInline,                    MarkupUnderline, 0,               1, 0, 0, 0, 0 },
          298 { "input",      TagInput,      DisplayInput,                     0,               0,               1, 0, 0, 0, 0 },
          299 { "ins",        TagIns,        DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
          300 { "label",      TagLabel,      DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
          301 { "legend",     TagLegend,     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          302 { "li",         TagLi,         DisplayListItem,                  0,               DisplayList,     0, 1, 0, 0, 0 },
          303 { "link",       TagLink,       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          304 { "main",       TagMain,       DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          305 { "mark",       TagMark,       DisplayInline,                    MarkupReverse,   0,               0, 0, 0, 0, 0 },
          306 { "menu",       TagMenu,       DisplayList,                      0,               0,               0, 0, 1, 1, 2 },
          307 { "meta",       TagMeta,       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          308 { "nav",        TagNav,        DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          309 { "object",     TagObject,     DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
          310 { "ol",         TagOl,         DisplayList | DisplayListOrdered, 0,               0,               0, 0, 1, 1, 0 },
          311 { "option",     TagOption,     DisplayInline | DisplayOption,    0,               0,               0, 1, 0, 0, 0 },
          312 { "p",          TagP,          DisplayBlock,                     0,               0,               0, 1, 1, 1, 0 },
          313 { "param",      TagParam,      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          314 { "pre",        TagPre,        DisplayPre,                       0,               0,               0, 0, 1, 1, 4 },
          315 { "s",          TagS,          DisplayInline,                    MarkupStrike,    0,               0, 0, 0, 0, 0 },
          316 { "script",     TagScript,     DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
          317 { "search",     TagSearch,     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          318 { "section",    TagSection,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          319 { "select",     TagSelect,     DisplayInline | DisplaySelect,    0,               0,               0, 0, 0, 0, 0 },
          320 { "source",     TagSource,     DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          321 { "strike",     TagStrike,     DisplayInline,                    MarkupStrike,    0,               0, 0, 0, 0, 0 },
          322 { "strong",     TagStrong,     DisplayInline,                    MarkupBold,      0,               0, 0, 0, 0, 0 },
          323 { "style",      TagStyle,      DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
          324 { "summary",    TagSummary,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          325 { "svg",        TagSvg,        DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
          326 { "table",      TagTable,      DisplayTable,                     0,               0,               0, 0, 0, 0, 0 },
          327 { "tbody",      TagTbody,      DisplayInline,                    0,               DisplayTable,    0, 1, 0, 0, 0 },
          328 { "td",         TagTd,         DisplayTableCell,                 0,               DisplayTableRow, 0, 1, 0, 0, 0 },
          329 { "template",   TagTemplate,   DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
          330 { "textarea",   TagTextarea,   DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
          331 { "tfoot",      TagTfoot,      DisplayInline,                    0,               DisplayTable,    0, 1, 0, 0, 0 },
          332 { "th",         TagTh,         DisplayTableCell,                 MarkupBold,      DisplayTableRow, 0, 1, 0, 0, 0 },
          333 { "thead",      TagThead,      DisplayInline,                    0,               DisplayTable,    0, 1, 0, 0, 0 },
          334 { "title",      TagTitle,      DisplayBlock,                     0,               0,               0, 0, 0, 1, -DEFAULT_INDENT },
          335 { "tr",         TagTr,         DisplayTableRow,                  0,               DisplayTable,    0, 1, 0, 0, 0 },
          336 { "track",      TagTrack,      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          337 { "u",          TagU,          DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
          338 { "ul",         TagUl,         DisplayList,                      0,               0,               0, 0, 1, 1, 2 },
          339 { "var",        TagVar,        DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
          340 { "video",      TagVideo,      DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
          341 { "wbr",        TagWbr,        DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          342 { "xmp",        TagXmp,        DisplayPre,                       0,               0,               0, 0, 1, 1, 4 }
          343 };
          344 
          /* hint for compilers and static analyzers that a function exits.
             Systems that do not provide __dead get the GCC / Clang "noreturn"
             attribute when it is available, otherwise the hint is dropped. */
          #ifndef __dead
          #if defined(__GNUC__) || defined(__clang__)
          #define __dead __attribute__((__noreturn__))
          #else
          #define __dead
          #endif
          #endif

          /* print to stderr, print error message of errno and exit().
             exitstatus: status passed to exit().
             fmt:        printf-like format of the message, may be NULL for no
                         message. The message is followed by ": " and the
                         description of errno. */
          __dead static void
          err(int exitstatus, const char *fmt, ...)
          {
                  va_list ap;
                  int saved_errno;

                  /* save errno first: the stdio calls below may change it */
                  saved_errno = errno;

                  fputs("webdump: ", stderr);
                  if (fmt) {
                          va_start(ap, fmt);
                          vfprintf(stderr, fmt, ap);
                          va_end(ap);
                          fputs(": ", stderr);
                  }
                  fprintf(stderr, "%s\n", strerror(saved_errno));

                  exit(exitstatus);
          }
          370 
          371 /* print to stderr and exit(). */
          372 __dead static void
          373 errx(int exitstatus, const char *fmt, ...)
          374 {
          375         va_list ap;
          376 
          377         fputs("webdump: ", stderr);
          378         if (fmt) {
          379                 va_start(ap, fmt);
          380                 vfprintf(stderr, fmt, ap);
          381                 va_end(ap);
          382         }
          383         fputs("\n", stderr);
          384 
          385         exit(exitstatus);
          386 }
          387 
          388 static const char *ignorestate, *endtag;
          389 static int (*getnext)(void);
          390 
          391 /* return a space for all data until some case-insensitive string occurs. This
          392    is used to parse incorrect HTML/XML that contains unescaped HTML in script
          393    or style tags. If you see some </script> tag in a CDATA or comment
          394    section then e-mail W3C and tell them the web is too complex. */
          395 static inline int
          396 getnext_ignore(void)
          397 {
          398         int c;
          399 
          400         if ((c = getnext()) == EOF)
          401                 return EOF;
          402 
          403         if (TOLOWER((unsigned char)c) == TOLOWER((unsigned char)*ignorestate)) {
          404                 ignorestate++;
          405                 if (*ignorestate == '\0') {
          406                         parser.getnext = getnext; /* restore */
          407                         return ' ';
          408                 }
          409         } else {
          410                 ignorestate = endtag; /* no full match: reset to beginning */
          411         }
          412 
          413         return ' '; /* pretend there is just SPACEs */
          414 }
          415 
          416 /* Clear string only; don't free, prevents unnecessary reallocation. */
          417 static void
          418 string_clear(String *s)
          419 {
          420         if (s->data)
          421                 s->data[0] = '\0';
          422         s->len = 0;
          423 }
          424 
          425 static void
          426 string_buffer_realloc(String *s, size_t newlen)
          427 {
          428         size_t alloclen;
          429 
          430         for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
          431                 ;
          432         if (!(s->data = realloc(s->data, alloclen)))
          433                 err(1, "realloc");
          434         s->bufsiz = alloclen;
          435 }
          436 
          437 static void
          438 string_append(String *s, const char *data, size_t len)
          439 {
          440         if (!len)
          441                 return;
          442         /* check if allocation is necesary, don't shrink buffer,
          443          * should be more than bufsiz ofcourse. */
          444         if (s->len + len >= s->bufsiz)
          445                 string_buffer_realloc(s, s->len + len + 1);
          446         memcpy(s->data + s->len, data, len);
          447         s->len += len;
          448         s->data[s->len] = '\0';
          449 }
          450 
          /* strdup() which exits the program using err() on failure: the result is
             never NULL */
          static char *
          estrdup(const char *s)
          {
                  char *p;

                  if (!(p = strdup(s)))
                          err(1, "strdup");
                  return p;
          }
          460 
          /* strndup() which exits the program using err() on failure: the result is
             never NULL */
          static char *
          estrndup(const char *s, size_t n)
          {
                  char *p;

                  if (!(p = strndup(s, n)))
                          err(1, "strndup");
                  return p;
          }
          470 
          /* realloc() which exits the program using err() on failure: the result is
             never NULL */
          static void *
          erealloc(void *p, size_t siz)
          {
                  /* realloc() with a size of 0 is allowed to return NULL without this
                     being an error: always request at least one byte */
                  if (!(p = realloc(p, siz ? siz : 1)))
                          err(1, "realloc");

                  return p;
          }
          479 
          /* calloc() which exits the program using err() on failure: the result is
             never NULL */
          static void *
          ecalloc(size_t nmemb, size_t size)
          {
                  void *p;

                  /* calloc() of 0 bytes is allowed to return NULL without this being
                     an error: always request at least one byte */
                  if (!nmemb || !size)
                          nmemb = size = 1;
                  if (!(p = calloc(nmemb, size)))
                          err(1, "calloc");
                  return p;
          }
          489 
          /* check if string has a non-empty scheme / protocol part.
             Returns 1 when s starts with one or more of the characters a-z, A-Z,
             0-9, '+', '-' or '.', directly followed by a ':'. Returns 0 otherwise. */
          static int
          uri_hasscheme(const char *s)
          {
                  const char *p = s;

                  /* skip the characters that are allowed in a scheme */
                  while (ISALPHA((unsigned char)*p) || ISDIGIT((unsigned char)*p) ||
                         *p == '+' || *p == '-' || *p == '.')
                          p++;

                  /* scheme, except if empty and starts with ":" then it is a path */
                  return (*p == ':' && p != s);
          }
          502 
          503 static int
          504 uri_parse(const char *s, struct uri *u)
          505 {
          506         const char *p = s;
          507         char *endptr;
          508         size_t i;
          509         long l;
          510 
          511         u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0';
          512         u->path[0] = u->query[0] = u->fragment[0] = '\0';
          513 
          514         /* protocol-relative */
          515         if (*p == '/' && *(p + 1) == '/') {
          516                 p += 2; /* skip "//" */
          517                 goto parseauth;
          518         }
          519 
          520         /* scheme / protocol part */
          521         for (; ISALPHA((unsigned char)*p) || ISDIGIT((unsigned char)*p) ||
          522                        *p == '+' || *p == '-' || *p == '.'; p++)
          523                 ;
          524         /* scheme, except if empty and starts with ":" then it is a path */
          525         if (*p == ':' && p != s) {
          526                 if (*(p + 1) == '/' && *(p + 2) == '/')
          527                         p += 3; /* skip "://" */
          528                 else
          529                         p++; /* skip ":" */
          530 
          531                 if ((size_t)(p - s) >= sizeof(u->proto))
          532                         return -1; /* protocol too long */
          533                 memcpy(u->proto, s, p - s);
          534                 u->proto[p - s] = '\0';
          535 
          536                 if (*(p - 1) != '/')
          537                         goto parsepath;
          538         } else {
          539                 p = s; /* no scheme format, reset to start */
          540                 goto parsepath;
          541         }
          542 
          543 parseauth:
          544         /* userinfo (username:password) */
          545         i = strcspn(p, "@/?#");
          546         if (p[i] == '@') {
          547                 if (i >= sizeof(u->userinfo))
          548                         return -1; /* userinfo too long */
          549                 memcpy(u->userinfo, p, i);
          550                 u->userinfo[i] = '\0';
          551                 p += i + 1;
          552         }
          553 
          554         /* IPv6 address */
          555         if (*p == '[') {
          556                 /* bracket not found, host too short or too long */
          557                 i = strcspn(p, "]");
          558                 if (p[i] != ']' || i < 3)
          559                         return -1;
          560                 i++; /* including "]" */
          561         } else {
          562                 /* domain / host part, skip until port, path or end. */
          563                 i = strcspn(p, ":/?#");
          564         }
          565         if (i >= sizeof(u->host))
          566                 return -1; /* host too long */
          567         memcpy(u->host, p, i);
          568         u->host[i] = '\0';
          569         p += i;
          570 
          571         /* port */
          572         if (*p == ':') {
          573                 p++;
          574                 if ((i = strcspn(p, "/?#")) >= sizeof(u->port))
          575                         return -1; /* port too long */
          576                 memcpy(u->port, p, i);
          577                 u->port[i] = '\0';
          578                 /* check for valid port: range 1 - 65535, may be empty */
          579                 errno = 0;
          580                 l = strtol(u->port, &endptr, 10);
          581                 if (i && (errno || *endptr || l <= 0 || l > 65535))
          582                         return -1;
          583                 p += i;
          584         }
          585 
          586 parsepath:
          587         /* path */
          588         if ((i = strcspn(p, "?#")) >= sizeof(u->path))
          589                 return -1; /* path too long */
          590         memcpy(u->path, p, i);
          591         u->path[i] = '\0';
          592         p += i;
          593 
          594         /* query */
          595         if (*p == '?') {
          596                 p++;
          597                 if ((i = strcspn(p, "#")) >= sizeof(u->query))
          598                         return -1; /* query too long */
          599                 memcpy(u->query, p, i);
          600                 u->query[i] = '\0';
          601                 p += i;
          602         }
          603 
          604         /* fragment */
          605         if (*p == '#') {
          606                 p++;
          607                 if ((i = strlen(p)) >= sizeof(u->fragment))
          608                         return -1; /* fragment too long */
          609                 memcpy(u->fragment, p, i);
          610                 u->fragment[i] = '\0';
          611         }
          612 
          613         return 0;
          614 }
          615 
          616 /* Transform and try to make the URI `u` absolute using base URI `b` into `a`.
          617    Follows some of the logic from "RFC 3986 - 5.2.2. Transform References".
          618    Returns 0 on success, -1 on error or truncation. */
          619 static int
          620 uri_makeabs(struct uri *a, struct uri *u, struct uri *b)
          621 {
          622         char *p;
          623         int c;
          624 
          625         strlcpy(a->fragment, u->fragment, sizeof(a->fragment));
          626 
          627         if (u->proto[0] || u->host[0]) {
          628                 strlcpy(a->proto, u->proto[0] ? u->proto : b->proto, sizeof(a->proto));
          629                 strlcpy(a->host, u->host, sizeof(a->host));
          630                 strlcpy(a->userinfo, u->userinfo, sizeof(a->userinfo));
          631                 strlcpy(a->host, u->host, sizeof(a->host));
          632                 strlcpy(a->port, u->port, sizeof(a->port));
          633                 strlcpy(a->path, u->path, sizeof(a->path));
          634                 strlcpy(a->query, u->query, sizeof(a->query));
          635                 return 0;
          636         }
          637 
          638         strlcpy(a->proto, b->proto, sizeof(a->proto));
          639         strlcpy(a->host, b->host, sizeof(a->host));
          640         strlcpy(a->userinfo, b->userinfo, sizeof(a->userinfo));
          641         strlcpy(a->host, b->host, sizeof(a->host));
          642         strlcpy(a->port, b->port, sizeof(a->port));
          643 
          644         if (!u->path[0]) {
          645                 strlcpy(a->path, b->path, sizeof(a->path));
          646         } else if (u->path[0] == '/') {
          647                 strlcpy(a->path, u->path, sizeof(a->path));
          648         } else {
          649                 a->path[0] = (b->host[0] && b->path[0] != '/') ? '/' : '\0';
          650                 a->path[1] = '\0';
          651 
          652                 if ((p = strrchr(b->path, '/'))) {
          653                         c = *(++p);
          654                         *p = '\0'; /* temporary NUL-terminate */
          655                         if (strlcat(a->path, b->path, sizeof(a->path)) >= sizeof(a->path))
          656                                 return -1;
          657                         *p = c; /* restore */
          658                 }
          659                 if (strlcat(a->path, u->path, sizeof(a->path)) >= sizeof(a->path))
          660                         return -1;
          661         }
          662 
          663         if (u->path[0] || u->query[0])
          664                 strlcpy(a->query, u->query, sizeof(a->query));
          665         else
          666                 strlcpy(a->query, b->query, sizeof(a->query));
          667 
          668         return 0;
          669 }
          670 
          671 static int
          672 uri_format(char *buf, size_t bufsiz, struct uri *u)
          673 {
          674         return snprintf(buf, bufsiz, "%s%s%s%s%s%s%s%s%s%s%s%s",
          675                 u->proto,
          676                 u->userinfo[0] ? u->userinfo : "",
          677                 u->userinfo[0] ? "@" : "",
          678                 u->host,
          679                 u->port[0] ? ":" : "",
          680                 u->port,
          681                 u->host[0] && u->path[0] && u->path[0] != '/' ? "/" : "",
          682                 u->path,
          683                 u->query[0] ? "?" : "",
          684                 u->query,
          685                 u->fragment[0] ? "#" : "",
          686                 u->fragment);
          687 }
          688 
          689 /* compare tag name (case-insensitive) */
          690 static int
          691 tagcmp(const char *s1, const char *s2)
          692 {
          693         return strcasecmp(s1, s2);
          694 }
          695 
          696 /* compare attribute name (case-insensitive) */
          697 static int
          698 attrcmp(const char *s1, const char *s2)
          699 {
          700         return strcasecmp(s1, s2);
          701 }
          702 
          703 static void
          704 rindent(void)
          705 {
          706         int i, total;
          707 
          708         total = indent + defaultindent;
          709         if (total < 0)
          710                 total = 0;
          711         for (i = 0; i < total; i++)
          712                 putchar(' ');
          713 
          714         nbytesline += total;
          715         ncells += total;
          716 }
          717 
          718 static void
          719 emitmarkup(int markuptype)
          720 {
          721         if (!allowansi)
          722                 return;
          723 
          724         if (!markuptype)
          725                 fputs("\033[0m", stdout); /* reset all attributes */
          726 
          727         /* set */
          728         if (markuptype & MarkupBold)
          729                 fputs("\033[1m", stdout);
          730         if (markuptype & MarkupItalic)
          731                 fputs("\033[3m", stdout);
          732         if (markuptype & MarkupUnderline)
          733                 fputs("\033[4m", stdout);
          734         if (markuptype & MarkupBlink)
          735                 fputs("\033[5m", stdout);
          736         if (markuptype & MarkupReverse)
          737                 fputs("\033[7m", stdout);
          738         if (markuptype & MarkupStrike)
          739                 fputs("\033[9m", stdout);
          740 }
          741 
          742 /* flush remaining buffer (containing a word): used for word-wrap handling */
          743 static void
          744 hflush(void)
          745 {
          746         int i;
          747 
          748         if (!rbuflen)
          749                 return;
          750 
          751         if (!nbytesline) {
          752                 if (curmarkup)
          753                         emitmarkup(0);
          754                 rindent();
          755                 /* emit code again per line, needed for GNU/less -R */
          756                 if (curmarkup)
          757                         emitmarkup(curmarkup);
          758         }
          759 
          760         for (i = 0; i < rbuflen; i++)
          761                 putchar(rbuf[i]);
          762 
          763         nbytesline += rbuflen;
          764         ncells += rnbufcells;
          765         rbuflen = 0;
          766         rnbufcells = 0;
          767 }
          768 
          769 static void
          770 printansi(const char *s)
          771 {
          772         size_t len;
          773 
          774         if (!allowansi)
          775                 return;
          776 
          777         if (linewrap) {
          778                 len = strlen(s);
          779                 if (rbuflen + len + 1 >= sizeof(rbuf))
          780                         hflush();
          781                 if (rbuflen + len + 1 < sizeof(rbuf)) {
          782                         memcpy(rbuf + rbuflen, s, len);
          783                         rbuflen += len;
          784                         /* NOTE: nbytesline and ncells are not counted for markup */
          785                 }
          786         } else {
          787                 fputs(s, stdout);
          788         }
          789 }
          790 
          791 static void
          792 setmarkup(int markuptype)
          793 {
          794         if (!allowansi)
          795                 return;
          796 
          797         /* need change? */
          798         if (curmarkup == markuptype)
          799                 return;
          800 
          801         if (!markuptype) {
          802                 printansi("\033[0m"); /* reset all attributes */
          803                 curmarkup = markuptype;
          804                 return;
          805         }
          806 
          807         /* set */
          808         if (!(curmarkup & MarkupBold) && (markuptype & MarkupBold))
          809                 printansi("\033[1m");
          810         if (!(curmarkup & MarkupItalic) && (markuptype & MarkupItalic))
          811                 printansi("\033[3m");
          812         if (!(curmarkup & MarkupUnderline) && (markuptype & MarkupUnderline))
          813                 printansi("\033[4m");
          814         if (!(curmarkup & MarkupBlink) && (markuptype & MarkupBlink))
          815                 printansi("\033[5m");
          816         if (!(curmarkup & MarkupReverse) && (markuptype & MarkupReverse))
          817                 printansi("\033[7m");
          818         if (!(curmarkup & MarkupStrike) && (markuptype & MarkupStrike))
          819                 printansi("\033[9m");
          820 
          821         /* unset */
          822         if ((curmarkup & MarkupBold) && !(markuptype & MarkupBold))
          823                 printansi("\033[22m"); /* reset bold or faint */
          824         if ((curmarkup & MarkupItalic) && !(markuptype & MarkupItalic))
          825                 printansi("\033[23m"); /* reset italic */
          826         if ((curmarkup & MarkupUnderline) && !(markuptype & MarkupUnderline))
          827                 printansi("\033[24m"); /* reset underline */
          828         if ((curmarkup & MarkupBlink) && !(markuptype & MarkupBlink))
          829                 printansi("\033[25m"); /* reset blink */
          830         if ((curmarkup & MarkupReverse) && !(markuptype & MarkupReverse))
          831                 printansi("\033[27m"); /* reset reverse */
          832         if ((curmarkup & MarkupStrike) && !(markuptype & MarkupStrike))
          833                 printansi("\033[29m"); /* reset strike */
          834 
          835         curmarkup = markuptype;
          836 }
          837 
          838 static void
          839 startmarkup(int markuptype)
          840 {
          841         setmarkup(curmarkup | markuptype);
          842 }
          843 
          844 static void
          845 endmarkup(int markuptype)
          846 {
          847         setmarkup(curmarkup & ~markuptype);
          848 }
          849 
          850 /* rough cell width of a unicode codepoint by counting a unicode codepoint as 1
          851    cell in general.
          852    NOTE: this is of course incorrect since characters can be 2 width aswell,
          853    in the future maybe replace this with wcwidth() or similar */
          854 static int
          855 utfwidth(int c)
          856 {
          857         /* not the start of a codepoint */
          858         if ((c & 0xc0) == 0x80)
          859                 return 0;
          860         /* count TAB as 8 */
          861         if (c == '\t')
          862                 return 8;
          863         return 1;
          864 }
          865 
          866 /* write a character, handling state of repeated newlines, some HTML
          867    white-space rules, indentation and word-wrapping */
          868 static void
          869 hputchar(int c)
          870 {
          871         struct node *cur = &nodes[curnode];
          872         cur->hasdata = 1;
          873 
          874         if (c == '\n') {
          875                 /* previous line had characters, so not a repeated newline */
          876                 if (nbytesline > 0)
          877                         hadnewline = 0;
          878 
          879                 /* start a new line, no chars on this line yet */
          880                 whitespace_mode &= ~2; /* no chars on this line yet */
          881                 nbytesline = 0;
          882                 ncells = 0;
          883 
          884                 if (hadnewline)
          885                         currentnewlines++; /* repeating newlines */
          886                 hadnewline = 1;
          887         } else {
          888                 hadnewline = 0;
          889                 currentnewlines = 0;
          890         }
          891 
          892         /* skip initial/leading white-space */
          893         if (ISSPACE((unsigned char)c)) {
          894                 if (skipinitialws)
          895                         return;
          896         } else {
          897                 skipinitialws = 0;
          898         }
          899 
          900         if (!(c == '\n' || c == '\t' || !ISCNTRL((unsigned char)c)))
          901                 return;
          902 
          903         if (!linewrap) {
          904                 if (c == '\n') {
          905                         putchar('\n');
          906                         nbytesline = 0;
          907                         ncells = 0;
          908                 } else {
          909                         if (!nbytesline) {
          910                                 if (curmarkup)
          911                                         emitmarkup(0);
          912                                 rindent();
          913                                 /* emit code again per line, needed for GNU/less -R */
          914                                 if (curmarkup)
          915                                         emitmarkup(curmarkup);
          916                         }
          917                         putchar(c);
          918                         nbytesline++;
          919                         ncells += utfwidth(c);
          920                 }
          921                 return;
          922         }
          923 
          924         /* really too long: the whole word doesn't even fit, flush it */
          925         if (ncells + rnbufcells >= termwidth || rbuflen >= sizeof(rbuf) - 1) {
          926                 putchar('\n');
          927                 nbytesline = 0;
          928                 ncells = 0;
          929                 hflush();
          930         }
          931 
          932         if (c == '\n') {
          933                 putchar('\n');
          934                 hflush();
          935                 return;
          936         } else if (ISSPACE((unsigned char)c) || c == '-') {
          937                 if (ncells + rnbufcells >= termwidth) {
          938                         putchar('\n');
          939                         nbytesline = 0;
          940                         ncells = 0;
          941                 }
          942                 rbuf[rbuflen++] = c;
          943                 rnbufcells += utfwidth(c);
          944                 hflush();
          945                 return;
          946         }
          947 
          948         rbuf[rbuflen++] = c;
          949         rnbufcells += utfwidth(c);
          950 }
          951 
          952 /* calculate indentation of current node depth, using the sum of each
          953    indentation per node */
          954 static int
          955 calcindent(void)
          956 {
          957         int i, n = 0;
          958 
          959         for (i = curnode; i >= 0; i--)
          960                 n += nodes[i].indent;
          961 
          962         return n;
          963 }
          964 
          965 static void
          966 hprint(const char *s)
          967 {
          968         for (; *s; ++s)
          969                 hputchar(*s);
          970 }
          971 
          972 /* printf(), max 256 bytes for now */
          973 static void
          974 hprintf(const char *fmt, ...)
          975 {
          976         va_list ap;
          977         char buf[256];
          978 
          979         va_start(ap, fmt);
          980         vsnprintf(buf, sizeof(buf), fmt, ap);
          981         va_end(ap);
          982 
          983         /* use hprint() formatting logic. */
          984         hprint(buf);
          985 }
          986 
          987 static void
          988 newline(void)
          989 {
          990         if (skipinitialws)
          991                 return;
          992         hputchar('\n');
          993 }
          994 
          995 static int
          996 parentcontainerhasdata(int curtype, int n)
          997 {
          998         int i;
          999 
         1000         for (i = n; i >= 0; i--) {
         1001                 if (nodes[i].tag.displaytype & (DisplayList|DisplayTable))
         1002                         break;
         1003                 if (nodes[i].hasdata)
         1004                         return 1;
         1005         }
         1006 
         1007         return 0;
         1008 }
         1009 
         1010 /* start on a newline for the start of a block element or not */
         1011 static void
         1012 startblock(void)
         1013 {
         1014         hflush();
         1015         whitespace_mode &= ~2; /* no characters on this line yet */
         1016         if (nbytesline <= 0)
         1017                 return;
         1018         if (!hadnewline && curnode >= 0 && nodes[curnode - 1].hasdata)
         1019                 hputchar('\n');
         1020 }
         1021 
         1022 /* start on a newline for the end of a block element or not */
         1023 static void
         1024 endblock(void)
         1025 {
         1026         hflush();
         1027         whitespace_mode &= ~2; /* no characters on this line yet */
         1028         if (nbytesline <= 0)
         1029                 return;
         1030         if (!hadnewline)
         1031                 hputchar('\n');
         1032 }
         1033 
         1034 /* print one character safely: no control characters,
         1035    handle HTML white-space rules */
         1036 static void
         1037 printc(int c)
         1038 {
         1039         if (ISSPACE((unsigned char)c)) {
         1040                 if (whitespace_mode == 2)
         1041                         hputchar(' ');
         1042                 whitespace_mode |= 1;
         1043         } else {
         1044                 whitespace_mode = 2;
         1045                 if (!ISCNTRL((unsigned char)c))
         1046                         hputchar(c);
         1047         }
         1048 }
         1049 
         1050 static void
         1051 printpre(const char *s, size_t len)
         1052 {
         1053         struct node *cur;
         1054         size_t i;
         1055 
         1056         /* reset state of newlines because this data is printed literally */
         1057         hadnewline = 0;
         1058         currentnewlines = 0;
         1059 
         1060         /* skip leading newline */
         1061         i = 0;
         1062         if (skipinitialws) {
         1063                 if (*s == '\n' && i < len) {
         1064                         s++;
         1065                         i++;
         1066                 }
         1067         }
         1068 
         1069         hflush();
         1070 
         1071         skipinitialws = 0;
         1072 
         1073         if (*s) {
         1074                 cur = &nodes[curnode];
         1075                 cur->hasdata = 1;
         1076         }
         1077 
         1078         for (; *s && i < len; s++, i++) {
         1079                 switch (*s) {
         1080                 case '\n':
         1081                         putchar('\n');
         1082                         nbytesline = 0;
         1083                         ncells = 0;
         1084                         break;
         1085                 case '\t':
         1086                         hadnewline = 0;
         1087                         if (!nbytesline) {
         1088                                 if (curmarkup)
         1089                                         emitmarkup(0);
         1090                                 rindent();
         1091                                 /* emit code again per line, needed for GNU/less -R */
         1092                                 if (curmarkup)
         1093                                         emitmarkup(curmarkup);
         1094                         }
         1095 
         1096                         /* TAB to 8 spaces */
         1097                         fputs("        ", stdout);
         1098                         nbytesline += 8;
         1099                         ncells += 8;
         1100                         break;
         1101                 default:
         1102                         if (ISCNTRL((unsigned char)*s))
         1103                                 continue;
         1104 
         1105                         if (!nbytesline) {
         1106                                 if (curmarkup)
         1107                                         emitmarkup(0);
         1108                                 rindent();
         1109                                 /* emit code again per line, needed for GNU/less -R */
         1110                                 if (curmarkup)
         1111                                         emitmarkup(curmarkup);
         1112                         }
         1113 
         1114                         putchar(*s);
         1115                         nbytesline++;
         1116                         /* start of rune: incorrectly assume 1 rune is 1 cell for now */
         1117                         ncells += utfwidth((unsigned char)*s);
         1118                 }
         1119         }
         1120 }
         1121 
         1122 static struct node *
         1123 findparenttype(int cur, int findtype)
         1124 {
         1125         int i;
         1126 
         1127         for (i = cur; i >= 0; i--) {
         1128                 if ((nodes[i].tag.displaytype & findtype))
         1129                         return &nodes[i];
         1130         }
         1131         return NULL;
         1132 }
         1133 
         1134 static int
         1135 isclassmatch(const char *haystack, const char *needle)
         1136 {
         1137         const char *p;
         1138         size_t needlelen;
         1139         size_t matched = 0;
         1140 
         1141         needlelen = strlen(needle);
         1142         for (p = haystack; *p; p++) {
         1143                 if (ISSPACE((unsigned char)*p)) {
         1144                         matched = 0;
         1145                         continue;
         1146                 }
         1147                 if (needle[matched] == *p)
         1148                         matched++;
         1149                 else
         1150                         matched = 0;
         1151                 if (matched == needlelen) {
         1152                         if (*(p + 1) == '\0' || ISSPACE((unsigned char)*(p + 1)))
         1153                                 return 1;
         1154                 }
         1155         }
         1156 
         1157         return 0;
         1158 }
         1159 
         1160 /* very limited CSS-like selector, supports: main, main#id, main.class,
         1161    ".class", "#id", "ul li a" */
         1162 static int
         1163 compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes)
         1164 {
         1165         int depth = 0, len;
         1166         long l;
         1167         const char *s, *start;
         1168         char tmp[256];
         1169         int nameset = 0;
         1170 
         1171         memset(&nodes[0], 0, sizeof(nodes[0]));
         1172         nodes[0].index = -1;
         1173 
         1174         s = sel;
         1175         for (; *s && ISSPACE((unsigned char)*s); s++)
         1176                 ;
         1177 
         1178         start = s;
         1179         for (; ; s++) {
         1180                 /* end of tag */
         1181                 if (!nameset &&
         1182                     (*s == '#' || *s == '.' || *s == '@' ||
         1183                      *s == '\0' || ISSPACE((unsigned char)*s))) {
         1184                         nameset = 1;
         1185                         len = s - start; /* tag name */
         1186                         if (len >= sizeof(tmp))
         1187                                 return 0;
         1188                         if (len)
         1189                                 memcpy(tmp, start, len);
         1190                         tmp[len] = '\0';
         1191 
         1192                         memcpy(nodes[depth].tagname, tmp, len + 1);
         1193                 }
         1194 
         1195                 /* end */
         1196                 if (*s == '\0' || ISSPACE((unsigned char)*s)) {
         1197                         for (; ISSPACE((unsigned char)*s); s++)
         1198                                 ;
         1199                         start = s; /* start of a new tag */
         1200                         depth++;
         1201                         if (depth >= maxnodes)
         1202                                 return 0;
         1203 
         1204                         nameset = 0;
         1205                         memset(&nodes[depth], 0, sizeof(nodes[depth]));
         1206                         nodes[depth].index = -1;
         1207 
         1208                         /* end of selector */
         1209                         if (*s == '\0')
         1210                                 break;
         1211                 }
         1212 
         1213                 /* index */
         1214                 if (*s == '@') {
         1215                         len = strcspn(s + 1, ".#@ \t\n");
         1216                         if (len >= sizeof(tmp))
         1217                                 return 0;
         1218                         memcpy(tmp, s + 1, len);
         1219                         tmp[len] = '\0';
         1220 
         1221                         l = strtol(tmp, NULL, 10);
         1222                         if (l >= 0)
         1223                                 nodes[depth].index = l;
         1224                         s += len;
         1225                         start = s + 1;
         1226                         continue;
         1227                 }
         1228 
         1229                 /* id */
         1230                 if (*s == '#') {
         1231                         len = strcspn(s + 1, ".#@ \t\n");
         1232                         if (len >= sizeof(tmp))
         1233                                 return 0;
         1234                         memcpy(tmp, s + 1, len);
         1235                         tmp[len] = '\0';
         1236                         memcpy(nodes[depth].id, tmp, len + 1);
         1237                         s += len;
         1238                         start = s + 1;
         1239                         continue;
         1240                 }
         1241 
         1242                 /* class */
         1243                 if (*s == '.') {
         1244                         len = strcspn(s + 1, ".#@ \t\n");
         1245                         if (len >= sizeof(tmp))
         1246                                 return 0;
         1247                         memcpy(tmp, s + 1, len);
         1248                         tmp[len] = '\0';
         1249                         /* allow only one classname for now */
         1250                         memcpy(nodes[depth].classnames, tmp, len + 1);
         1251                         s += len;
         1252                         start = s + 1;
         1253                         continue;
         1254                 }
         1255         }
         1256 
         1257         return depth;
         1258 }
         1259 
         1260 static struct selector *
         1261 newselector(const char *q)
         1262 {
         1263         struct selector *sel;
         1264         int r;
         1265 
         1266         sel = ecalloc(1, sizeof(*sel));
         1267         sel->text = estrdup(q);
         1268 
         1269         r = compileselector(sel->text, sel->nodes, LEN(sel->nodes));
         1270         if (r <= 0) {
         1271                 free(sel);
         1272                 return NULL;
         1273         }
         1274         sel->depth = r;
         1275 
         1276         return sel;
         1277 }
         1278 
         1279 static struct selectors *
         1280 compileselectors(const char *q)
         1281 {
         1282         struct selectors *sels = NULL;
         1283         struct selector *sel;
         1284         const char *start;
         1285         char *qe;
         1286         int count = 0;
         1287         size_t siz;
         1288 
         1289         sels = ecalloc(1, sizeof(*sels));
         1290 
         1291         start = q;
         1292         for (; ; q++) {
         1293                 if (*q == ',' || *q == '\0') {
         1294                         qe = estrndup(start, q - start);
         1295                         sel = newselector(qe);
         1296                         free(qe);
         1297 
         1298                         /* add new selector */
         1299                         siz = (count + 1) * sizeof(struct selector *);
         1300                         sels->selectors = erealloc(sels->selectors, siz);
         1301                         sels->selectors[count] = sel;
         1302                         count++;
         1303 
         1304                         if (*q == '\0')
         1305                                 break;
         1306                         start = q + 1;
         1307                 }
         1308         }
         1309         sels->count = count;
         1310 
         1311         return sels;
         1312 }
         1313 
         1314 /* very limited CSS-like matcher, supports: main, main#id, main.class,
         1315    ".class", "#id", "ul li a" */
         1316 static int
         1317 iscssmatch(struct selector *sel, struct node *root, int maxdepth)
         1318 {
         1319         int d, md = 0;
         1320 
         1321         for (d = 0; d <= maxdepth; d++) {
         1322                 /* tag matched? */
         1323                 if (sel->nodes[md].tagname[0] &&
         1324                     strcasecmp(sel->nodes[md].tagname, root[d].tagname))
         1325                         continue; /* no */
         1326 
         1327                 /* id matched? */
         1328                 if (sel->nodes[md].id[0] && strcmp(sel->nodes[md].id, root[d].id))
         1329                         continue; /* no */
         1330 
         1331                 /* class matched, for now allow only one classname in the selector,
         1332                    matching multiple classnames */
         1333                 if (sel->nodes[md].classnames[0] &&
         1334                     !isclassmatch(root[d].classnames, sel->nodes[md].classnames))
         1335                         continue; /* no */
         1336 
         1337                 /* index matched */
         1338                 if (sel->nodes[md].index != -1 &&
         1339                     (d == 0 ||
         1340                     root[d - 1].nchildren == 0 ||
         1341                     sel->nodes[md].index != root[d - 1].nchildren - 1))
         1342                         continue;
         1343 
         1344                 md++;
         1345                 /* all matched of one selector */
         1346                 if (md == sel->depth)
         1347                         return 1;
         1348         }
         1349 
         1350         return 0;
         1351 }
         1352 
         1353 static int
         1354 iscssmatchany(struct selectors *sels, struct node *root, int maxdepth)
         1355 {
         1356         struct selector *sel;
         1357         int i;
         1358 
         1359         for (i = 0; i < sels->count; i++) {
         1360                 sel = sels->selectors[i];
         1361                 if (iscssmatch(sel, root, maxdepth))
         1362                         return 1;
         1363         }
         1364         return 0;
         1365 }
         1366 
         1367 static void
         1368 handleinlinealt(void)
         1369 {
         1370         struct node *cur;
         1371         char *start, *s, *e;
         1372 
         1373         /* do not show the alt text if the element is hidden */
         1374         cur = &nodes[curnode];
         1375         if (cur->tag.displaytype & DisplayNone)
         1376                 return;
         1377 
         1378         /* show img alt attribute as text. */
         1379         if (attr_alt.len) {
         1380                 start = attr_alt.data;
         1381                 e = attr_alt.data + attr_alt.len;
         1382 
         1383                 for (s = start; s < e; s++)
         1384                         printc((unsigned char)*s);
         1385                 hflush();
         1386         } else if (cur->tag.id == TagImg && !showurlinline) {
         1387                 /* if there is no alt text and no URL is shown inline, then
         1388                    show "[IMG]" to indicate there was an image there */
         1389                 hprint("[IMG]");
         1390         }
         1391 }
         1392 
         1393 /* lookup a link reference by url in the red-black tree */
         1394 static struct linkref *
         1395 findlinkref(const char *url)
         1396 {
         1397         struct linkref find;
         1398 
         1399         find.url = (char *)url;
         1400 
         1401         return RB_FIND(linkreftree, &linkrefhead, &find);
         1402 }
         1403 
         1404 /* add a link reference. Returns the added link reference, or the existing link
         1405    reference if links are deduplicated */
         1406 static struct linkref *
         1407 addlinkref(const char *url, const char *_type, enum TagId tagid, int ishidden)
         1408 {
         1409         struct linkref *link;
         1410         size_t linknr;
         1411 
         1412         /* if links are deduplicates return the existing link */
         1413         if (uniqrefs && (link = findlinkref(url)))
         1414                 return link;
         1415 
         1416         if (tagid == TagA)
         1417                 _type = "link";
         1418 
         1419         link = ecalloc(1, sizeof(*link));
         1420 
         1421         if (!ishidden) {
         1422                 linknr = ++nvisrefs;
         1423                 if (nvisrefs >= ncapvisrefs)
         1424                         ncapvisrefs += 256; /* greedy alloc */
         1425                 visrefs = erealloc(visrefs, sizeof(*visrefs) * ncapvisrefs);
         1426                 visrefs[linknr - 1] = link; /* add pointer to list */
         1427         } else {
         1428                 linknr = ++nhiddenrefs;
         1429                 if (nhiddenrefs >= ncaphiddenrefs)
         1430                         ncaphiddenrefs += 256; /* greedy alloc */
         1431                 hiddenrefs = erealloc(hiddenrefs, sizeof(*hiddenrefs) * ncaphiddenrefs);
         1432                 hiddenrefs[linknr - 1] = link; /* add pointer to list */
         1433         }
         1434 
         1435         link->url = estrdup(url);
         1436         link->type = estrdup(_type);
         1437         link->tagid = tagid;
         1438         link->ishidden = ishidden;
         1439         link->linknr = linknr;
         1440 
         1441         /* add to tree: the tree is only used for checking unique link references */
         1442         if (uniqrefs)
         1443                 RB_INSERT(linkreftree, &linkrefhead, link);
         1444 
         1445         return link;
         1446 }
         1447 
         1448 static void
         1449 handleinlinelink(void)
         1450 {
         1451         struct uri newuri, olduri;
         1452         struct node *cur;
         1453         char buf[4096], *url;
         1454         int r;
         1455 
         1456         if (!showrefbottom && !showrefinline && !showurlinline && !resources)
         1457                 return; /* there is no need to collect the reference */
         1458 
         1459         if (!attr_href.len && !attr_src.len && !attr_data.len)
         1460                 return; /* there is no reference */
         1461 
         1462         /* by default use the original URL */
         1463         if (attr_src.len)
         1464                 url = attr_src.data;
         1465         else if (attr_href.len)
         1466                 url = attr_href.data;
         1467         else
         1468                 url = attr_data.data;
         1469 
         1470         if (!url)
         1471                 return;
         1472 
         1473         /* Not an absolute URL yet: try to make it absolute.
         1474            If it is not possible use the relative URL */
         1475         if (!uri_hasscheme(url) && basehrefset &&
         1476             uri_parse(url, &olduri) != -1 &&
         1477             uri_makeabs(&newuri, &olduri, &base) != -1 &&
         1478             newuri.proto[0]) {
         1479                 r = uri_format(buf, sizeof(buf), &newuri);
         1480                 if (r >= 0 && (size_t)r < sizeof(buf))
         1481                         url = buf;
         1482         }
         1483 
         1484         if (!url[0])
         1485                 return;
         1486 
         1487         cur = &nodes[curnode];
         1488 
         1489         if (!(cur->tag.displaytype & DisplayNone)) {
         1490                 string_clear(&nodes_links[curnode]);
         1491                 string_append(&nodes_links[curnode], url, strlen(url));
         1492         }
         1493 
         1494         /* add hidden links directly to the reference,
         1495            the order doesn't matter */
         1496         if (cur->tag.displaytype & DisplayNone)
         1497                 addlinkref(url, cur->tag.name, cur->tag.id, 1);
         1498 }
         1499 
         1500 static void
         1501 printlinkrefs(void)
         1502 {
         1503         struct linkref *ref;
         1504         size_t i;
         1505 
         1506         if (!nvisrefs && !nhiddenrefs)
         1507                 return;
         1508 
         1509         if (resources) {
         1510                 for (i = 0; i < nvisrefs; i++) {
         1511                         ref = visrefs[i];
         1512                         dprintf(3, "%s\t%s\n", ref->type, ref->url);
         1513                 }
         1514                 for (i = 0; i < nhiddenrefs; i++) {
         1515                         ref = hiddenrefs[i];
         1516                         dprintf(3, "%s\t%s\n", ref->type, ref->url);
         1517                 }
         1518         }
         1519 
         1520         printf("\nReferences\n\n");
         1521 
         1522         for (i = 0; i < nvisrefs; i++) {
         1523                 ref = visrefs[i];
         1524                 printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->type);
         1525         }
         1526 
         1527         if (nhiddenrefs > 0)
         1528                 printf("\n\nHidden references\n\n");
         1529         /* hidden links don't have a link number, just count them */
         1530         for (i = 0; i < nhiddenrefs; i++) {
         1531                 ref = hiddenrefs[i];
         1532                 printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->type);
         1533         }
         1534 }
         1535 
         1536 /* size to grow node capacity (greedy) */
         1537 #define NODE_CAP_INC 256
         1538 
         1539 /* increase node depth, allocate space for nodes if needed */
         1540 static void
         1541 incnode(void)
         1542 {
         1543         size_t i;
         1544 
         1545         curnode++;
         1546 
         1547         if (curnode >= MAX_NODE_DEPTH)
         1548                 errx(1, "max node depth reached: %d", curnode);
         1549 
         1550         if (curnode >= ncapnodes) {
         1551                 nodes = erealloc(nodes, sizeof(*nodes) * (ncapnodes + NODE_CAP_INC));
         1552                 nodes_links = erealloc(nodes_links, sizeof(*nodes_links) * (ncapnodes + NODE_CAP_INC));
         1553 
         1554                 /* clear new region */
         1555                 memset(&nodes[ncapnodes], 0, sizeof(*nodes) * NODE_CAP_INC);
         1556                 memset(&nodes_links[ncapnodes], 0, sizeof(*nodes_links) * NODE_CAP_INC);
         1557 
         1558                 for (i = ncapnodes; i < ncapnodes + NODE_CAP_INC; i++) {
         1559                         nodes[i].tag.displaytype = DisplayInline;
         1560                         nodes[i].tag.name = nodes[i].tagname; /* assign to use fixed-size buffer */
         1561                 }
         1562 
         1563                 ncapnodes += NODE_CAP_INC; /* greedy alloc */
         1564         }
         1565 }
         1566 
         1567 static void
         1568 xmldatastart(XMLParser *p)
         1569 {
         1570 }
         1571 
         1572 static void
         1573 xmldataend(XMLParser *p)
         1574 {
         1575         struct node *cur;
         1576         char *start, *s, *e;
         1577 
         1578         if (!htmldata.data || !htmldata.len)
         1579                 return;
         1580 
         1581         cur = &nodes[curnode];
         1582 
         1583         if (reader_ignore || (cur->tag.displaytype & DisplayNone)) {
         1584                 /* print nothing */
         1585         } else if ((cur->tag.displaytype & DisplayPre) ||
         1586                    findparenttype(curnode - 1, DisplayPre)) {
         1587                 printpre(htmldata.data, htmldata.len);
         1588         } else {
         1589                 start = htmldata.data;
         1590                 e = htmldata.data + htmldata.len;
         1591 
         1592                 for (s = start; s < e; s++)
         1593                         printc((unsigned char)*s);
         1594         }
         1595 
         1596         string_clear(&htmldata);
         1597 }
         1598 
         1599 static void
         1600 xmldata(XMLParser *p, const char *data, size_t datalen)
         1601 {
         1602         struct node *cur;
         1603 
         1604         if (reader_ignore)
         1605                 return;
         1606 
         1607         cur = &nodes[curnode];
         1608         if (cur->tag.displaytype & DisplayNone)
         1609                 return;
         1610 
         1611         string_append(&htmldata, data, datalen);
         1612 }
         1613 
         1614 static void
         1615 xmldataentity(XMLParser *p, const char *data, size_t datalen)
         1616 {
         1617         struct node *cur;
         1618         char buf[16];
         1619         int n;
         1620 
         1621         if (reader_ignore)
         1622                 return;
         1623 
         1624         cur = &nodes[curnode];
         1625         if (cur->tag.displaytype & DisplayNone)
         1626                 return;
         1627 
         1628         n = xml_entitytostr(data, buf, sizeof(buf));
         1629         if (n > 0)
         1630                 xmldata(p, buf, (size_t)n);
         1631         else
         1632                 xmldata(p, data, datalen);
         1633 }
         1634 
         1635 static void
         1636 xmlcdatastart(XMLParser *p)
         1637 {
         1638         xmldatastart(p);
         1639 }
         1640 
         1641 static void
         1642 xmlcdataend(XMLParser *p)
         1643 {
         1644         xmldataend(p); /* treat CDATA as data */
         1645 }
         1646 
         1647 static void
         1648 xmlcdata(XMLParser *p, const char *data, size_t datalen)
         1649 {
         1650         xmldata(p, data, datalen); /* treat CDATA as data */
         1651 }
         1652 
         1653 /* lookup function to compare tag name (case-insensitive) for sort functions */
         1654 static int
         1655 findtagcmp(const void *v1, const void *v2)
         1656 {
         1657         struct tag *t1 = (struct tag *)v1;
         1658         struct tag *t2 = (struct tag *)v2;
         1659 
         1660         return strcasecmp(t1->name, t2->name);
         1661 }
         1662 
         1663 /* binary search tag by tag name */
         1664 static struct tag *
         1665 findtag(const char *t)
         1666 {
         1667         struct tag find = { 0 };
         1668 
         1669         find.name = t;
         1670 
         1671         return bsearch(&find, tags, LEN(tags), sizeof(*tags), findtagcmp);
         1672 }
         1673 
         1674 static void
         1675 handleendtag(struct tag *tag)
         1676 {
         1677         int i, marginbottom;
         1678 
         1679         if (tag->displaytype & DisplayNone)
         1680                 return;
         1681         if (reader_ignore)
         1682                 return;
         1683 
         1684         if (tag->displaytype & (DisplayButton | DisplayOption)) {
         1685                 hputchar(']');
         1686                 hflush();
         1687         }
         1688 
         1689         if (tag->displaytype & (DisplayBlock | DisplayHeader | DisplayTable | DisplayTableRow |
         1690                 DisplayList | DisplayListItem | DisplayPre)) {
         1691                 endblock(); /* break line if needed */
         1692         }
         1693 
         1694         /* when a list ends and its not inside a list add an extra bottom margin */
         1695         marginbottom = tag->marginbottom;
         1696 
         1697         if (marginbottom > 0) {
         1698                 if (tag->displaytype & DisplayList) {
         1699                         if (findparenttype(curnode - 1, DisplayList))
         1700                                 marginbottom--;
         1701                 }
         1702         }
         1703 
         1704         if (marginbottom > 0) {
         1705                 hflush();
         1706                 for (i = currentnewlines; i < marginbottom; i++) {
         1707                         putchar('\n');
         1708                         nbytesline = 0;
         1709                         ncells = 0;
         1710                         currentnewlines++;
         1711                 }
         1712                 hadnewline = 1;
         1713         }
         1714 }
         1715 
         1716 static void
         1717 endnode(struct node *cur)
         1718 {
         1719         struct linkref *ref;
         1720         int i, ishidden;
         1721 
         1722         /* set a flag indicating the element and its parent containers have data.
         1723            This is used for some formatting */
         1724         if (cur->hasdata) {
         1725                 for (i = curnode; i >= 0; i--)
         1726                         nodes[i].hasdata = 1;
         1727         }
         1728 
         1729         endmarkup(cur->tag.markuptype);
         1730 
         1731         ishidden = reader_ignore || (cur->tag.displaytype & DisplayNone);
         1732 
         1733         /* add link and show the link number in the visible order */
         1734         if (!ishidden && nodes_links[curnode].len > 0) {
         1735                 ref = addlinkref(nodes_links[curnode].data,
         1736                         cur->tag.name, cur->tag.id, ishidden);
         1737 
         1738                 if (showrefinline || showurlinline) {
         1739                         hflush();
         1740                         startmarkup(MarkupReverse);
         1741                 }
         1742 
         1743                 if (showrefinline)
         1744                         hprintf("[%zu]", ref->linknr);
         1745                 if (showurlinline) {
         1746                         if (ref->tagid == TagA)
         1747                                 hprintf("[%s]", ref->url);
         1748                         else
         1749                                 hprintf("[%s: %s]", ref->type, ref->url);
         1750                 }
         1751                 if (showrefinline || showurlinline) {
         1752                         endmarkup(MarkupReverse);
         1753                         hflush();
         1754                 }
         1755         }
         1756 
         1757         handleendtag(&(cur->tag));
         1758 }
         1759 
         1760 static void
         1761 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
         1762 {
         1763         struct tag *found, *tag;
         1764         enum TagId child, childs[16];
         1765         size_t nchilds;
         1766         int i, j, k, nchildfound, parenttype;
         1767 
         1768         /* match tag and lookup metadata */
         1769         /* ignore closing of void elements, like </br>, which is not allowed */
         1770         if ((found = findtag(t))) {
         1771                 if (!isshort && found->isvoid)
         1772                         return;
         1773         }
         1774 
         1775         /* TODO: implement more complete optional tag handling.
         1776            in reality the optional tag rules are more complex, see:
         1777            https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */
         1778 
         1779         child = 0;
         1780         nchilds = 0;
         1781         nchildfound = 0;
         1782         parenttype = 0; /* by default, seek until the root */
         1783 
         1784         if (found && found->displaytype & DisplayPre) {
         1785                 skipinitialws = 0; /* do not skip white-space, for margins */
         1786         } else if (found && found->displaytype & DisplayList) {
         1787                 childs[0] = TagLi;
         1788                 nchilds = 1;
         1789                 parenttype = DisplayList;
         1790         } else if (found && found->displaytype & DisplayTableRow) {
         1791                 childs[0] = TagTd;
         1792                 nchilds = 1;
         1793                 parenttype = DisplayTableRow;
         1794         } else if (found && found->displaytype & DisplayTable) {
         1795                 childs[0] = TagTd;
         1796                 nchilds = 1;
         1797                 parenttype = DisplayTable;
         1798         } else if (found && found->displaytype & DisplaySelect) {
         1799                 childs[0] = TagOption;
         1800                 nchilds = 1;
         1801                 parenttype = DisplaySelect;
         1802         } else if (found && found->displaytype & DisplayDl) {
         1803                 childs[0] = TagP;
         1804                 childs[1] = TagDd;
         1805                 childs[2] = TagDt;
         1806                 nchilds = 3;
         1807                 parenttype = DisplayDl;
         1808         } else if (found && found->displaytype & DisplayBlock) {
         1809                 childs[0] = TagP;
         1810                 nchilds = 1;
         1811                 parenttype = 0; /* seek until the root */
         1812         }
         1813 
         1814         if (nchilds > 0) {
         1815                 for (i = curnode; i >= 0; i--) {
         1816                         if (nchildfound)
         1817                                 break;
         1818                         if ((nodes[i].tag.displaytype & parenttype))
         1819                                 break;
         1820                         for (j = 0; j < nchilds; j++) {
         1821                                 child = childs[j];
         1822                                 if (nodes[i].tag.id == child) {
         1823                                         /* fake closing the previous tags */
         1824                                         for (k = curnode; k >= i; k--)
         1825                                                 endnode(&nodes[k]);
         1826                                         curnode = k;
         1827                                         nchildfound = 1;
         1828                                         break;
         1829                                 }
         1830                         }
         1831                 }
         1832         }
         1833 
         1834         /* if the current closing tag matches the current open tag */
         1835         if (nodes[curnode].tag.name &&
         1836             !tagcmp(nodes[curnode].tag.name, t)) {
         1837                 endnode(&nodes[curnode]);
         1838                 if (curnode)
         1839                         curnode--;
         1840         } else {
         1841                 /* ... else lookup the first matching start tag. This is also
         1842                    for handling optional closing tags */
         1843                 tag = NULL;
         1844                 for (i = curnode; i >= 0; i--) {
         1845                         if (nodes[i].tag.name &&
         1846                             !tagcmp(nodes[i].tag.name, t)) {
         1847                                 endnode(&nodes[i]);
         1848                                 curnode = i > 0 ? i - 1 : 0;
         1849                                 tag = &nodes[i].tag;
         1850                                 break;
         1851                         }
         1852                 }
         1853                 /* unmatched closing tag found */
         1854                 if (!tag && found)
         1855                         handleendtag(found);
         1856         }
         1857         indent = calcindent();
         1858 
         1859 #if 0
         1860         /* check if linewrap is enabled, but currently is disabled and needs to
         1861            be restored */
         1862         if (allowlinewrap && !linewrap) {
         1863                 tag = NULL;
         1864                 for (i = curnode; i >= 0; i--) {
         1865                         if (nodes[i].tag.id == TagTable) {
         1866                                 tag = &nodes[i].tag;
         1867                                 break;
         1868                         }
         1869                 }
         1870                 if (!tag)
         1871                         linewrap = allowlinewrap;
         1872         }
         1873 #endif
         1874 
         1875         /* restore markup of the tag we are in now */
         1876         startmarkup(nodes[curnode].tag.markuptype);
         1877 
         1878         /* check if the current node still matches the visible selector */
         1879         if (reader_mode && sel_show && !reader_ignore) {
         1880                 if (!iscssmatchany(sel_show, nodes, curnode)) {
         1881                         reader_ignore = 1;
         1882                         newline();
         1883                 }
         1884         }
         1885 }
         1886 
         1887 static void
         1888 xmltagstart(XMLParser *p, const char *t, size_t tl)
         1889 {
         1890         struct tag *found;
         1891         struct node *cur;
         1892         enum TagId tagid;
         1893         enum TagId child, childs[16];
         1894         size_t nchilds;
         1895         char *s;
         1896         int i, j, k, nchildfound, parenttype;
         1897 
         1898         cur = &nodes[curnode];
         1899 
         1900         string_clear(&attr_alt);
         1901         string_clear(&attr_checked);
         1902         string_clear(&attr_class);
         1903         attr_class_set = 0;
         1904         string_clear(&attr_data);
         1905         string_clear(&attr_href);
         1906         string_clear(&attr_id);
         1907         attr_id_set = 0;
         1908         string_clear(&attr_src);
         1909         string_clear(&attr_type);
         1910         string_clear(&attr_value);
         1911 
         1912         /* match tag and lookup metadata */
         1913         found = findtag(t);
         1914 
         1915         /* TODO: implement more complete optional tag handling.
         1916            in reality the optional tag rules are more complex, see:
         1917            https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */
         1918 
         1919         child = 0;
         1920         nchilds = 0;
         1921         nchildfound = 0;
         1922         parenttype = 0; /* by default, seek until the root */
         1923 
         1924         /* if optional tag <p> is open and a list element is found, close </p>. */
         1925         if (found && found->displaytype & DisplayList) {
         1926                 /* not inside a list */
         1927                 childs[0] = TagP;
         1928                 nchilds = 1;
         1929                 parenttype = DisplayList;
         1930         } else if (found && found->isoptional) {
         1931                 tagid = found->id;
         1932                 if (tagid == TagLi) {
         1933                         childs[0] = TagLi;
         1934                         nchilds = 1;
         1935                         parenttype = DisplayList;
         1936                 } else if (tagid == TagTd) {
         1937                         childs[0] = TagTd;
         1938                         nchilds = 1;
         1939                         parenttype = DisplayTableRow;
         1940                 } else if (tagid == TagTr) {
         1941                         childs[0] = TagTr;
         1942                         nchilds = 1;
         1943                         parenttype = DisplayTable;
         1944                 } else if (tagid == TagP) {
         1945                         childs[0] = TagP;
         1946                         nchilds = 1;
         1947                         parenttype = 0; /* seek until the root */
         1948                 } else if (tagid == TagOption) {
         1949                         childs[0] = TagOption;
         1950                         nchilds = 1;
         1951                         parenttype = DisplaySelect;
         1952                 } else if (tagid == TagDt) {
         1953                         childs[0] = TagDd;
         1954                         nchilds = 1;
         1955                         parenttype = DisplayDl;
         1956                 } else if (tagid == TagDd) {
         1957                         childs[0] = TagDd;
         1958                         childs[1] = TagDt;
         1959                         nchilds = 2;
         1960                         parenttype = DisplayDl;
         1961                 } else if (tagid == cur->tag.id) {
         1962                         /* fake closing the previous tag if it is the same and repeated */
         1963                         xmltagend(p, t, tl, 0);
         1964                 }
         1965         } else if (found && found->displaytype & DisplayBlock) {
         1966                 /* check if we have an open "<p>" tag */
         1967                 childs[0] = TagP;
         1968                 childs[1] = TagDl;
         1969                 nchilds = 2;
         1970                 parenttype = DisplayDl;
         1971         }
         1972 
         1973         if (nchilds > 0) {
         1974                 for (i = curnode; i >= 0; i--) {
         1975                         if (nchildfound)
         1976                                 break;
         1977                         if ((nodes[i].tag.displaytype & parenttype))
         1978                                 break;
         1979                         for (j = 0; j < nchilds; j++) {
         1980                                 child = childs[j];
         1981                                 if (nodes[i].tag.id == child) {
         1982                                         /* fake closing the previous tags */
         1983                                         for (k = curnode; k >= i; k--)
         1984                                                 xmltagend(p, nodes[k].tag.name, strlen(nodes[k].tag.name), 0);
         1985                                         nchildfound = 1;
         1986                                         break;
         1987                                 }
         1988                         }
         1989                 }
         1990         }
         1991 
         1992         incnode();
         1993         string_clear(&nodes_links[curnode]); /* clear possible link reference for this node */
         1994         cur = &nodes[curnode];
         1995         memset(cur, 0, sizeof(*cur)); /* clear / reset node */
         1996         /* tag defaults */
         1997         cur->tag.displaytype = DisplayInline;
         1998         cur->tag.name = cur->tagname; /* assign fixed-size buffer */
         1999         strlcpy(cur->tagname, t, sizeof(cur->tagname));
         2000 
         2001         /* force to lowercase */
         2002         for (s = cur->tagname; *s; s++)
         2003                 *s = TOLOWER((unsigned char)*s);
         2004 
         2005         /* matched tag: copy tag information to current node */
         2006         if (found)
         2007                 memcpy(&(cur->tag), found, sizeof(*found));
         2008 
         2009         /* if parent tag is hidden then hide itself too */
         2010         if (curnode > 0 && (nodes[curnode - 1].tag.displaytype & DisplayNone))
         2011                 cur->tag.displaytype |= DisplayNone;
         2012 }
         2013 
         2014 static void
         2015 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
         2016 {
         2017         struct tag *found;
         2018         enum TagId tagid;
         2019         struct node *cur, *parent;
         2020         int i, margintop;
         2021 
         2022         /* match tag and lookup metadata */
         2023         tagid = 0;
         2024         if ((found = findtag(t)))
         2025                 tagid = found->id;
         2026 
         2027         /* temporary replace the callback except the reader and end of tag
         2028            restore the context once we receive the same ignored tag in the
         2029            end tag handler */
         2030         if (tagid == TagScript) {
         2031                 ignorestate = endtag = "</script>";
         2032                 getnext = p->getnext; /* for restore */
         2033                 p->getnext = getnext_ignore;
         2034                 xmltagend(p, t, tl, 0); /* fake the call the tag was ended */
         2035                 return;
         2036         } else if (tagid == TagStyle) {
         2037                 ignorestate = endtag = "</style>";
         2038                 getnext = p->getnext; /* for restore */
         2039                 p->getnext = getnext_ignore;
         2040                 xmltagend(p, t, tl, 0); /* fake the call the tag was ended */
         2041                 return;
         2042         }
         2043 
         2044 #if 0
         2045         /* disable line-wrapping inside tables */
         2046         if (tagid == TagTable)
         2047                 linewrap = 0;
         2048 #endif
         2049 
         2050         cur = &nodes[curnode];
         2051 
         2052         /* copy attributes if set */
         2053         if (attr_id.len)
         2054                 strlcpy(cur->id, attr_id.data, sizeof(cur->id));
         2055         else
         2056                 cur->id[0] = '\0';
         2057         if (attr_class.len)
         2058                 strlcpy(cur->classnames, attr_class.data, sizeof(cur->classnames));
         2059         else
         2060                 cur->classnames[0] = '\0';
         2061 
         2062         /* parent node */
         2063         if (curnode > 0) {
         2064                 parent = &nodes[curnode - 1];
         2065                 parent->nchildren++; /* increase child node count */
         2066                 /* count visible childnodes */
         2067                 if (!(cur->tag.displaytype & DisplayNone))
         2068                         parent->visnchildren++;
         2069         } else {
         2070                 parent = NULL;
         2071         }
         2072 
         2073         if (reader_mode && sel_show && reader_ignore &&
         2074             iscssmatchany(sel_show, nodes, curnode))
         2075                 reader_ignore = 0;
         2076 
         2077         /* hide element */
         2078         if (reader_mode && sel_hide &&
         2079             iscssmatchany(sel_hide, nodes, curnode))
         2080                 cur->tag.displaytype |= DisplayNone;
         2081 
         2082         /* indent for this tag */
         2083         cur->indent = cur->tag.indent;
         2084 
         2085         if (!reader_ignore) {
         2086                 /* add link reference, print links and alt text */
         2087                 handleinlinelink();
         2088                 handleinlinealt();
         2089         }
         2090 
         2091         /* <select><option> */
         2092         if (cur->tag.displaytype & DisplayOption) {
         2093                 /* <select multiple>: show all options */
         2094                 if (parent->tag.displaytype & DisplaySelectMulti)
         2095                         cur->tag.displaytype |= DisplayBlock;
         2096                 else if (parent->nchildren > 1) /* show the first item as selected */
         2097                         cur->tag.displaytype |= DisplayNone; /* else hide */
         2098         }
         2099 
         2100         if (cur->tag.displaytype & DisplayNone)
         2101                 return;
         2102 
         2103         if (reader_ignore)
         2104                 return;
         2105 
         2106         indent = calcindent();
         2107 
         2108         if ((cur->tag.displaytype & (DisplayBlock | DisplayHeader | DisplayPre |
         2109                 DisplayTable | DisplayTableRow |
         2110                 DisplayList | DisplayListItem))) {
         2111                 startblock(); /* break line if needed */
         2112         }
         2113 
         2114         if (cur->tag.displaytype & (DisplayButton | DisplayOption)) {
         2115                 hflush();
         2116                 hputchar('[');
         2117         }
         2118 
         2119         margintop = cur->tag.margintop;
         2120         if (cur->tag.displaytype & (DisplayList)) {
         2121                 for (i = curnode - 1; i >= 0; i--) {
         2122                         if (nodes[i].tag.displaytype & DisplayList)
         2123                                 break;
         2124                         if (!(nodes[i].tag.displaytype & DisplayListItem))
         2125                                 continue;
         2126                         if (nodes[i].hasdata && margintop > 0) {
         2127                                 margintop--;
         2128                                 break;
         2129                         }
         2130                 }
         2131         } else if (cur->tag.displaytype & (DisplayBlock|DisplayTable)) {
         2132                 if (!parentcontainerhasdata(cur->tag.displaytype, curnode - 1)) {
         2133                         if (margintop > 0)
         2134                                 margintop--;
         2135                 }
         2136         }
         2137 
         2138         if (margintop > 0) {
         2139                 hflush();
         2140                 for (i = currentnewlines; i < margintop; i++) {
         2141                         putchar('\n');
         2142                         nbytesline = 0;
         2143                         ncells = 0;
         2144                         currentnewlines++;
         2145                 }
         2146                 hadnewline = 1;
         2147         }
         2148 
         2149         if (cur->tag.displaytype & DisplayPre) {
         2150                 skipinitialws = 1;
         2151         } else if (cur->tag.displaytype & DisplayTableCell) {
         2152                 if (parent && parent->visnchildren > 1)
         2153                         hputchar('\t');
         2154         } else if (cur->tag.displaytype & DisplayListItem) {
         2155                 /* find first parent node and ordered numbers or unordered */
         2156                 if (parent) {
         2157                         skipinitialws = 0;
         2158 
         2159                         /* print bullet, add columns to indentation level */
         2160                         if (parent->tag.displaytype & DisplayListOrdered) {
         2161                                 hprintf("%4zu. ", parent->nchildren);
         2162                                 cur->indent = 6;
         2163                                 indent += cur->indent; /* align to number */
         2164                         } else if (parent->tag.displaytype & DisplayList) {
         2165                                 hprint(str_bullet_item);
         2166                                 cur->indent = 2;
         2167                                 indent += 2; /* align to bullet */
         2168                         }
         2169                 }
         2170                 skipinitialws = 0;
         2171         } else if (cur->tag.displaytype & DisplayInput) {
         2172                 if (!attr_type.len) {
         2173                         hprintf("[%-15s]", attr_value.len ? attr_value.data : ""); /* default: text */
         2174                 } else if (!strcasecmp(attr_type.data, "button")) {
         2175                         hprintf("[%s]", attr_value.len ? attr_value.data : "");
         2176                 } else if (!strcasecmp(attr_type.data, "submit")) {
         2177                         hprintf("[%s]", attr_value.len ? attr_value.data : "Submit Query");
         2178                 } else if (!strcasecmp(attr_type.data, "reset")) {
         2179                         hprintf("[%s]", attr_value.len ? attr_value.data : "Reset");
         2180                 } else if (!strcasecmp(attr_type.data, "checkbox")) {
         2181                         hprintf("[%s]",
         2182                                 attr_checked.len &&
         2183                                 !strcasecmp(attr_checked.data, "checked") ? str_checkbox_checked : " ");
         2184                 } else if (!strcasecmp(attr_type.data, "radio")) {
         2185                         hprintf("[%s]",
         2186                                 attr_checked.len &&
         2187                                 !strcasecmp(attr_checked.data, "checked") ? str_radio_checked : " ");
         2188                 } else if (!strcasecmp(attr_type.data, "hidden")) {
         2189                         cur->tag.displaytype |= DisplayNone;
         2190                 } else {
         2191                         /* unrecognized / default case is text */
         2192                         hprintf("[%-15s]", attr_value.len ? attr_value.data : "");
         2193                 }
         2194         }
         2195 
         2196         startmarkup(cur->tag.markuptype);
         2197 
         2198         /* do not count data such as an item bullet as part of the data for
         2199            the node */
         2200         cur->hasdata = 0;
         2201 
         2202         if (tagid == TagHr) { /* ruler */
         2203                 i = termwidth - indent - defaultindent;
         2204                 for (; i > 0; i--)
         2205                         hprint(str_ruler);
         2206                 cur->hasdata = 1; /* treat <hr/> as data */
         2207         } else if (tagid == TagBr) {
         2208                 hflush();
         2209                 hadnewline = 0; /* forced newline */
         2210                 hputchar('\n');
         2211                 cur->hasdata = 1; /* treat <br/> as data */
         2212         }
         2213 
         2214         /* autoclose tags, such as <br>, pretend we are <br/> */
         2215         if (!isshort && cur->tag.isvoid)
         2216                 xmltagend(p, t, tl, 1); /* pretend close of short tag */
         2217 }
         2218 
         2219 static void
         2220 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n,
         2221         size_t nl, const char *v, size_t vl)
         2222 {
         2223         struct node *cur;
         2224         enum TagId tagid;
         2225 
         2226         cur = &nodes[curnode];
         2227         tagid = cur->tag.id;
         2228 
         2229         /* hide tags with attribute aria-hidden or hidden */
         2230         if (!attrcmp(n, "aria-hidden") || !attrcmp(n, "hidden"))
         2231                 cur->tag.displaytype |= DisplayNone;
         2232 
         2233         if (!attr_class_set && !attrcmp(n, "class")) /* use the first set attribute */
         2234                 string_append(&attr_class, v, vl);
         2235         else if (!attr_id_set && !attrcmp(n, "id")) /* use the first set attribute */
         2236                 string_append(&attr_id, v, vl);
         2237         else if (!attrcmp(n, "type"))
         2238                 string_append(&attr_type, v, vl);
         2239         else if (!attrcmp(n, "value"))
         2240                 string_append(&attr_value, v, vl);
         2241 
         2242         /* <base href="..." /> */
         2243         if (!basehrefset && tagid == TagBase && !attrcmp(n, "href"))
         2244                 strlcat(basehrefdoc, v, sizeof(basehrefdoc));
         2245 
         2246         if (tagid == TagA && !attrcmp(n, "href"))
         2247                 string_append(&attr_href, v, vl);
         2248 
         2249         if (tagid == TagSelect && !attrcmp(n, "multiple"))
         2250                 cur->tag.displaytype |= DisplaySelectMulti;
         2251 
         2252         if (tagid == TagObject && !attrcmp(n, "data"))
         2253                 string_append(&attr_data, v, vl);
         2254 
         2255         /* show img alt attribute as text. */
         2256         if (tagid == TagImg && !attrcmp(n, "alt"))
         2257                 string_append(&attr_alt, v, vl);
         2258 
         2259         if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked"))
         2260                 string_append(&attr_checked, v, vl);
         2261 
         2262         /* src attribute */
         2263         switch (tagid) {
         2264         case TagAudio:
         2265         case TagEmbed:
         2266         case TagFrame:
         2267         case TagIframe:
         2268         case TagImg:
         2269         case TagSource:
         2270         case TagTrack:
         2271         case TagVideo:
         2272                 if (!attrcmp(n, "src"))
         2273                         string_append(&attr_src, v, vl);
         2274                 break;
         2275         default:
         2276                 break;
         2277         }
         2278 }
         2279 
         2280 static void
         2281 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n,
         2282         size_t nl, const char *v, size_t vl)
         2283 {
         2284         char buf[16];
         2285         int len;
         2286 
         2287         len = xml_entitytostr(v, buf, sizeof(buf));
         2288         if (len > 0)
         2289                 xmlattr(p, t, tl, n, nl, buf, (size_t)len);
         2290         else
         2291                 xmlattr(p, t, tl, n, nl, v, vl);
         2292 }
         2293 
         2294 static void
         2295 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n,
         2296         size_t nl)
         2297 {
         2298         struct node *cur;
         2299         enum TagId tagid;
         2300 
         2301         cur = &nodes[curnode];
         2302         tagid = cur->tag.id;
         2303 
         2304         if (!attr_class_set && !attrcmp(n, "class"))
         2305                 attr_class_set = 1;
         2306         else if (!attr_id_set && !attrcmp(n, "id"))
         2307                 attr_id_set = 1;
         2308 
         2309         /* set base URL, if it is set it cannot be overwritten again */
         2310         if (!basehrefset && basehrefdoc[0] &&
         2311             tagid == TagBase && !attrcmp(n, "href"))
         2312                 basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : 0;
         2313 
         2314         /* if attribute checked is set but it has no value then set it to "checked" */
         2315         if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked") && !attr_checked.len)
         2316                 string_append(&attr_checked, "checked", sizeof("checked") - 1);
         2317 }
         2318 
         2319 static void
         2320 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n,
         2321         size_t nl)
         2322 {
         2323         struct node *cur;
         2324         enum TagId tagid;
         2325 
         2326         cur = &nodes[curnode];
         2327         tagid = cur->tag.id;
         2328 
         2329         if (!attrcmp(n, "alt"))
         2330                 string_clear(&attr_alt);
         2331         else if (!attrcmp(n, "checked"))
         2332                 string_clear(&attr_checked);
         2333         else if (!attr_class_set && !attrcmp(n, "class"))
         2334                 string_clear(&attr_class);
         2335         else if (!attrcmp(n, "data"))
         2336                 string_clear(&attr_data);
         2337         else if (!attrcmp(n, "href"))
         2338                 string_clear(&attr_href);
         2339         else if (!attr_id_set && !attrcmp(n, "id"))
         2340                 string_clear(&attr_id);
         2341         else if (!attrcmp(n, "src"))
         2342                 string_clear(&attr_src);
         2343         else if (!attrcmp(n, "type"))
         2344                 string_clear(&attr_type);
         2345         else if (!attrcmp(n, "value"))
         2346                 string_clear(&attr_value);
         2347 
         2348         if (basehrefdoc[0] && tagid == TagBase && !attrcmp(n, "href"))
         2349                 basehrefdoc[0] = '\0';
         2350 }
         2351 
         2352 static void
         2353 usage(void)
         2354 {
         2355         fprintf(stderr, "%s [-8adiIlrx] [-b basehref] [-s selector] [-u selector] [-w termwidth]\n", argv0);
         2356         exit(1);
         2357 }
         2358 
         2359 int
         2360 main(int argc, char **argv)
         2361 {
         2362         char *basehref;
         2363 
         2364         if (pledge("stdio", NULL) < 0)
         2365                 err(1, "pledge");
         2366 
         2367         ARGBEGIN {
         2368         case '8':
         2369                 str_bullet_item = "\xe2\x80\xa2 ";
         2370                 str_ruler = "\xe2\x94\x80"; /* symbol: "light horizontal" */
         2371                 break;
         2372         case 'a':
         2373                 allowansi = !allowansi;
         2374                 break;
         2375         case 'b':
         2376                 basehref = EARGF(usage());
         2377                 if (uri_parse(basehref, &base) == -1 ||
         2378                     !base.proto[0])
         2379                         usage();
         2380                 basehrefset = 1;
         2381                 break;
         2382         case 'd':
         2383                 uniqrefs = !uniqrefs;
         2384                 break;
         2385         case 'i':
         2386                 showrefinline = !showrefinline;
         2387                 break;
         2388         case 'I':
         2389                 showurlinline = !showurlinline;
         2390                 break;
         2391         case 'l':
         2392                 showrefbottom = !showrefbottom;
         2393                 break;
         2394         case 'r':
         2395                 allowlinewrap = !allowlinewrap;
         2396                 break;
         2397         case 's':
         2398                 sel_show = compileselectors(EARGF(usage()));
         2399                 /* switch to reader/selector mode, ignore all data except when matched */
         2400                 reader_mode = 1;
         2401                 reader_ignore = 1;
         2402                 break;
         2403         case 'u':
         2404                 sel_hide = compileselectors(EARGF(usage()));
         2405                 /* switch to reader/selector mode */
         2406                 reader_mode = 1;
         2407                 break;
         2408         case 'w':
         2409                 if ((termwidth = strtol(EARGF(usage()), NULL, 10)) < 1)
         2410                         usage();
         2411                 break;
         2412         case 'x':
         2413                 resources = !resources;
         2414                 break;
         2415         default:
         2416                 usage();
         2417         } ARGEND
         2418 
         2419         linewrap = allowlinewrap;
         2420 
         2421         /* initial nodes */
         2422         ncapnodes = NODE_CAP_INC;
         2423         nodes = ecalloc(ncapnodes, sizeof(*nodes));
         2424         nodes_links = ecalloc(ncapnodes, sizeof(*nodes_links));
         2425 
         2426         parser.xmlattrstart = xmlattrstart;
         2427         parser.xmlattr = xmlattr;
         2428         parser.xmlattrentity = xmlattrentity;
         2429         parser.xmlattrend = xmlattrend;
         2430         parser.xmlcdatastart = xmlcdatastart;
         2431         parser.xmlcdata = xmlcdata;
         2432         parser.xmlcdataend = xmlcdataend;
         2433         parser.xmldatastart = xmldatastart;
         2434         parser.xmldata = xmldata;
         2435         parser.xmldataentity = xmldataentity;
         2436         parser.xmldataend = xmldataend;
         2437         parser.xmltagstart = xmltagstart;
         2438         parser.xmltagstartparsed = xmltagstartparsed;
         2439         parser.xmltagend = xmltagend;
         2440 
         2441         parser.getnext = getchar;
         2442         xml_parse(&parser);
         2443 
         2444         hflush();
         2445         if (ncells > 0)
         2446                 newline();
         2447 
         2448         if (showrefbottom || resources)
         2449                 printlinkrefs();
         2450 
         2451         hflush();
         2452         setmarkup(0);
         2453 
         2454         return 0;
         2455 }