webdump.c - webdump - HTML to plain-text converter for webpages (HTM) git clone git://git.codemadness.org/webdump (DIR) Log (DIR) Files (DIR) Refs (DIR) README (DIR) LICENSE --- webdump.c (66805B) --- 1 #include <errno.h> 2 #include <limits.h> 3 #include <stdio.h> 4 #include <stdarg.h> 5 #include <stdlib.h> 6 #include <string.h> 7 #include <strings.h> 8 #include <unistd.h> 9 10 #include "arg.h" 11 char *argv0; 12 13 #include "tree.h" 14 #include "xml.h" 15 16 static XMLParser parser; 17 18 #ifndef __OpenBSD__ 19 #define pledge(p1,p2) 0 20 #endif 21 22 #undef strlcat 23 size_t strlcat(char *, const char *, size_t); 24 #undef strlcpy 25 size_t strlcpy(char *, const char *, size_t); 26 27 /* ctype-like macros, but always compatible with ASCII / UTF-8 */ 28 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26) 29 #define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f) 30 #define ISDIGIT(c) (((unsigned)c) - '0' < 10) 31 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5)) 32 #define TOLOWER(c) ((((unsigned)c) - 'A' < 26) ? ((c) | 32) : (c)) 33 34 #define LEN(x) (sizeof(x) / sizeof(x[0])) 35 36 /* URI */ 37 struct uri { 38 char proto[48]; /* scheme including ":" or "://" */ 39 char userinfo[256]; /* username [:password] */ 40 char host[256]; 41 char port[6]; /* numeric port */ 42 char path[1024]; 43 char query[1024]; 44 char fragment[1024]; 45 }; 46 47 /* options */ 48 static int allowansi = 0; /* (-a) allow ANSI escape codes */ 49 static int uniqrefs = 0; /* (-d) number unique references */ 50 static int showrefinline = 0; /* (-i) show link reference number inline */ 51 static int showurlinline = 0; /* (-I) show full link reference inline */ 52 static int showrefbottom = 0; /* (-l) show link references at the bottom */ 53 static int allowlinewrap = 0; /* (-r) line-wrapping */ 54 static int termwidth = 77; /* (-w) terminal width */ 55 static int resources = 0; /* (-x) write resources line-by-line to fd 3? 
/* Display behaviour of an element. The values are bit flags: a tag may combine
   several of them (for example DisplayInline | DisplayButton). */
enum DisplayType {
	DisplayUnknown     = 0,
	DisplayInline      = 1 << 0,
	DisplayInlineBlock = 1 << 1, /* unused for now */
	DisplayBlock       = 1 << 2,
	DisplayNone        = 1 << 3,
	DisplayPre         = 1 << 4,
	DisplayList        = 1 << 5,
	DisplayListOrdered = 1 << 6,
	DisplayListItem    = 1 << 7,
	DisplayTable       = 1 << 8,
	DisplayTableRow    = 1 << 9,
	DisplayTableCell   = 1 << 10,
	DisplayHeader      = 1 << 11,
	DisplayDl          = 1 << 12,
	DisplayInput       = 1 << 13,
	DisplayButton      = 1 << 14,
	DisplaySelect      = 1 << 15,
	DisplaySelectMulti = 1 << 16,
	DisplayOption      = 1 << 17
};

/* ANSI markup: bit flags which can be combined */
enum MarkupType {
	MarkupNone         = 0,
	MarkupBold         = 1 << 0,
	MarkupItalic       = 1 << 1,
	MarkupUnderline    = 1 << 2,
	MarkupBlink        = 1 << 3, /* lol */
	MarkupReverse      = 1 << 4,
	MarkupStrike       = 1 << 5
};

/* String data / memory pool */
typedef struct string {
	char   *data;   /* data */
	size_t  len;    /* string length */
	size_t  bufsiz; /* allocated size */
} String;

/* Identifier per known tag: starts at 1, so 0 is never a valid tag id */
enum TagId { TagA = 1, TagAddress, TagArea, TagArticle, TagAside, TagAudio,
	TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButton,
	TagCite, TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDetails,
	TagDfn, TagDir, TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFieldset,
	TagFigcaption, TagFigure, TagFooter, TagForm, TagFrame, TagH1, TagH2,
	TagH3, TagH4, TagH5, TagH6, TagHead, TagHeader, TagHr, TagHtml, TagI,
	TagIframe, TagImg, TagInput, TagIns, TagLabel, TagLegend, TagLi,
	TagLink, TagMain, TagMark, TagMenu, TagMeta, TagNav, TagObject, TagOl,
	TagOption, TagP, TagParam, TagPre, TagS, TagScript, TagSearch,
	TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle,
	TagSummary, TagSvg, TagTable, TagTbody, TagTd, TagTemplate,
	TagTextarea, TagTfoot, TagTh, TagThead, TagTitle, TagTr, TagTrack,
	TagU, TagUl, TagVar, TagVideo, TagWbr, TagXmp };
/* Static properties of a known tag. The member order matters: the tags table
   initializes these positionally. */
struct tag {
	const char *name;
	enum TagId id;
	enum DisplayType displaytype;
	enum MarkupType markuptype; /* ANSI markup */
	enum DisplayType parenttype; /* display type belonging to element */
	int isvoid; /* "void" element */
	int isoptional; /* optional to close tag */
	int margintop; /* newlines when the tag starts */
	int marginbottom; /* newlines after the tag ends */
	int indent; /* indent in cells */
};

/* State of one open element (one level of the node tree) */
struct node {
	char tagname[256];
	struct tag tag;
	size_t nchildren; /* child node count */
	size_t visnchildren; /* child node count which are visible */
	/* attributes */
	char id[256];
	char classnames[1024];
	int indent; /* indent per node, for formatting */
	int hasdata; /* tag contains some data, for formatting */
};

/* One compiled part of a selector, as filled in by compileselector() */
struct selectornode {
	char tagname[256];
	long index; /* index of node to match on: -1 if not matching on index */
	/* attributes */
	char id[256];
	char classnames[1024];
};

/* A compiled selector: the selector text and its parts */
struct selector {
	const char *text;
	struct selectornode nodes[32];
	int depth;
};

/* list of selectors */
struct selectors {
	struct selector **selectors;
	size_t count;
};

/* RB tree of link references */
struct linkref {
	char *type;
	enum TagId tagid;
	char *url;
	int ishidden;
	size_t linknr;
	RB_ENTRY(linkref) entry;
};

/* link references and hidden link references */
static struct linkref **visrefs;
static size_t nvisrefs, ncapvisrefs; /* visible link count / capacity */
static struct linkref **hiddenrefs;
static size_t nhiddenrefs, ncaphiddenrefs; /* hidden link count / capacity */

/* compare link by URL for link references RB-tree: the strcmp() result of
   both URLs is the ordering */
static int
linkrefcmp(struct linkref *r1, struct linkref *r2)
{
	return strcmp(r1->url, r2->url);
}

RB_HEAD(linkreftree, linkref) linkrefhead = RB_INITIALIZER(&linkrefhead);
RB_GENERATE(linkreftree, linkref, entry, linkrefcmp)
/* fixed strings used in the text output */
static const char *str_bullet_item = "* ";
static const char *str_checkbox_checked = "x";
static const char *str_ruler = "-";
static const char *str_radio_checked = "*";

/* base href, to make URLs absolute */
static char basehrefdoc[4096]; /* buffer for base href in document, if any */
static int basehrefset; /* base href set and can be used? */
static struct uri base; /* parsed current base href */

/* buffers for some attributes of the current tag */
static String attr_alt; /* alt attribute */
static String attr_checked; /* checked attribute */
static String attr_class; /* class attribute */
static int attr_class_set; /* class attribute is set already */
static String attr_data; /* data attribute */
static String attr_href; /* href attribute */
static String attr_id; /* id attribute */
static int attr_id_set; /* id attribute is set already */
static String attr_src; /* src attribute */
static String attr_type; /* type attribute */
static String attr_value; /* value attribute */

static String htmldata; /* buffered HTML data near the current tag */
/* for white-space output handling:
   1 = whitespace emitted (suppress repeated), 2 = other characters on this line
   Behaviour:
   * White-space data before non-whitespace data in tags are ignored on a line.
   * Repeated white-space are ignored: a single space (' ') is emitted.
*/
static int whitespace_mode;
static int nbytesline; /* bytes on this line */
static int ncells; /* current cell/column count */
static int hadnewline; /* count for repeated newlines */
/* flag for skipping initial white-space in tag: for HTML white-space handling */
static int skipinitialws = 1;
#define DEFAULT_INDENT 2
static const int defaultindent = DEFAULT_INDENT; /* default indent / margin */
static int indent; /* indent for the current line, in columns */
/* previous output sequential newlines, used for calculating margins between
   elements and reducing excessive newlines */
static int currentnewlines;

/* buffers for line-wrapping (buffer per word boundary) */
static char rbuf[1024];
static int rbuflen;
static int rnbufcells; /* pending cell count to add */

#define MAX_NODE_DEPTH 4096 /* absolute maximum node depth */
static struct node *nodes; /* node tree (one per level is remembered) */
static String *nodes_links; /* keep track of links per node */
static size_t ncapnodes; /* current allocated node capacity */
static int curnode; /* current node depth */

/* reader / selector mode (-s) */
static int reader_mode;
/* flag if the tags and their children should be ignored in the current context */
static int reader_ignore;

static enum MarkupType curmarkup; /* current markup state (bold, underline, etc) */
static int linewrap; /* allow linewrap in this context */

/* selector to match (for -s and -u) */
static struct selectors *sel_hide, *sel_show;

/* tags table: needs to be sorted like tagcmp(), alphabetically.
   Columns follow the member order of struct tag:
   v = isvoid, o = isoptional, b = margintop (newlines before),
   a = marginbottom (newlines after), i = indent */

/* tag, id, displaytype, markup, parent, v, o, b, a, i */
static struct tag tags[] = {
{ "a", TagA, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
{ "address", TagAddress, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "area", TagArea, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
{ "article", TagArticle, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "aside", TagAside, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "audio", TagAudio, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
{ "b", TagB, DisplayInline, MarkupBold, 0, 0, 0, 0, 0, 0 },
{ "base", TagBase, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
{ "blink", TagBlink, DisplayInline, MarkupBlink, 0, 0, 0, 0, 0, 0 },
{ "blockquote", TagBlockquote, DisplayBlock, 0, 0, 0, 0, 0, 0, 2 },
{ "body", TagBody, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "br", TagBr, 0, 0, 0, 1, 0, 0, 0, 0 },
{ "button", TagButton, DisplayInline | DisplayButton, 0, 0, 0, 0, 0, 0, 0 },
{ "cite", TagCite, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
{ "col", TagCol, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
{ "colgroup", TagColgroup, DisplayInline, 0, 0, 0, 1, 0, 0, 0 },
{ "datalist", TagDatalist, DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
{ "dd", TagDd, DisplayBlock, 0, 0, 0, 1, 0, 0, 4 },
{ "del", TagDel, DisplayInline, MarkupStrike, 0, 0, 0, 0, 0, 0 },
{ "details", TagDetails, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "dfn", TagDfn, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
{ "dir", TagDir, DisplayList, 0, 0, 0, 0, 1, 1, 2 },
{ "div", TagDiv, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "dl", TagDl, DisplayBlock | DisplayDl, 0, 0, 0, 0, 0, 0, 0 },
{ "dt", TagDt, DisplayBlock, MarkupBold, 0, 0, 1, 0, 0, 0 },
{ "em", TagEm, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
{ "embed", TagEmbed, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
{ "fieldset", TagFieldset, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "figcaption", TagFigcaption, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "figure", TagFigure, DisplayBlock, 0, 0, 0, 0, 1, 1, 4 },
{ "footer", TagFooter, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "form", TagForm, DisplayBlock, 0, 0, 0, 0, 0, 1, 0 },
{ "frame", TagFrame, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
{ "h1", TagH1, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
{ "h2", TagH2, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
{ "h3", TagH3, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
{ "h4", TagH4, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
{ "h5", TagH5, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
{ "h6", TagH6, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
{ "head", TagHead, DisplayBlock, 0, 0, 0, 1, 0, 0, 0 },
{ "header", TagHeader, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "hr", TagHr, DisplayBlock, 0, 0, 1, 0, 0, 0, 0 },
{ "html", TagHtml, DisplayBlock, 0, 0, 0, 1, 0, 0, 0 },
{ "i", TagI, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
{ "iframe", TagIframe, DisplayInline, 0, 0, 0, 0, 0, 0, 0 },
{ "img", TagImg, DisplayInline, MarkupUnderline, 0, 1, 0, 0, 0, 0 },
{ "input", TagInput, DisplayInput, 0, 0, 1, 0, 0, 0, 0 },
{ "ins", TagIns, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
{ "label", TagLabel, DisplayInline, 0, 0, 0, 0, 0, 0, 0 },
{ "legend", TagLegend, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "li", TagLi, DisplayListItem, 0, DisplayList, 0, 1, 0, 0, 0 },
{ "link", TagLink, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
{ "main", TagMain, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "mark", TagMark, DisplayInline, MarkupReverse, 0, 0, 0, 0, 0, 0 },
{ "menu", TagMenu, DisplayList, 0, 0, 0, 0, 1, 1, 2 },
{ "meta", TagMeta, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
{ "nav", TagNav, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "object", TagObject, DisplayInline, 0, 0, 0, 0, 0, 0, 0 },
{ "ol", TagOl, DisplayList | DisplayListOrdered, 0, 0, 0, 0, 1, 1, 0 },
{ "option", TagOption, DisplayInline | DisplayOption, 0, 0, 0, 1, 0, 0, 0 },
{ "p", TagP, DisplayBlock, 0, 0, 0, 1, 1, 1, 0 },
{ "param", TagParam, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
{ "pre", TagPre, DisplayPre, 0, 0, 0, 0, 1, 1, 4 },
{ "s", TagS, DisplayInline, MarkupStrike, 0, 0, 0, 0, 0, 0 },
{ "script", TagScript, DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
{ "search", TagSearch, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "section", TagSection, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "select", TagSelect, DisplayInline | DisplaySelect, 0, 0, 0, 0, 0, 0, 0 },
{ "source", TagSource, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
{ "strike", TagStrike, DisplayInline, MarkupStrike, 0, 0, 0, 0, 0, 0 },
{ "strong", TagStrong, DisplayInline, MarkupBold, 0, 0, 0, 0, 0, 0 },
{ "style", TagStyle, DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
{ "summary", TagSummary, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "svg", TagSvg, DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
{ "table", TagTable, DisplayTable, 0, 0, 0, 0, 0, 0, 0 },
{ "tbody", TagTbody, DisplayInline, 0, DisplayTable, 0, 1, 0, 0, 0 },
{ "td", TagTd, DisplayTableCell, 0, DisplayTableRow, 0, 1, 0, 0, 0 },
{ "template", TagTemplate, DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
{ "textarea", TagTextarea, DisplayInline, 0, 0, 0, 0, 0, 0, 0 },
{ "tfoot", TagTfoot, DisplayInline, 0, DisplayTable, 0, 1, 0, 0, 0 },
{ "th", TagTh, DisplayTableCell, MarkupBold, DisplayTableRow, 0, 1, 0, 0, 0 },
{ "thead", TagThead, DisplayInline, 0, DisplayTable, 0, 1, 0, 0, 0 },
{ "title", TagTitle, DisplayBlock, 0, 0, 0, 0, 0, 1, -DEFAULT_INDENT },
{ "tr", TagTr, DisplayTableRow, 0, DisplayTable, 0, 1, 0, 0, 0 },
{ "track", TagTrack, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
{ "u", TagU, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
{ "ul", TagUl, DisplayList, 0, 0, 0, 0, 1, 1, 2 },
{ "var", TagVar, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
{ "video", TagVideo, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
{ "wbr", TagWbr, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
{ "xmp", TagXmp, DisplayPre, 0, 0, 0, 0, 1, 1, 4 }
};

/* hint for compilers and static analyzers that a function exits:
   expands to nothing when the platform does not define it */
#ifndef __dead
#define __dead
#endif
348 #endif 349 350 /* print to stderr, print error message of errno and exit(). */ 351 __dead static void 352 err(int exitstatus, const char *fmt, ...) 353 { 354 va_list ap; 355 int saved_errno; 356 357 saved_errno = errno; 358 359 fputs("webdump: ", stderr); 360 if (fmt) { 361 va_start(ap, fmt); 362 vfprintf(stderr, fmt, ap); 363 va_end(ap); 364 fputs(": ", stderr); 365 } 366 fprintf(stderr, "%s\n", strerror(saved_errno)); 367 368 exit(exitstatus); 369 } 370 371 /* print to stderr and exit(). */ 372 __dead static void 373 errx(int exitstatus, const char *fmt, ...) 374 { 375 va_list ap; 376 377 fputs("webdump: ", stderr); 378 if (fmt) { 379 va_start(ap, fmt); 380 vfprintf(stderr, fmt, ap); 381 va_end(ap); 382 } 383 fputs("\n", stderr); 384 385 exit(exitstatus); 386 } 387 388 static const char *ignorestate, *endtag; 389 static int (*getnext)(void); 390 391 /* return a space for all data until some case-insensitive string occurs. This 392 is used to parse incorrect HTML/XML that contains unescaped HTML in script 393 or style tags. If you see some </script> tag in a CDATA or comment 394 section then e-mail W3C and tell them the web is too complex. */ 395 static inline int 396 getnext_ignore(void) 397 { 398 int c; 399 400 if ((c = getnext()) == EOF) 401 return EOF; 402 403 if (TOLOWER((unsigned char)c) == TOLOWER((unsigned char)*ignorestate)) { 404 ignorestate++; 405 if (*ignorestate == '\0') { 406 parser.getnext = getnext; /* restore */ 407 return ' '; 408 } 409 } else { 410 ignorestate = endtag; /* no full match: reset to beginning */ 411 } 412 413 return ' '; /* pretend there is just SPACEs */ 414 } 415 416 /* Clear string only; don't free, prevents unnecessary reallocation. 
*/ 417 static void 418 string_clear(String *s) 419 { 420 if (s->data) 421 s->data[0] = '\0'; 422 s->len = 0; 423 } 424 425 static void 426 string_buffer_realloc(String *s, size_t newlen) 427 { 428 size_t alloclen; 429 430 for (alloclen = 64; alloclen <= newlen; alloclen *= 2) 431 ; 432 if (!(s->data = realloc(s->data, alloclen))) 433 err(1, "realloc"); 434 s->bufsiz = alloclen; 435 } 436 437 static void 438 string_append(String *s, const char *data, size_t len) 439 { 440 if (!len) 441 return; 442 /* check if allocation is necesary, don't shrink buffer, 443 * should be more than bufsiz ofcourse. */ 444 if (s->len + len >= s->bufsiz) 445 string_buffer_realloc(s, s->len + len + 1); 446 memcpy(s->data + s->len, data, len); 447 s->len += len; 448 s->data[s->len] = '\0'; 449 } 450 451 static char * 452 estrdup(const char *s) 453 { 454 char *p; 455 456 if (!(p = strdup(s))) 457 err(1, "strdup"); 458 return p; 459 } 460 461 static char * 462 estrndup(const char *s, size_t n) 463 { 464 char *p; 465 466 if (!(p = strndup(s, n))) 467 err(1, "strndup"); 468 return p; 469 } 470 471 static void * 472 erealloc(void *p, size_t siz) 473 { 474 if (!(p = realloc(p, siz))) 475 err(1, "realloc"); 476 477 return p; 478 } 479 480 static void * 481 ecalloc(size_t nmemb, size_t size) 482 { 483 void *p; 484 485 if (!(p = calloc(nmemb, size))) 486 err(1, "calloc"); 487 return p; 488 } 489 490 /* check if string has a non-empty scheme / protocol part */ 491 static int 492 uri_hasscheme(const char *s) 493 { 494 const char *p = s; 495 496 for (; ISALPHA((unsigned char)*p) || ISDIGIT((unsigned char)*p) || 497 *p == '+' || *p == '-' || *p == '.'; p++) 498 ; 499 /* scheme, except if empty and starts with ":" then it is a path */ 500 return (*p == ':' && p != s); 501 } 502 503 static int 504 uri_parse(const char *s, struct uri *u) 505 { 506 const char *p = s; 507 char *endptr; 508 size_t i; 509 long l; 510 511 u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0'; 512 u->path[0] = 
u->query[0] = u->fragment[0] = '\0'; 513 514 /* protocol-relative */ 515 if (*p == '/' && *(p + 1) == '/') { 516 p += 2; /* skip "//" */ 517 goto parseauth; 518 } 519 520 /* scheme / protocol part */ 521 for (; ISALPHA((unsigned char)*p) || ISDIGIT((unsigned char)*p) || 522 *p == '+' || *p == '-' || *p == '.'; p++) 523 ; 524 /* scheme, except if empty and starts with ":" then it is a path */ 525 if (*p == ':' && p != s) { 526 if (*(p + 1) == '/' && *(p + 2) == '/') 527 p += 3; /* skip "://" */ 528 else 529 p++; /* skip ":" */ 530 531 if ((size_t)(p - s) >= sizeof(u->proto)) 532 return -1; /* protocol too long */ 533 memcpy(u->proto, s, p - s); 534 u->proto[p - s] = '\0'; 535 536 if (*(p - 1) != '/') 537 goto parsepath; 538 } else { 539 p = s; /* no scheme format, reset to start */ 540 goto parsepath; 541 } 542 543 parseauth: 544 /* userinfo (username:password) */ 545 i = strcspn(p, "@/?#"); 546 if (p[i] == '@') { 547 if (i >= sizeof(u->userinfo)) 548 return -1; /* userinfo too long */ 549 memcpy(u->userinfo, p, i); 550 u->userinfo[i] = '\0'; 551 p += i + 1; 552 } 553 554 /* IPv6 address */ 555 if (*p == '[') { 556 /* bracket not found, host too short or too long */ 557 i = strcspn(p, "]"); 558 if (p[i] != ']' || i < 3) 559 return -1; 560 i++; /* including "]" */ 561 } else { 562 /* domain / host part, skip until port, path or end. 
*/ 563 i = strcspn(p, ":/?#"); 564 } 565 if (i >= sizeof(u->host)) 566 return -1; /* host too long */ 567 memcpy(u->host, p, i); 568 u->host[i] = '\0'; 569 p += i; 570 571 /* port */ 572 if (*p == ':') { 573 p++; 574 if ((i = strcspn(p, "/?#")) >= sizeof(u->port)) 575 return -1; /* port too long */ 576 memcpy(u->port, p, i); 577 u->port[i] = '\0'; 578 /* check for valid port: range 1 - 65535, may be empty */ 579 errno = 0; 580 l = strtol(u->port, &endptr, 10); 581 if (i && (errno || *endptr || l <= 0 || l > 65535)) 582 return -1; 583 p += i; 584 } 585 586 parsepath: 587 /* path */ 588 if ((i = strcspn(p, "?#")) >= sizeof(u->path)) 589 return -1; /* path too long */ 590 memcpy(u->path, p, i); 591 u->path[i] = '\0'; 592 p += i; 593 594 /* query */ 595 if (*p == '?') { 596 p++; 597 if ((i = strcspn(p, "#")) >= sizeof(u->query)) 598 return -1; /* query too long */ 599 memcpy(u->query, p, i); 600 u->query[i] = '\0'; 601 p += i; 602 } 603 604 /* fragment */ 605 if (*p == '#') { 606 p++; 607 if ((i = strlen(p)) >= sizeof(u->fragment)) 608 return -1; /* fragment too long */ 609 memcpy(u->fragment, p, i); 610 u->fragment[i] = '\0'; 611 } 612 613 return 0; 614 } 615 616 /* Transform and try to make the URI `u` absolute using base URI `b` into `a`. 617 Follows some of the logic from "RFC 3986 - 5.2.2. Transform References". 618 Returns 0 on success, -1 on error or truncation. */ 619 static int 620 uri_makeabs(struct uri *a, struct uri *u, struct uri *b) 621 { 622 char *p; 623 int c; 624 625 strlcpy(a->fragment, u->fragment, sizeof(a->fragment)); 626 627 if (u->proto[0] || u->host[0]) { 628 strlcpy(a->proto, u->proto[0] ? 
u->proto : b->proto, sizeof(a->proto)); 629 strlcpy(a->host, u->host, sizeof(a->host)); 630 strlcpy(a->userinfo, u->userinfo, sizeof(a->userinfo)); 631 strlcpy(a->host, u->host, sizeof(a->host)); 632 strlcpy(a->port, u->port, sizeof(a->port)); 633 strlcpy(a->path, u->path, sizeof(a->path)); 634 strlcpy(a->query, u->query, sizeof(a->query)); 635 return 0; 636 } 637 638 strlcpy(a->proto, b->proto, sizeof(a->proto)); 639 strlcpy(a->host, b->host, sizeof(a->host)); 640 strlcpy(a->userinfo, b->userinfo, sizeof(a->userinfo)); 641 strlcpy(a->host, b->host, sizeof(a->host)); 642 strlcpy(a->port, b->port, sizeof(a->port)); 643 644 if (!u->path[0]) { 645 strlcpy(a->path, b->path, sizeof(a->path)); 646 } else if (u->path[0] == '/') { 647 strlcpy(a->path, u->path, sizeof(a->path)); 648 } else { 649 a->path[0] = (b->host[0] && b->path[0] != '/') ? '/' : '\0'; 650 a->path[1] = '\0'; 651 652 if ((p = strrchr(b->path, '/'))) { 653 c = *(++p); 654 *p = '\0'; /* temporary NUL-terminate */ 655 if (strlcat(a->path, b->path, sizeof(a->path)) >= sizeof(a->path)) 656 return -1; 657 *p = c; /* restore */ 658 } 659 if (strlcat(a->path, u->path, sizeof(a->path)) >= sizeof(a->path)) 660 return -1; 661 } 662 663 if (u->path[0] || u->query[0]) 664 strlcpy(a->query, u->query, sizeof(a->query)); 665 else 666 strlcpy(a->query, b->query, sizeof(a->query)); 667 668 return 0; 669 } 670 671 static int 672 uri_format(char *buf, size_t bufsiz, struct uri *u) 673 { 674 return snprintf(buf, bufsiz, "%s%s%s%s%s%s%s%s%s%s%s%s", 675 u->proto, 676 u->userinfo[0] ? u->userinfo : "", 677 u->userinfo[0] ? "@" : "", 678 u->host, 679 u->port[0] ? ":" : "", 680 u->port, 681 u->host[0] && u->path[0] && u->path[0] != '/' ? "/" : "", 682 u->path, 683 u->query[0] ? "?" : "", 684 u->query, 685 u->fragment[0] ? 
"#" : "", 686 u->fragment); 687 } 688 689 /* compare tag name (case-insensitive) */ 690 static int 691 tagcmp(const char *s1, const char *s2) 692 { 693 return strcasecmp(s1, s2); 694 } 695 696 /* compare attribute name (case-insensitive) */ 697 static int 698 attrcmp(const char *s1, const char *s2) 699 { 700 return strcasecmp(s1, s2); 701 } 702 703 static void 704 rindent(void) 705 { 706 int i, total; 707 708 total = indent + defaultindent; 709 if (total < 0) 710 total = 0; 711 for (i = 0; i < total; i++) 712 putchar(' '); 713 714 nbytesline += total; 715 ncells += total; 716 } 717 718 static void 719 emitmarkup(int markuptype) 720 { 721 if (!allowansi) 722 return; 723 724 if (!markuptype) 725 fputs("\033[0m", stdout); /* reset all attributes */ 726 727 /* set */ 728 if (markuptype & MarkupBold) 729 fputs("\033[1m", stdout); 730 if (markuptype & MarkupItalic) 731 fputs("\033[3m", stdout); 732 if (markuptype & MarkupUnderline) 733 fputs("\033[4m", stdout); 734 if (markuptype & MarkupBlink) 735 fputs("\033[5m", stdout); 736 if (markuptype & MarkupReverse) 737 fputs("\033[7m", stdout); 738 if (markuptype & MarkupStrike) 739 fputs("\033[9m", stdout); 740 } 741 742 /* flush remaining buffer (containing a word): used for word-wrap handling */ 743 static void 744 hflush(void) 745 { 746 int i; 747 748 if (!rbuflen) 749 return; 750 751 if (!nbytesline) { 752 if (curmarkup) 753 emitmarkup(0); 754 rindent(); 755 /* emit code again per line, needed for GNU/less -R */ 756 if (curmarkup) 757 emitmarkup(curmarkup); 758 } 759 760 for (i = 0; i < rbuflen; i++) 761 putchar(rbuf[i]); 762 763 nbytesline += rbuflen; 764 ncells += rnbufcells; 765 rbuflen = 0; 766 rnbufcells = 0; 767 } 768 769 static void 770 printansi(const char *s) 771 { 772 size_t len; 773 774 if (!allowansi) 775 return; 776 777 if (linewrap) { 778 len = strlen(s); 779 if (rbuflen + len + 1 >= sizeof(rbuf)) 780 hflush(); 781 if (rbuflen + len + 1 < sizeof(rbuf)) { 782 memcpy(rbuf + rbuflen, s, len); 783 rbuflen += 
len; 784 /* NOTE: nbytesline and ncells are not counted for markup */ 785 } 786 } else { 787 fputs(s, stdout); 788 } 789 } 790 791 static void 792 setmarkup(int markuptype) 793 { 794 if (!allowansi) 795 return; 796 797 /* need change? */ 798 if (curmarkup == markuptype) 799 return; 800 801 if (!markuptype) { 802 printansi("\033[0m"); /* reset all attributes */ 803 curmarkup = markuptype; 804 return; 805 } 806 807 /* set */ 808 if (!(curmarkup & MarkupBold) && (markuptype & MarkupBold)) 809 printansi("\033[1m"); 810 if (!(curmarkup & MarkupItalic) && (markuptype & MarkupItalic)) 811 printansi("\033[3m"); 812 if (!(curmarkup & MarkupUnderline) && (markuptype & MarkupUnderline)) 813 printansi("\033[4m"); 814 if (!(curmarkup & MarkupBlink) && (markuptype & MarkupBlink)) 815 printansi("\033[5m"); 816 if (!(curmarkup & MarkupReverse) && (markuptype & MarkupReverse)) 817 printansi("\033[7m"); 818 if (!(curmarkup & MarkupStrike) && (markuptype & MarkupStrike)) 819 printansi("\033[9m"); 820 821 /* unset */ 822 if ((curmarkup & MarkupBold) && !(markuptype & MarkupBold)) 823 printansi("\033[22m"); /* reset bold or faint */ 824 if ((curmarkup & MarkupItalic) && !(markuptype & MarkupItalic)) 825 printansi("\033[23m"); /* reset italic */ 826 if ((curmarkup & MarkupUnderline) && !(markuptype & MarkupUnderline)) 827 printansi("\033[24m"); /* reset underline */ 828 if ((curmarkup & MarkupBlink) && !(markuptype & MarkupBlink)) 829 printansi("\033[25m"); /* reset blink */ 830 if ((curmarkup & MarkupReverse) && !(markuptype & MarkupReverse)) 831 printansi("\033[27m"); /* reset reverse */ 832 if ((curmarkup & MarkupStrike) && !(markuptype & MarkupStrike)) 833 printansi("\033[29m"); /* reset strike */ 834 835 curmarkup = markuptype; 836 } 837 838 static void 839 startmarkup(int markuptype) 840 { 841 setmarkup(curmarkup | markuptype); 842 } 843 844 static void 845 endmarkup(int markuptype) 846 { 847 setmarkup(curmarkup & ~markuptype); 848 } 849 850 /* rough cell width of a unicode 
codepoint by counting a unicode codepoint as 1 851 cell in general. 852 NOTE: this is of course incorrect since characters can be 2 width aswell, 853 in the future maybe replace this with wcwidth() or similar */ 854 static int 855 utfwidth(int c) 856 { 857 /* not the start of a codepoint */ 858 if ((c & 0xc0) == 0x80) 859 return 0; 860 /* count TAB as 8 */ 861 if (c == '\t') 862 return 8; 863 return 1; 864 } 865 866 /* write a character, handling state of repeated newlines, some HTML 867 white-space rules, indentation and word-wrapping */ 868 static void 869 hputchar(int c) 870 { 871 struct node *cur = &nodes[curnode]; 872 cur->hasdata = 1; 873 874 if (c == '\n') { 875 /* previous line had characters, so not a repeated newline */ 876 if (nbytesline > 0) 877 hadnewline = 0; 878 879 /* start a new line, no chars on this line yet */ 880 whitespace_mode &= ~2; /* no chars on this line yet */ 881 nbytesline = 0; 882 ncells = 0; 883 884 if (hadnewline) 885 currentnewlines++; /* repeating newlines */ 886 hadnewline = 1; 887 } else { 888 hadnewline = 0; 889 currentnewlines = 0; 890 } 891 892 /* skip initial/leading white-space */ 893 if (ISSPACE((unsigned char)c)) { 894 if (skipinitialws) 895 return; 896 } else { 897 skipinitialws = 0; 898 } 899 900 if (!(c == '\n' || c == '\t' || !ISCNTRL((unsigned char)c))) 901 return; 902 903 if (!linewrap) { 904 if (c == '\n') { 905 putchar('\n'); 906 nbytesline = 0; 907 ncells = 0; 908 } else { 909 if (!nbytesline) { 910 if (curmarkup) 911 emitmarkup(0); 912 rindent(); 913 /* emit code again per line, needed for GNU/less -R */ 914 if (curmarkup) 915 emitmarkup(curmarkup); 916 } 917 putchar(c); 918 nbytesline++; 919 ncells += utfwidth(c); 920 } 921 return; 922 } 923 924 /* really too long: the whole word doesn't even fit, flush it */ 925 if (ncells + rnbufcells >= termwidth || rbuflen >= sizeof(rbuf) - 1) { 926 putchar('\n'); 927 nbytesline = 0; 928 ncells = 0; 929 hflush(); 930 } 931 932 if (c == '\n') { 933 putchar('\n'); 934 
hflush(); 935 return; 936 } else if (ISSPACE((unsigned char)c) || c == '-') { 937 if (ncells + rnbufcells >= termwidth) { 938 putchar('\n'); 939 nbytesline = 0; 940 ncells = 0; 941 } 942 rbuf[rbuflen++] = c; 943 rnbufcells += utfwidth(c); 944 hflush(); 945 return; 946 } 947 948 rbuf[rbuflen++] = c; 949 rnbufcells += utfwidth(c); 950 } 951 952 /* calculate indentation of current node depth, using the sum of each 953 indentation per node */ 954 static int 955 calcindent(void) 956 { 957 int i, n = 0; 958 959 for (i = curnode; i >= 0; i--) 960 n += nodes[i].indent; 961 962 return n; 963 } 964 965 static void 966 hprint(const char *s) 967 { 968 for (; *s; ++s) 969 hputchar(*s); 970 } 971 972 /* printf(), max 256 bytes for now */ 973 static void 974 hprintf(const char *fmt, ...) 975 { 976 va_list ap; 977 char buf[256]; 978 979 va_start(ap, fmt); 980 vsnprintf(buf, sizeof(buf), fmt, ap); 981 va_end(ap); 982 983 /* use hprint() formatting logic. */ 984 hprint(buf); 985 } 986 987 static void 988 newline(void) 989 { 990 if (skipinitialws) 991 return; 992 hputchar('\n'); 993 } 994 995 static int 996 parentcontainerhasdata(int curtype, int n) 997 { 998 int i; 999 1000 for (i = n; i >= 0; i--) { 1001 if (nodes[i].tag.displaytype & (DisplayList|DisplayTable)) 1002 break; 1003 if (nodes[i].hasdata) 1004 return 1; 1005 } 1006 1007 return 0; 1008 } 1009 1010 /* start on a newline for the start of a block element or not */ 1011 static void 1012 startblock(void) 1013 { 1014 hflush(); 1015 whitespace_mode &= ~2; /* no characters on this line yet */ 1016 if (nbytesline <= 0) 1017 return; 1018 if (!hadnewline && curnode >= 0 && nodes[curnode - 1].hasdata) 1019 hputchar('\n'); 1020 } 1021 1022 /* start on a newline for the end of a block element or not */ 1023 static void 1024 endblock(void) 1025 { 1026 hflush(); 1027 whitespace_mode &= ~2; /* no characters on this line yet */ 1028 if (nbytesline <= 0) 1029 return; 1030 if (!hadnewline) 1031 hputchar('\n'); 1032 } 1033 1034 /* print one 
character safely: no control characters, 1035 handle HTML white-space rules */ 1036 static void 1037 printc(int c) 1038 { 1039 if (ISSPACE((unsigned char)c)) { 1040 if (whitespace_mode == 2) 1041 hputchar(' '); 1042 whitespace_mode |= 1; 1043 } else { 1044 whitespace_mode = 2; 1045 if (!ISCNTRL((unsigned char)c)) 1046 hputchar(c); 1047 } 1048 } 1049 1050 static void 1051 printpre(const char *s, size_t len) 1052 { 1053 struct node *cur; 1054 size_t i; 1055 1056 /* reset state of newlines because this data is printed literally */ 1057 hadnewline = 0; 1058 currentnewlines = 0; 1059 1060 /* skip leading newline */ 1061 i = 0; 1062 if (skipinitialws) { 1063 if (*s == '\n' && i < len) { 1064 s++; 1065 i++; 1066 } 1067 } 1068 1069 hflush(); 1070 1071 skipinitialws = 0; 1072 1073 if (*s) { 1074 cur = &nodes[curnode]; 1075 cur->hasdata = 1; 1076 } 1077 1078 for (; *s && i < len; s++, i++) { 1079 switch (*s) { 1080 case '\n': 1081 putchar('\n'); 1082 nbytesline = 0; 1083 ncells = 0; 1084 break; 1085 case '\t': 1086 hadnewline = 0; 1087 if (!nbytesline) { 1088 if (curmarkup) 1089 emitmarkup(0); 1090 rindent(); 1091 /* emit code again per line, needed for GNU/less -R */ 1092 if (curmarkup) 1093 emitmarkup(curmarkup); 1094 } 1095 1096 /* TAB to 8 spaces */ 1097 fputs(" ", stdout); 1098 nbytesline += 8; 1099 ncells += 8; 1100 break; 1101 default: 1102 if (ISCNTRL((unsigned char)*s)) 1103 continue; 1104 1105 if (!nbytesline) { 1106 if (curmarkup) 1107 emitmarkup(0); 1108 rindent(); 1109 /* emit code again per line, needed for GNU/less -R */ 1110 if (curmarkup) 1111 emitmarkup(curmarkup); 1112 } 1113 1114 putchar(*s); 1115 nbytesline++; 1116 /* start of rune: incorrectly assume 1 rune is 1 cell for now */ 1117 ncells += utfwidth((unsigned char)*s); 1118 } 1119 } 1120 } 1121 1122 static struct node * 1123 findparenttype(int cur, int findtype) 1124 { 1125 int i; 1126 1127 for (i = cur; i >= 0; i--) { 1128 if ((nodes[i].tag.displaytype & findtype)) 1129 return &nodes[i]; 1130 } 1131 
return NULL; 1132 } 1133 1134 static int 1135 isclassmatch(const char *haystack, const char *needle) 1136 { 1137 const char *p; 1138 size_t needlelen; 1139 size_t matched = 0; 1140 1141 needlelen = strlen(needle); 1142 for (p = haystack; *p; p++) { 1143 if (ISSPACE((unsigned char)*p)) { 1144 matched = 0; 1145 continue; 1146 } 1147 if (needle[matched] == *p) 1148 matched++; 1149 else 1150 matched = 0; 1151 if (matched == needlelen) { 1152 if (*(p + 1) == '\0' || ISSPACE((unsigned char)*(p + 1))) 1153 return 1; 1154 } 1155 } 1156 1157 return 0; 1158 } 1159 1160 /* very limited CSS-like selector, supports: main, main#id, main.class, 1161 ".class", "#id", "ul li a" */ 1162 static int 1163 compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes) 1164 { 1165 int depth = 0, len; 1166 long l; 1167 const char *s, *start; 1168 char tmp[256]; 1169 int nameset = 0; 1170 1171 memset(&nodes[0], 0, sizeof(nodes[0])); 1172 nodes[0].index = -1; 1173 1174 s = sel; 1175 for (; *s && ISSPACE((unsigned char)*s); s++) 1176 ; 1177 1178 start = s; 1179 for (; ; s++) { 1180 /* end of tag */ 1181 if (!nameset && 1182 (*s == '#' || *s == '.' 
|| *s == '@' || 1183 *s == '\0' || ISSPACE((unsigned char)*s))) { 1184 nameset = 1; 1185 len = s - start; /* tag name */ 1186 if (len >= sizeof(tmp)) 1187 return 0; 1188 if (len) 1189 memcpy(tmp, start, len); 1190 tmp[len] = '\0'; 1191 1192 memcpy(nodes[depth].tagname, tmp, len + 1); 1193 } 1194 1195 /* end */ 1196 if (*s == '\0' || ISSPACE((unsigned char)*s)) { 1197 for (; ISSPACE((unsigned char)*s); s++) 1198 ; 1199 start = s; /* start of a new tag */ 1200 depth++; 1201 if (depth >= maxnodes) 1202 return 0; 1203 1204 nameset = 0; 1205 memset(&nodes[depth], 0, sizeof(nodes[depth])); 1206 nodes[depth].index = -1; 1207 1208 /* end of selector */ 1209 if (*s == '\0') 1210 break; 1211 } 1212 1213 /* index */ 1214 if (*s == '@') { 1215 len = strcspn(s + 1, ".#@ \t\n"); 1216 if (len >= sizeof(tmp)) 1217 return 0; 1218 memcpy(tmp, s + 1, len); 1219 tmp[len] = '\0'; 1220 1221 l = strtol(tmp, NULL, 10); 1222 if (l >= 0) 1223 nodes[depth].index = l; 1224 s += len; 1225 start = s + 1; 1226 continue; 1227 } 1228 1229 /* id */ 1230 if (*s == '#') { 1231 len = strcspn(s + 1, ".#@ \t\n"); 1232 if (len >= sizeof(tmp)) 1233 return 0; 1234 memcpy(tmp, s + 1, len); 1235 tmp[len] = '\0'; 1236 memcpy(nodes[depth].id, tmp, len + 1); 1237 s += len; 1238 start = s + 1; 1239 continue; 1240 } 1241 1242 /* class */ 1243 if (*s == '.') { 1244 len = strcspn(s + 1, ".#@ \t\n"); 1245 if (len >= sizeof(tmp)) 1246 return 0; 1247 memcpy(tmp, s + 1, len); 1248 tmp[len] = '\0'; 1249 /* allow only one classname for now */ 1250 memcpy(nodes[depth].classnames, tmp, len + 1); 1251 s += len; 1252 start = s + 1; 1253 continue; 1254 } 1255 } 1256 1257 return depth; 1258 } 1259 1260 static struct selector * 1261 newselector(const char *q) 1262 { 1263 struct selector *sel; 1264 int r; 1265 1266 sel = ecalloc(1, sizeof(*sel)); 1267 sel->text = estrdup(q); 1268 1269 r = compileselector(sel->text, sel->nodes, LEN(sel->nodes)); 1270 if (r <= 0) { 1271 free(sel); 1272 return NULL; 1273 } 1274 sel->depth = r; 1275 
1276 return sel; 1277 } 1278 1279 static struct selectors * 1280 compileselectors(const char *q) 1281 { 1282 struct selectors *sels = NULL; 1283 struct selector *sel; 1284 const char *start; 1285 char *qe; 1286 int count = 0; 1287 size_t siz; 1288 1289 sels = ecalloc(1, sizeof(*sels)); 1290 1291 start = q; 1292 for (; ; q++) { 1293 if (*q == ',' || *q == '\0') { 1294 qe = estrndup(start, q - start); 1295 sel = newselector(qe); 1296 free(qe); 1297 1298 /* add new selector */ 1299 siz = (count + 1) * sizeof(struct selector *); 1300 sels->selectors = erealloc(sels->selectors, siz); 1301 sels->selectors[count] = sel; 1302 count++; 1303 1304 if (*q == '\0') 1305 break; 1306 start = q + 1; 1307 } 1308 } 1309 sels->count = count; 1310 1311 return sels; 1312 } 1313 1314 /* very limited CSS-like matcher, supports: main, main#id, main.class, 1315 ".class", "#id", "ul li a" */ 1316 static int 1317 iscssmatch(struct selector *sel, struct node *root, int maxdepth) 1318 { 1319 int d, md = 0; 1320 1321 for (d = 0; d <= maxdepth; d++) { 1322 /* tag matched? */ 1323 if (sel->nodes[md].tagname[0] && 1324 strcasecmp(sel->nodes[md].tagname, root[d].tagname)) 1325 continue; /* no */ 1326 1327 /* id matched? 
*/ 1328 if (sel->nodes[md].id[0] && strcmp(sel->nodes[md].id, root[d].id)) 1329 continue; /* no */ 1330 1331 /* class matched, for now allow only one classname in the selector, 1332 matching multiple classnames */ 1333 if (sel->nodes[md].classnames[0] && 1334 !isclassmatch(root[d].classnames, sel->nodes[md].classnames)) 1335 continue; /* no */ 1336 1337 /* index matched */ 1338 if (sel->nodes[md].index != -1 && 1339 (d == 0 || 1340 root[d - 1].nchildren == 0 || 1341 sel->nodes[md].index != root[d - 1].nchildren - 1)) 1342 continue; 1343 1344 md++; 1345 /* all matched of one selector */ 1346 if (md == sel->depth) 1347 return 1; 1348 } 1349 1350 return 0; 1351 } 1352 1353 static int 1354 iscssmatchany(struct selectors *sels, struct node *root, int maxdepth) 1355 { 1356 struct selector *sel; 1357 int i; 1358 1359 for (i = 0; i < sels->count; i++) { 1360 sel = sels->selectors[i]; 1361 if (iscssmatch(sel, root, maxdepth)) 1362 return 1; 1363 } 1364 return 0; 1365 } 1366 1367 static void 1368 handleinlinealt(void) 1369 { 1370 struct node *cur; 1371 char *start, *s, *e; 1372 1373 /* do not show the alt text if the element is hidden */ 1374 cur = &nodes[curnode]; 1375 if (cur->tag.displaytype & DisplayNone) 1376 return; 1377 1378 /* show img alt attribute as text. */ 1379 if (attr_alt.len) { 1380 start = attr_alt.data; 1381 e = attr_alt.data + attr_alt.len; 1382 1383 for (s = start; s < e; s++) 1384 printc((unsigned char)*s); 1385 hflush(); 1386 } else if (cur->tag.id == TagImg && !showurlinline) { 1387 /* if there is no alt text and no URL is shown inline, then 1388 show "[IMG]" to indicate there was an image there */ 1389 hprint("[IMG]"); 1390 } 1391 } 1392 1393 /* lookup a link reference by url in the red-black tree */ 1394 static struct linkref * 1395 findlinkref(const char *url) 1396 { 1397 struct linkref find; 1398 1399 find.url = (char *)url; 1400 1401 return RB_FIND(linkreftree, &linkrefhead, &find); 1402 } 1403 1404 /* add a link reference. 
Returns the added link reference, or the existing link 1405 reference if links are deduplicated */ 1406 static struct linkref * 1407 addlinkref(const char *url, const char *_type, enum TagId tagid, int ishidden) 1408 { 1409 struct linkref *link; 1410 size_t linknr; 1411 1412 /* if links are deduplicates return the existing link */ 1413 if (uniqrefs && (link = findlinkref(url))) 1414 return link; 1415 1416 if (tagid == TagA) 1417 _type = "link"; 1418 1419 link = ecalloc(1, sizeof(*link)); 1420 1421 if (!ishidden) { 1422 linknr = ++nvisrefs; 1423 if (nvisrefs >= ncapvisrefs) 1424 ncapvisrefs += 256; /* greedy alloc */ 1425 visrefs = erealloc(visrefs, sizeof(*visrefs) * ncapvisrefs); 1426 visrefs[linknr - 1] = link; /* add pointer to list */ 1427 } else { 1428 linknr = ++nhiddenrefs; 1429 if (nhiddenrefs >= ncaphiddenrefs) 1430 ncaphiddenrefs += 256; /* greedy alloc */ 1431 hiddenrefs = erealloc(hiddenrefs, sizeof(*hiddenrefs) * ncaphiddenrefs); 1432 hiddenrefs[linknr - 1] = link; /* add pointer to list */ 1433 } 1434 1435 link->url = estrdup(url); 1436 link->type = estrdup(_type); 1437 link->tagid = tagid; 1438 link->ishidden = ishidden; 1439 link->linknr = linknr; 1440 1441 /* add to tree: the tree is only used for checking unique link references */ 1442 if (uniqrefs) 1443 RB_INSERT(linkreftree, &linkrefhead, link); 1444 1445 return link; 1446 } 1447 1448 static void 1449 handleinlinelink(void) 1450 { 1451 struct uri newuri, olduri; 1452 struct node *cur; 1453 char buf[4096], *url; 1454 int r; 1455 1456 if (!showrefbottom && !showrefinline && !showurlinline && !resources) 1457 return; /* there is no need to collect the reference */ 1458 1459 if (!attr_href.len && !attr_src.len && !attr_data.len) 1460 return; /* there is no reference */ 1461 1462 /* by default use the original URL */ 1463 if (attr_src.len) 1464 url = attr_src.data; 1465 else if (attr_href.len) 1466 url = attr_href.data; 1467 else 1468 url = attr_data.data; 1469 1470 if (!url) 1471 return; 1472 1473 
/* Not an absolute URL yet: try to make it absolute. 1474 If it is not possible use the relative URL */ 1475 if (!uri_hasscheme(url) && basehrefset && 1476 uri_parse(url, &olduri) != -1 && 1477 uri_makeabs(&newuri, &olduri, &base) != -1 && 1478 newuri.proto[0]) { 1479 r = uri_format(buf, sizeof(buf), &newuri); 1480 if (r >= 0 && (size_t)r < sizeof(buf)) 1481 url = buf; 1482 } 1483 1484 if (!url[0]) 1485 return; 1486 1487 cur = &nodes[curnode]; 1488 1489 if (!(cur->tag.displaytype & DisplayNone)) { 1490 string_clear(&nodes_links[curnode]); 1491 string_append(&nodes_links[curnode], url, strlen(url)); 1492 } 1493 1494 /* add hidden links directly to the reference, 1495 the order doesn't matter */ 1496 if (cur->tag.displaytype & DisplayNone) 1497 addlinkref(url, cur->tag.name, cur->tag.id, 1); 1498 } 1499 1500 static void 1501 printlinkrefs(void) 1502 { 1503 struct linkref *ref; 1504 size_t i; 1505 1506 if (!nvisrefs && !nhiddenrefs) 1507 return; 1508 1509 if (resources) { 1510 for (i = 0; i < nvisrefs; i++) { 1511 ref = visrefs[i]; 1512 dprintf(3, "%s\t%s\n", ref->type, ref->url); 1513 } 1514 for (i = 0; i < nhiddenrefs; i++) { 1515 ref = hiddenrefs[i]; 1516 dprintf(3, "%s\t%s\n", ref->type, ref->url); 1517 } 1518 } 1519 1520 printf("\nReferences\n\n"); 1521 1522 for (i = 0; i < nvisrefs; i++) { 1523 ref = visrefs[i]; 1524 printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->type); 1525 } 1526 1527 if (nhiddenrefs > 0) 1528 printf("\n\nHidden references\n\n"); 1529 /* hidden links don't have a link number, just count them */ 1530 for (i = 0; i < nhiddenrefs; i++) { 1531 ref = hiddenrefs[i]; 1532 printf(" %zu. 
%s (%s)\n", ref->linknr, ref->url, ref->type); 1533 } 1534 } 1535 1536 /* size to grow node capacity (greedy) */ 1537 #define NODE_CAP_INC 256 1538 1539 /* increase node depth, allocate space for nodes if needed */ 1540 static void 1541 incnode(void) 1542 { 1543 size_t i; 1544 1545 curnode++; 1546 1547 if (curnode >= MAX_NODE_DEPTH) 1548 errx(1, "max node depth reached: %d", curnode); 1549 1550 if (curnode >= ncapnodes) { 1551 nodes = erealloc(nodes, sizeof(*nodes) * (ncapnodes + NODE_CAP_INC)); 1552 nodes_links = erealloc(nodes_links, sizeof(*nodes_links) * (ncapnodes + NODE_CAP_INC)); 1553 1554 /* clear new region */ 1555 memset(&nodes[ncapnodes], 0, sizeof(*nodes) * NODE_CAP_INC); 1556 memset(&nodes_links[ncapnodes], 0, sizeof(*nodes_links) * NODE_CAP_INC); 1557 1558 for (i = ncapnodes; i < ncapnodes + NODE_CAP_INC; i++) { 1559 nodes[i].tag.displaytype = DisplayInline; 1560 nodes[i].tag.name = nodes[i].tagname; /* assign to use fixed-size buffer */ 1561 } 1562 1563 ncapnodes += NODE_CAP_INC; /* greedy alloc */ 1564 } 1565 } 1566 1567 static void 1568 xmldatastart(XMLParser *p) 1569 { 1570 } 1571 1572 static void 1573 xmldataend(XMLParser *p) 1574 { 1575 struct node *cur; 1576 char *start, *s, *e; 1577 1578 if (!htmldata.data || !htmldata.len) 1579 return; 1580 1581 cur = &nodes[curnode]; 1582 1583 if (reader_ignore || (cur->tag.displaytype & DisplayNone)) { 1584 /* print nothing */ 1585 } else if ((cur->tag.displaytype & DisplayPre) || 1586 findparenttype(curnode - 1, DisplayPre)) { 1587 printpre(htmldata.data, htmldata.len); 1588 } else { 1589 start = htmldata.data; 1590 e = htmldata.data + htmldata.len; 1591 1592 for (s = start; s < e; s++) 1593 printc((unsigned char)*s); 1594 } 1595 1596 string_clear(&htmldata); 1597 } 1598 1599 static void 1600 xmldata(XMLParser *p, const char *data, size_t datalen) 1601 { 1602 struct node *cur; 1603 1604 if (reader_ignore) 1605 return; 1606 1607 cur = &nodes[curnode]; 1608 if (cur->tag.displaytype & DisplayNone) 1609 
return; 1610 1611 string_append(&htmldata, data, datalen); 1612 } 1613 1614 static void 1615 xmldataentity(XMLParser *p, const char *data, size_t datalen) 1616 { 1617 struct node *cur; 1618 char buf[16]; 1619 int n; 1620 1621 if (reader_ignore) 1622 return; 1623 1624 cur = &nodes[curnode]; 1625 if (cur->tag.displaytype & DisplayNone) 1626 return; 1627 1628 n = xml_entitytostr(data, buf, sizeof(buf)); 1629 if (n > 0) 1630 xmldata(p, buf, (size_t)n); 1631 else 1632 xmldata(p, data, datalen); 1633 } 1634 1635 static void 1636 xmlcdatastart(XMLParser *p) 1637 { 1638 xmldatastart(p); 1639 } 1640 1641 static void 1642 xmlcdataend(XMLParser *p) 1643 { 1644 xmldataend(p); /* treat CDATA as data */ 1645 } 1646 1647 static void 1648 xmlcdata(XMLParser *p, const char *data, size_t datalen) 1649 { 1650 xmldata(p, data, datalen); /* treat CDATA as data */ 1651 } 1652 1653 /* lookup function to compare tag name (case-insensitive) for sort functions */ 1654 static int 1655 findtagcmp(const void *v1, const void *v2) 1656 { 1657 struct tag *t1 = (struct tag *)v1; 1658 struct tag *t2 = (struct tag *)v2; 1659 1660 return strcasecmp(t1->name, t2->name); 1661 } 1662 1663 /* binary search tag by tag name */ 1664 static struct tag * 1665 findtag(const char *t) 1666 { 1667 struct tag find = { 0 }; 1668 1669 find.name = t; 1670 1671 return bsearch(&find, tags, LEN(tags), sizeof(*tags), findtagcmp); 1672 } 1673 1674 static void 1675 handleendtag(struct tag *tag) 1676 { 1677 int i, marginbottom; 1678 1679 if (tag->displaytype & DisplayNone) 1680 return; 1681 if (reader_ignore) 1682 return; 1683 1684 if (tag->displaytype & (DisplayButton | DisplayOption)) { 1685 hputchar(']'); 1686 hflush(); 1687 } 1688 1689 if (tag->displaytype & (DisplayBlock | DisplayHeader | DisplayTable | DisplayTableRow | 1690 DisplayList | DisplayListItem | DisplayPre)) { 1691 endblock(); /* break line if needed */ 1692 } 1693 1694 /* when a list ends and its not inside a list add an extra bottom margin */ 1695 
marginbottom = tag->marginbottom; 1696 1697 if (marginbottom > 0) { 1698 if (tag->displaytype & DisplayList) { 1699 if (findparenttype(curnode - 1, DisplayList)) 1700 marginbottom--; 1701 } 1702 } 1703 1704 if (marginbottom > 0) { 1705 hflush(); 1706 for (i = currentnewlines; i < marginbottom; i++) { 1707 putchar('\n'); 1708 nbytesline = 0; 1709 ncells = 0; 1710 currentnewlines++; 1711 } 1712 hadnewline = 1; 1713 } 1714 } 1715 1716 static void 1717 endnode(struct node *cur) 1718 { 1719 struct linkref *ref; 1720 int i, ishidden; 1721 1722 /* set a flag indicating the element and its parent containers have data. 1723 This is used for some formatting */ 1724 if (cur->hasdata) { 1725 for (i = curnode; i >= 0; i--) 1726 nodes[i].hasdata = 1; 1727 } 1728 1729 endmarkup(cur->tag.markuptype); 1730 1731 ishidden = reader_ignore || (cur->tag.displaytype & DisplayNone); 1732 1733 /* add link and show the link number in the visible order */ 1734 if (!ishidden && nodes_links[curnode].len > 0) { 1735 ref = addlinkref(nodes_links[curnode].data, 1736 cur->tag.name, cur->tag.id, ishidden); 1737 1738 if (showrefinline || showurlinline) { 1739 hflush(); 1740 startmarkup(MarkupReverse); 1741 } 1742 1743 if (showrefinline) 1744 hprintf("[%zu]", ref->linknr); 1745 if (showurlinline) { 1746 if (ref->tagid == TagA) 1747 hprintf("[%s]", ref->url); 1748 else 1749 hprintf("[%s: %s]", ref->type, ref->url); 1750 } 1751 if (showrefinline || showurlinline) { 1752 endmarkup(MarkupReverse); 1753 hflush(); 1754 } 1755 } 1756 1757 handleendtag(&(cur->tag)); 1758 } 1759 1760 static void 1761 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) 1762 { 1763 struct tag *found, *tag; 1764 enum TagId child, childs[16]; 1765 size_t nchilds; 1766 int i, j, k, nchildfound, parenttype; 1767 1768 /* match tag and lookup metadata */ 1769 /* ignore closing of void elements, like </br>, which is not allowed */ 1770 if ((found = findtag(t))) { 1771 if (!isshort && found->isvoid) 1772 return; 1773 } 1774 
1775 /* TODO: implement more complete optional tag handling. 1776 in reality the optional tag rules are more complex, see: 1777 https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */ 1778 1779 child = 0; 1780 nchilds = 0; 1781 nchildfound = 0; 1782 parenttype = 0; /* by default, seek until the root */ 1783 1784 if (found && found->displaytype & DisplayPre) { 1785 skipinitialws = 0; /* do not skip white-space, for margins */ 1786 } else if (found && found->displaytype & DisplayList) { 1787 childs[0] = TagLi; 1788 nchilds = 1; 1789 parenttype = DisplayList; 1790 } else if (found && found->displaytype & DisplayTableRow) { 1791 childs[0] = TagTd; 1792 nchilds = 1; 1793 parenttype = DisplayTableRow; 1794 } else if (found && found->displaytype & DisplayTable) { 1795 childs[0] = TagTd; 1796 nchilds = 1; 1797 parenttype = DisplayTable; 1798 } else if (found && found->displaytype & DisplaySelect) { 1799 childs[0] = TagOption; 1800 nchilds = 1; 1801 parenttype = DisplaySelect; 1802 } else if (found && found->displaytype & DisplayDl) { 1803 childs[0] = TagP; 1804 childs[1] = TagDd; 1805 childs[2] = TagDt; 1806 nchilds = 3; 1807 parenttype = DisplayDl; 1808 } else if (found && found->displaytype & DisplayBlock) { 1809 childs[0] = TagP; 1810 nchilds = 1; 1811 parenttype = 0; /* seek until the root */ 1812 } 1813 1814 if (nchilds > 0) { 1815 for (i = curnode; i >= 0; i--) { 1816 if (nchildfound) 1817 break; 1818 if ((nodes[i].tag.displaytype & parenttype)) 1819 break; 1820 for (j = 0; j < nchilds; j++) { 1821 child = childs[j]; 1822 if (nodes[i].tag.id == child) { 1823 /* fake closing the previous tags */ 1824 for (k = curnode; k >= i; k--) 1825 endnode(&nodes[k]); 1826 curnode = k; 1827 nchildfound = 1; 1828 break; 1829 } 1830 } 1831 } 1832 } 1833 1834 /* if the current closing tag matches the current open tag */ 1835 if (nodes[curnode].tag.name && 1836 !tagcmp(nodes[curnode].tag.name, t)) { 1837 endnode(&nodes[curnode]); 1838 if (curnode) 1839 curnode--; 1840 } 
else { 1841 /* ... else lookup the first matching start tag. This is also 1842 for handling optional closing tags */ 1843 tag = NULL; 1844 for (i = curnode; i >= 0; i--) { 1845 if (nodes[i].tag.name && 1846 !tagcmp(nodes[i].tag.name, t)) { 1847 endnode(&nodes[i]); 1848 curnode = i > 0 ? i - 1 : 0; 1849 tag = &nodes[i].tag; 1850 break; 1851 } 1852 } 1853 /* unmatched closing tag found */ 1854 if (!tag && found) 1855 handleendtag(found); 1856 } 1857 indent = calcindent(); 1858 1859 #if 0 1860 /* check if linewrap is enabled, but currently is disabled and needs to 1861 be restored */ 1862 if (allowlinewrap && !linewrap) { 1863 tag = NULL; 1864 for (i = curnode; i >= 0; i--) { 1865 if (nodes[i].tag.id == TagTable) { 1866 tag = &nodes[i].tag; 1867 break; 1868 } 1869 } 1870 if (!tag) 1871 linewrap = allowlinewrap; 1872 } 1873 #endif 1874 1875 /* restore markup of the tag we are in now */ 1876 startmarkup(nodes[curnode].tag.markuptype); 1877 1878 /* check if the current node still matches the visible selector */ 1879 if (reader_mode && sel_show && !reader_ignore) { 1880 if (!iscssmatchany(sel_show, nodes, curnode)) { 1881 reader_ignore = 1; 1882 newline(); 1883 } 1884 } 1885 } 1886 1887 static void 1888 xmltagstart(XMLParser *p, const char *t, size_t tl) 1889 { 1890 struct tag *found; 1891 struct node *cur; 1892 enum TagId tagid; 1893 enum TagId child, childs[16]; 1894 size_t nchilds; 1895 char *s; 1896 int i, j, k, nchildfound, parenttype; 1897 1898 cur = &nodes[curnode]; 1899 1900 string_clear(&attr_alt); 1901 string_clear(&attr_checked); 1902 string_clear(&attr_class); 1903 attr_class_set = 0; 1904 string_clear(&attr_data); 1905 string_clear(&attr_href); 1906 string_clear(&attr_id); 1907 attr_id_set = 0; 1908 string_clear(&attr_src); 1909 string_clear(&attr_type); 1910 string_clear(&attr_value); 1911 1912 /* match tag and lookup metadata */ 1913 found = findtag(t); 1914 1915 /* TODO: implement more complete optional tag handling. 
1916 in reality the optional tag rules are more complex, see: 1917 https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */ 1918 1919 child = 0; 1920 nchilds = 0; 1921 nchildfound = 0; 1922 parenttype = 0; /* by default, seek until the root */ 1923 1924 /* if optional tag <p> is open and a list element is found, close </p>. */ 1925 if (found && found->displaytype & DisplayList) { 1926 /* not inside a list */ 1927 childs[0] = TagP; 1928 nchilds = 1; 1929 parenttype = DisplayList; 1930 } else if (found && found->isoptional) { 1931 tagid = found->id; 1932 if (tagid == TagLi) { 1933 childs[0] = TagLi; 1934 nchilds = 1; 1935 parenttype = DisplayList; 1936 } else if (tagid == TagTd) { 1937 childs[0] = TagTd; 1938 nchilds = 1; 1939 parenttype = DisplayTableRow; 1940 } else if (tagid == TagTr) { 1941 childs[0] = TagTr; 1942 nchilds = 1; 1943 parenttype = DisplayTable; 1944 } else if (tagid == TagP) { 1945 childs[0] = TagP; 1946 nchilds = 1; 1947 parenttype = 0; /* seek until the root */ 1948 } else if (tagid == TagOption) { 1949 childs[0] = TagOption; 1950 nchilds = 1; 1951 parenttype = DisplaySelect; 1952 } else if (tagid == TagDt) { 1953 childs[0] = TagDd; 1954 nchilds = 1; 1955 parenttype = DisplayDl; 1956 } else if (tagid == TagDd) { 1957 childs[0] = TagDd; 1958 childs[1] = TagDt; 1959 nchilds = 2; 1960 parenttype = DisplayDl; 1961 } else if (tagid == cur->tag.id) { 1962 /* fake closing the previous tag if it is the same and repeated */ 1963 xmltagend(p, t, tl, 0); 1964 } 1965 } else if (found && found->displaytype & DisplayBlock) { 1966 /* check if we have an open "<p>" tag */ 1967 childs[0] = TagP; 1968 childs[1] = TagDl; 1969 nchilds = 2; 1970 parenttype = DisplayDl; 1971 } 1972 1973 if (nchilds > 0) { 1974 for (i = curnode; i >= 0; i--) { 1975 if (nchildfound) 1976 break; 1977 if ((nodes[i].tag.displaytype & parenttype)) 1978 break; 1979 for (j = 0; j < nchilds; j++) { 1980 child = childs[j]; 1981 if (nodes[i].tag.id == child) { 1982 /* fake closing the 
previous tags */ 1983 for (k = curnode; k >= i; k--) 1984 xmltagend(p, nodes[k].tag.name, strlen(nodes[k].tag.name), 0); 1985 nchildfound = 1; 1986 break; 1987 } 1988 } 1989 } 1990 } 1991 1992 incnode(); 1993 string_clear(&nodes_links[curnode]); /* clear possible link reference for this node */ 1994 cur = &nodes[curnode]; 1995 memset(cur, 0, sizeof(*cur)); /* clear / reset node */ 1996 /* tag defaults */ 1997 cur->tag.displaytype = DisplayInline; 1998 cur->tag.name = cur->tagname; /* assign fixed-size buffer */ 1999 strlcpy(cur->tagname, t, sizeof(cur->tagname)); 2000 2001 /* force to lowercase */ 2002 for (s = cur->tagname; *s; s++) 2003 *s = TOLOWER((unsigned char)*s); 2004 2005 /* matched tag: copy tag information to current node */ 2006 if (found) 2007 memcpy(&(cur->tag), found, sizeof(*found)); 2008 2009 /* if parent tag is hidden then hide itself too */ 2010 if (curnode > 0 && (nodes[curnode - 1].tag.displaytype & DisplayNone)) 2011 cur->tag.displaytype |= DisplayNone; 2012 } 2013 2014 static void 2015 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) 2016 { 2017 struct tag *found; 2018 enum TagId tagid; 2019 struct node *cur, *parent; 2020 int i, margintop; 2021 2022 /* match tag and lookup metadata */ 2023 tagid = 0; 2024 if ((found = findtag(t))) 2025 tagid = found->id; 2026 2027 /* temporary replace the callback except the reader and end of tag 2028 restore the context once we receive the same ignored tag in the 2029 end tag handler */ 2030 if (tagid == TagScript) { 2031 ignorestate = endtag = "</script>"; 2032 getnext = p->getnext; /* for restore */ 2033 p->getnext = getnext_ignore; 2034 xmltagend(p, t, tl, 0); /* fake the call the tag was ended */ 2035 return; 2036 } else if (tagid == TagStyle) { 2037 ignorestate = endtag = "</style>"; 2038 getnext = p->getnext; /* for restore */ 2039 p->getnext = getnext_ignore; 2040 xmltagend(p, t, tl, 0); /* fake the call the tag was ended */ 2041 return; 2042 } 2043 2044 #if 0 2045 /* disable 
line-wrapping inside tables */ 2046 if (tagid == TagTable) 2047 linewrap = 0; 2048 #endif 2049 2050 cur = &nodes[curnode]; 2051 2052 /* copy attributes if set */ 2053 if (attr_id.len) 2054 strlcpy(cur->id, attr_id.data, sizeof(cur->id)); 2055 else 2056 cur->id[0] = '\0'; 2057 if (attr_class.len) 2058 strlcpy(cur->classnames, attr_class.data, sizeof(cur->classnames)); 2059 else 2060 cur->classnames[0] = '\0'; 2061 2062 /* parent node */ 2063 if (curnode > 0) { 2064 parent = &nodes[curnode - 1]; 2065 parent->nchildren++; /* increase child node count */ 2066 /* count visible childnodes */ 2067 if (!(cur->tag.displaytype & DisplayNone)) 2068 parent->visnchildren++; 2069 } else { 2070 parent = NULL; 2071 } 2072 2073 if (reader_mode && sel_show && reader_ignore && 2074 iscssmatchany(sel_show, nodes, curnode)) 2075 reader_ignore = 0; 2076 2077 /* hide element */ 2078 if (reader_mode && sel_hide && 2079 iscssmatchany(sel_hide, nodes, curnode)) 2080 cur->tag.displaytype |= DisplayNone; 2081 2082 /* indent for this tag */ 2083 cur->indent = cur->tag.indent; 2084 2085 if (!reader_ignore) { 2086 /* add link reference, print links and alt text */ 2087 handleinlinelink(); 2088 handleinlinealt(); 2089 } 2090 2091 /* <select><option> */ 2092 if (cur->tag.displaytype & DisplayOption) { 2093 /* <select multiple>: show all options */ 2094 if (parent->tag.displaytype & DisplaySelectMulti) 2095 cur->tag.displaytype |= DisplayBlock; 2096 else if (parent->nchildren > 1) /* show the first item as selected */ 2097 cur->tag.displaytype |= DisplayNone; /* else hide */ 2098 } 2099 2100 if (cur->tag.displaytype & DisplayNone) 2101 return; 2102 2103 if (reader_ignore) 2104 return; 2105 2106 indent = calcindent(); 2107 2108 if ((cur->tag.displaytype & (DisplayBlock | DisplayHeader | DisplayPre | 2109 DisplayTable | DisplayTableRow | 2110 DisplayList | DisplayListItem))) { 2111 startblock(); /* break line if needed */ 2112 } 2113 2114 if (cur->tag.displaytype & (DisplayButton | DisplayOption)) { 
2115 hflush(); 2116 hputchar('['); 2117 } 2118 2119 margintop = cur->tag.margintop; 2120 if (cur->tag.displaytype & (DisplayList)) { 2121 for (i = curnode - 1; i >= 0; i--) { 2122 if (nodes[i].tag.displaytype & DisplayList) 2123 break; 2124 if (!(nodes[i].tag.displaytype & DisplayListItem)) 2125 continue; 2126 if (nodes[i].hasdata && margintop > 0) { 2127 margintop--; 2128 break; 2129 } 2130 } 2131 } else if (cur->tag.displaytype & (DisplayBlock|DisplayTable)) { 2132 if (!parentcontainerhasdata(cur->tag.displaytype, curnode - 1)) { 2133 if (margintop > 0) 2134 margintop--; 2135 } 2136 } 2137 2138 if (margintop > 0) { 2139 hflush(); 2140 for (i = currentnewlines; i < margintop; i++) { 2141 putchar('\n'); 2142 nbytesline = 0; 2143 ncells = 0; 2144 currentnewlines++; 2145 } 2146 hadnewline = 1; 2147 } 2148 2149 if (cur->tag.displaytype & DisplayPre) { 2150 skipinitialws = 1; 2151 } else if (cur->tag.displaytype & DisplayTableCell) { 2152 if (parent && parent->visnchildren > 1) 2153 hputchar('\t'); 2154 } else if (cur->tag.displaytype & DisplayListItem) { 2155 /* find first parent node and ordered numbers or unordered */ 2156 if (parent) { 2157 skipinitialws = 0; 2158 2159 /* print bullet, add columns to indentation level */ 2160 if (parent->tag.displaytype & DisplayListOrdered) { 2161 hprintf("%4zu. ", parent->nchildren); 2162 cur->indent = 6; 2163 indent += cur->indent; /* align to number */ 2164 } else if (parent->tag.displaytype & DisplayList) { 2165 hprint(str_bullet_item); 2166 cur->indent = 2; 2167 indent += 2; /* align to bullet */ 2168 } 2169 } 2170 skipinitialws = 0; 2171 } else if (cur->tag.displaytype & DisplayInput) { 2172 if (!attr_type.len) { 2173 hprintf("[%-15s]", attr_value.len ? attr_value.data : ""); /* default: text */ 2174 } else if (!strcasecmp(attr_type.data, "button")) { 2175 hprintf("[%s]", attr_value.len ? attr_value.data : ""); 2176 } else if (!strcasecmp(attr_type.data, "submit")) { 2177 hprintf("[%s]", attr_value.len ? 
attr_value.data : "Submit Query"); 2178 } else if (!strcasecmp(attr_type.data, "reset")) { 2179 hprintf("[%s]", attr_value.len ? attr_value.data : "Reset"); 2180 } else if (!strcasecmp(attr_type.data, "checkbox")) { 2181 hprintf("[%s]", 2182 attr_checked.len && 2183 !strcasecmp(attr_checked.data, "checked") ? str_checkbox_checked : " "); 2184 } else if (!strcasecmp(attr_type.data, "radio")) { 2185 hprintf("[%s]", 2186 attr_checked.len && 2187 !strcasecmp(attr_checked.data, "checked") ? str_radio_checked : " "); 2188 } else if (!strcasecmp(attr_type.data, "hidden")) { 2189 cur->tag.displaytype |= DisplayNone; 2190 } else { 2191 /* unrecognized / default case is text */ 2192 hprintf("[%-15s]", attr_value.len ? attr_value.data : ""); 2193 } 2194 } 2195 2196 startmarkup(cur->tag.markuptype); 2197 2198 /* do not count data such as an item bullet as part of the data for 2199 the node */ 2200 cur->hasdata = 0; 2201 2202 if (tagid == TagHr) { /* ruler */ 2203 i = termwidth - indent - defaultindent; 2204 for (; i > 0; i--) 2205 hprint(str_ruler); 2206 cur->hasdata = 1; /* treat <hr/> as data */ 2207 } else if (tagid == TagBr) { 2208 hflush(); 2209 hadnewline = 0; /* forced newline */ 2210 hputchar('\n'); 2211 cur->hasdata = 1; /* treat <br/> as data */ 2212 } 2213 2214 /* autoclose tags, such as <br>, pretend we are <br/> */ 2215 if (!isshort && cur->tag.isvoid) 2216 xmltagend(p, t, tl, 1); /* pretend close of short tag */ 2217 } 2218 2219 static void 2220 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, 2221 size_t nl, const char *v, size_t vl) 2222 { 2223 struct node *cur; 2224 enum TagId tagid; 2225 2226 cur = &nodes[curnode]; 2227 tagid = cur->tag.id; 2228 2229 /* hide tags with attribute aria-hidden or hidden */ 2230 if (!attrcmp(n, "aria-hidden") || !attrcmp(n, "hidden")) 2231 cur->tag.displaytype |= DisplayNone; 2232 2233 if (!attr_class_set && !attrcmp(n, "class")) /* use the first set attribute */ 2234 string_append(&attr_class, v, vl); 2235 else if 
(!attr_id_set && !attrcmp(n, "id")) /* use the first set attribute */ 2236 string_append(&attr_id, v, vl); 2237 else if (!attrcmp(n, "type")) 2238 string_append(&attr_type, v, vl); 2239 else if (!attrcmp(n, "value")) 2240 string_append(&attr_value, v, vl); 2241 2242 /* <base href="..." /> */ 2243 if (!basehrefset && tagid == TagBase && !attrcmp(n, "href")) 2244 strlcat(basehrefdoc, v, sizeof(basehrefdoc)); 2245 2246 if (tagid == TagA && !attrcmp(n, "href")) 2247 string_append(&attr_href, v, vl); 2248 2249 if (tagid == TagSelect && !attrcmp(n, "multiple")) 2250 cur->tag.displaytype |= DisplaySelectMulti; 2251 2252 if (tagid == TagObject && !attrcmp(n, "data")) 2253 string_append(&attr_data, v, vl); 2254 2255 /* show img alt attribute as text. */ 2256 if (tagid == TagImg && !attrcmp(n, "alt")) 2257 string_append(&attr_alt, v, vl); 2258 2259 if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked")) 2260 string_append(&attr_checked, v, vl); 2261 2262 /* src attribute */ 2263 switch (tagid) { 2264 case TagAudio: 2265 case TagEmbed: 2266 case TagFrame: 2267 case TagIframe: 2268 case TagImg: 2269 case TagSource: 2270 case TagTrack: 2271 case TagVideo: 2272 if (!attrcmp(n, "src")) 2273 string_append(&attr_src, v, vl); 2274 break; 2275 default: 2276 break; 2277 } 2278 } 2279 2280 static void 2281 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, 2282 size_t nl, const char *v, size_t vl) 2283 { 2284 char buf[16]; 2285 int len; 2286 2287 len = xml_entitytostr(v, buf, sizeof(buf)); 2288 if (len > 0) 2289 xmlattr(p, t, tl, n, nl, buf, (size_t)len); 2290 else 2291 xmlattr(p, t, tl, n, nl, v, vl); 2292 } 2293 2294 static void 2295 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, 2296 size_t nl) 2297 { 2298 struct node *cur; 2299 enum TagId tagid; 2300 2301 cur = &nodes[curnode]; 2302 tagid = cur->tag.id; 2303 2304 if (!attr_class_set && !attrcmp(n, "class")) 2305 attr_class_set = 1; 2306 else if (!attr_id_set && !attrcmp(n, "id")) 2307 
attr_id_set = 1; 2308 2309 /* set base URL, if it is set it cannot be overwritten again */ 2310 if (!basehrefset && basehrefdoc[0] && 2311 tagid == TagBase && !attrcmp(n, "href")) 2312 basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : 0; 2313 2314 /* if attribute checked is set but it has no value then set it to "checked" */ 2315 if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked") && !attr_checked.len) 2316 string_append(&attr_checked, "checked", sizeof("checked") - 1); 2317 } 2318 2319 static void 2320 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, 2321 size_t nl) 2322 { 2323 struct node *cur; 2324 enum TagId tagid; 2325 2326 cur = &nodes[curnode]; 2327 tagid = cur->tag.id; 2328 2329 if (!attrcmp(n, "alt")) 2330 string_clear(&attr_alt); 2331 else if (!attrcmp(n, "checked")) 2332 string_clear(&attr_checked); 2333 else if (!attr_class_set && !attrcmp(n, "class")) 2334 string_clear(&attr_class); 2335 else if (!attrcmp(n, "data")) 2336 string_clear(&attr_data); 2337 else if (!attrcmp(n, "href")) 2338 string_clear(&attr_href); 2339 else if (!attr_id_set && !attrcmp(n, "id")) 2340 string_clear(&attr_id); 2341 else if (!attrcmp(n, "src")) 2342 string_clear(&attr_src); 2343 else if (!attrcmp(n, "type")) 2344 string_clear(&attr_type); 2345 else if (!attrcmp(n, "value")) 2346 string_clear(&attr_value); 2347 2348 if (basehrefdoc[0] && tagid == TagBase && !attrcmp(n, "href")) 2349 basehrefdoc[0] = '\0'; 2350 } 2351 2352 static void 2353 usage(void) 2354 { 2355 fprintf(stderr, "%s [-8adiIlrx] [-b basehref] [-s selector] [-u selector] [-w termwidth]\n", argv0); 2356 exit(1); 2357 } 2358 2359 int 2360 main(int argc, char **argv) 2361 { 2362 char *basehref; 2363 2364 if (pledge("stdio", NULL) < 0) 2365 err(1, "pledge"); 2366 2367 ARGBEGIN { 2368 case '8': 2369 str_bullet_item = "\xe2\x80\xa2 "; 2370 str_ruler = "\xe2\x94\x80"; /* symbol: "light horizontal" */ 2371 break; 2372 case 'a': 2373 allowansi = !allowansi; 2374 break; 2375 case 
'b': 2376 basehref = EARGF(usage()); 2377 if (uri_parse(basehref, &base) == -1 || 2378 !base.proto[0]) 2379 usage(); 2380 basehrefset = 1; 2381 break; 2382 case 'd': 2383 uniqrefs = !uniqrefs; 2384 break; 2385 case 'i': 2386 showrefinline = !showrefinline; 2387 break; 2388 case 'I': 2389 showurlinline = !showurlinline; 2390 break; 2391 case 'l': 2392 showrefbottom = !showrefbottom; 2393 break; 2394 case 'r': 2395 allowlinewrap = !allowlinewrap; 2396 break; 2397 case 's': 2398 sel_show = compileselectors(EARGF(usage())); 2399 /* switch to reader/selector mode, ignore all data except when matched */ 2400 reader_mode = 1; 2401 reader_ignore = 1; 2402 break; 2403 case 'u': 2404 sel_hide = compileselectors(EARGF(usage())); 2405 /* switch to reader/selector mode */ 2406 reader_mode = 1; 2407 break; 2408 case 'w': 2409 if ((termwidth = strtol(EARGF(usage()), NULL, 10)) < 1) 2410 usage(); 2411 break; 2412 case 'x': 2413 resources = !resources; 2414 break; 2415 default: 2416 usage(); 2417 } ARGEND 2418 2419 linewrap = allowlinewrap; 2420 2421 /* initial nodes */ 2422 ncapnodes = NODE_CAP_INC; 2423 nodes = ecalloc(ncapnodes, sizeof(*nodes)); 2424 nodes_links = ecalloc(ncapnodes, sizeof(*nodes_links)); 2425 2426 parser.xmlattrstart = xmlattrstart; 2427 parser.xmlattr = xmlattr; 2428 parser.xmlattrentity = xmlattrentity; 2429 parser.xmlattrend = xmlattrend; 2430 parser.xmlcdatastart = xmlcdatastart; 2431 parser.xmlcdata = xmlcdata; 2432 parser.xmlcdataend = xmlcdataend; 2433 parser.xmldatastart = xmldatastart; 2434 parser.xmldata = xmldata; 2435 parser.xmldataentity = xmldataentity; 2436 parser.xmldataend = xmldataend; 2437 parser.xmltagstart = xmltagstart; 2438 parser.xmltagstartparsed = xmltagstartparsed; 2439 parser.xmltagend = xmltagend; 2440 2441 parser.getnext = getchar; 2442 xml_parse(&parser); 2443 2444 hflush(); 2445 if (ncells > 0) 2446 newline(); 2447 2448 if (showrefbottom || resources) 2449 printlinkrefs(); 2450 2451 hflush(); 2452 setmarkup(0); 2453 2454 return 
0; 2455 }