cleanup code a bit and add some comments - webdump - HTML to plain-text converter for webpages (HTM) git clone git://git.codemadness.org/webdump (DIR) Log (DIR) Files (DIR) Refs (DIR) README (DIR) LICENSE --- (DIR) commit 4793272ce07153284318336426796cb7e3c93af4 (DIR) parent 589d7d1ed851b5226a4782de8c9f00001f25c599 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org> Date: Tue, 19 Sep 2023 20:05:02 +0200 cleanup code a bit and add some comments Diffstat: M webdump.c | 129 +++++++++++++++---------------- 1 file changed, 62 insertions(+), 67 deletions(-) --- (DIR) diff --git a/webdump.c b/webdump.c @@ -45,14 +45,14 @@ struct uri { }; /* options */ -static int allowansi = 0; /* allow ANSI escape codes */ -static int showrefbottom = 0; /* show link references at the bottom */ -static int showrefinline = 0; /* show link reference number inline */ -static int showurlinline = 0; /* show full link reference inline */ -static int linewrap = 0; /* line-wrapping */ -static int termwidth = 77; /* terminal width */ -static int resources = 0; /* write resources line-by-line to fd 3? */ -static int uniqrefs = 0; /* number unique references */ +static int allowansi = 0; /* (-a) allow ANSI escape codes */ +static int uniqrefs = 0; /* (-d) number unique references */ +static int showrefinline = 0; /* (-i) show link reference number inline */ +static int showurlinline = 0; /* (-I) show full link reference inline */ +static int showrefbottom = 0; /* (-l) show link references at the bottom */ +static int linewrap = 0; /* (-r) line-wrapping */ +static int termwidth = 77; /* (-w) terminal width */ +static int resources = 0; /* (-x) write resources line-by-line to fd 3? */ enum DisplayType { DisplayUnknown = 0, @@ -95,17 +95,19 @@ typedef struct string { } String; enum TagId { TagA = 1, TagAddress, TagArea, TagArticle, TagAside, TagAudio, -TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButton, TagCite, -TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDetails, TagDfn, TagDir, -TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFieldset, TagFigcaption, TagFigure, -TagFooter, TagForm, TagFrame, TagH1, TagH2, TagH3, TagH4, TagH5, TagH6, -TagHead, TagHeader, TagHr, TagHtml, TagI, TagIframe, TagImg, TagInput, TagIns, -TagLabel, TagLegend, TagLi, TagLink, TagMain, TagMark, TagMenu, TagMeta, -TagNav, TagObject, TagOl, TagOption, TagP, TagParam, TagPre, TagS, TagScript, -TagSearch, TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle, -TagSummary, TagTable, TagTbody, TagTd, TagTemplate, TagTextarea, TagTfoot, -TagTh, TagThead, TagTitle, TagTr, TagTrack, TagU, TagUl, TagVar, TagVideo, -TagWbr, TagXmp }; + + TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButton, + TagCite, TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDetails, + TagDfn, TagDir, TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFieldset, + TagFigcaption, TagFigure, TagFooter, TagForm, TagFrame, TagH1, TagH2, + TagH3, TagH4, TagH5, TagH6, TagHead, TagHeader, TagHr, TagHtml, TagI, + TagIframe, TagImg, TagInput, TagIns, TagLabel, TagLegend, TagLi, + TagLink, TagMain, TagMark, TagMenu, TagMeta, TagNav, TagObject, TagOl, + TagOption, TagP, TagParam, TagPre, TagS, TagScript, TagSearch, + TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle, + TagSummary, TagTable, TagTbody, TagTd, TagTemplate, TagTextarea, + TagTfoot, TagTh, TagThead, TagTitle, TagTr, TagTrack, TagU, TagUl, + TagVar, TagVideo, TagWbr, TagXmp }; struct tag { const char *name; @@ -168,6 +170,7 @@ static size_t nvisrefs, ncapvisrefs; /* visible link count / capacity */ struct linkref **hiddenrefs; static size_t nhiddenrefs, ncaphiddenrefs; /* hidden link count / capacity */ +/* compare link by URL for link references RB-tree */ int linkrefcmp(struct linkref *r1, struct linkref *r2) { @@ -175,7 +178,6 @@ linkrefcmp(struct linkref *r1, struct linkref *r2) } RB_HEAD(linkreftree, linkref) linkrefhead = RB_INITIALIZER(&linkrefhead); -RB_PROTOTYPE(linkreftree, linkref, entry, linkrefcmp) RB_GENERATE(linkreftree, linkref, entry, linkrefcmp) static const char *str_bullet_item = "* "; @@ -184,10 +186,9 @@ static const char *str_ruler = "-"; static const char *str_radio_checked = "*"; /* base href, to make URLs absolute */ -static char *basehref = ""; -static char basehrefdoc[4096]; /* base href in document, if any */ -static int basehrefset = 0; /* base href set and can be used? */ -static struct uri base; +static char basehrefdoc[4096]; /* buffer for base href in document, if any */ +static int basehrefset; /* base href set and can be used? */ +static struct uri base; /* parsed current base href */ /* buffers for some attributes of the current tag */ String attr_alt; /* alt attribute */ @@ -200,7 +201,7 @@ String attr_src; /* src attribute */ String attr_type; /* type attribute */ String attr_value; /* value attribute */ -static String htmldata; +static String htmldata; /* buffered HTML data near the current tag */ /* for white-space output handling: 1 = whitespace emitted (suppress repeated), 2 = other characters on this line @@ -208,15 +209,15 @@ static String htmldata; * White-space data before non-whitespace data in tags are ignored on a line. * Repeated white-space are ignored: a single space (' ') is emitted. */ -static int whitespace_mode = 0; -static int nbytesline = 0; -static int ncells = 0; /* current cell count */ -static int hadnewline = 0; /* count for repeated newlines */ +static int whitespace_mode; +static int nbytesline; /* bytes on this line */ +static int ncells; /* current cell/column count */ +static int hadnewline; /* count for repeated newlines */ /* flag for skipping initial white-space in tag: for HTML white-space handling */ static int skipinitialws = 1; #define DEFAULT_INDENT 2 -static const int defaultindent = DEFAULT_INDENT; -static int indent; +static const int defaultindent = DEFAULT_INDENT; /* default indent / margin */ +static int indent; /* indent for the current line, in columns */ /* previous output sequential newlines, used for calculating margins between elements and reducing excessive newlines */ static int currentnewlines; @@ -224,21 +225,22 @@ static int currentnewlines; /* buffers for line-wrapping (buffer per word boundary) */ static char rbuf[1024]; static int rbuflen; -static int rnbufcells = 0; /* pending cell count to add */ +static int rnbufcells; /* pending cell count to add */ #define MAX_NODE_DEPTH 65535 /* absolute maximum node depth */ -static struct node *nodes; +static struct node *nodes; /* node tree (one per level is remembered) */ static String *nodes_links; /* keep track of links per node */ -static size_t ncapnodes; +static size_t ncapnodes; /* current allocated node capacity */ static int curnode; /* current node depth */ -/* reader / selector mode */ -static int reader_mode = 0; -static int reader_ignore = 0; +/* reader / selector mode (-s) */ +static int reader_mode; +/* flag if the tags and their children should be ignored in the current context */ +static int reader_ignore; -static enum MarkupType curmarkup; +static enum MarkupType curmarkup; /* current markup state (bold, underline, etc) */ -/* selector to match */ +/* selector to match (for -s and -u) */ static struct selectors *sel_hide, *sel_show; /* tags table: needs to be sorted like tagcmp(), alphabetically */ @@ -483,7 +485,7 @@ ecalloc(size_t nmemb, size_t size) } /* check if string has a non-empty scheme / protocol part */ -int +static int uri_hasscheme(const char *s) { const char *p = s; @@ -495,7 +497,7 @@ uri_hasscheme(const char *s) return (*p == ':' && p != s); } -int +static int uri_parse(const char *s, struct uri *u) { const char *p = s; @@ -611,7 +613,7 @@ parsepath: /* Transform and try to make the URI `u` absolute using base URI `b` into `a`. Follows some of the logic from "RFC 3986 - 5.2.2. Transform References". Returns 0 on success, -1 on error or truncation. */ -int +static int uri_makeabs(struct uri *a, struct uri *u, struct uri *b) { char *p; @@ -663,7 +665,7 @@ uri_makeabs(struct uri *a, struct uri *u, struct uri *b) return 0; } -int +static int uri_format(char *buf, size_t bufsiz, struct uri *u) { return snprintf(buf, bufsiz, "%s%s%s%s%s%s%s%s%s%s%s%s", @@ -682,14 +684,14 @@ uri_format(char *buf, size_t bufsiz, struct uri *u) } /* compare tag name (case-insensitive) */ -int +static int tagcmp(const char *s1, const char *s2) { return strcasecmp(s1, s2); } /* compare attribute name (case-insensitive) */ -int +static int attrcmp(const char *s1, const char *s2) { return strcasecmp(s1, s2); @@ -846,7 +848,7 @@ endmarkup(int markuptype) cell in general. NOTE: this is of course incorrect since characters can be 2 width aswell, in the future maybe replace this with wcwidth() or similar */ -int +static int utfwidth(int c) { /* not the start of a codepoint */ @@ -1002,17 +1004,6 @@ parentcontainerhasdata(int curtype, int n) return 0; } -static int -parenthasdata(int n) -{ - int i; - - for (i = n; i >= 0; i--) - return nodes[i].hasdata; - - return 0; -} - /* start on a newline for the start of a block element or not */ static void startblock(void) @@ -1021,7 +1012,7 @@ startblock(void) whitespace_mode &= ~2; /* no characters on this line yet */ if (nbytesline <= 0) return; - if (!hadnewline && parenthasdata(curnode - 1)) + if (!hadnewline && curnode >= 0 && nodes[curnode - 1].hasdata) hputchar('\n'); } @@ -1137,7 +1128,7 @@ findparenttype(int cur, int findtype) return NULL; } -int +static int isclassmatch(const char *haystack, const char *needle) { const char *p; @@ -1165,7 +1156,7 @@ isclassmatch(const char *haystack, const char *needle) /* very limited CSS-like selector, supports: main, main#id, main.class, ".class", "#id", "ul li a" */ -int +static int compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes) { int depth = 0, len; @@ -1263,7 +1254,7 @@ compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes) return depth; } -struct selector * +static struct selector * newselector(const char *q) { struct selector *sel; @@ -1282,7 +1273,7 @@ newselector(const char *q) return sel; } -struct selectors * +static struct selectors * compileselectors(const char *q) { struct selectors *sels = NULL; @@ -1319,7 +1310,7 @@ compileselectors(const char *q) /* very limited CSS-like matcher, supports: main, main#id, main.class, ".class", "#id", "ul li a" */ -int +static int iscssmatch(struct selector *sel, struct node *root, int maxdepth) { int d, md = 0; @@ -1356,7 +1347,7 @@ iscssmatch(struct selector *sel, struct node *root, int maxdepth) return 0; } -int +static int iscssmatchany(struct selectors *sels, struct node *root, int maxdepth) { struct selector *sel; @@ -1499,7 +1490,7 @@ handleinlinelink(void) addlinkref(url, cur->tag.name, cur->tag.id, 1); } -void +static void printlinkrefs(void) { struct linkref *ref; @@ -1535,6 +1526,7 @@ printlinkrefs(void) } } +/* size to grow node capacity (greedy) */ #define NODE_CAP_INC 256 /* increase node depth, allocate space for nodes if needed */ @@ -1759,6 +1751,7 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) size_t nchilds; int i, j, k, nchildfound, parenttype; + /* match tag and lookup metadata */ /* ignore closing of void elements, like </br>, which is not allowed */ if ((found = findtag(t))) { if (!isshort && found->isvoid) @@ -1884,7 +1877,7 @@ xmltagstart(XMLParser *p, const char *t, size_t tl) string_clear(&attr_type); string_clear(&attr_value); - /* match tag */ + /* match tag and lookup metadata */ found = findtag(t); /* TODO: implement more complete optional tag handling. @@ -1993,7 +1986,7 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) struct node *cur, *parent; int i, margintop; - /* match tag */ + /* match tag and lookup metadata */ tagid = 0; if ((found = findtag(t))) tagid = found->id; @@ -2322,6 +2315,8 @@ usage(void) int main(int argc, char **argv) { + char *basehref; + if (pledge("stdio", NULL) < 0) err(1, "pledge");