selector syntax: document it and add feature to filter on a specific nth node - webdump - HTML to plain-text converter for webpages (HTM) git clone git://git.codemadness.org/webdump (DIR) Log (DIR) Files (DIR) Refs (DIR) README (DIR) LICENSE --- (DIR) commit 56ec7ea6c49d79cc3aaf301d2e6040e15d17785a (DIR) parent 94f0ad42fcfbe17b01d9e573a786435d1acc0232 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org> Date: Fri, 8 Sep 2023 11:07:57 +0200 selector syntax: document it and add feature to filter on a specific nth node Diffstat: M webdump.1 | 38 +++++++++++++++++++++++++++---- M webdump.c | 33 ++++++++++++++++++++++++++++--- 2 files changed, 63 insertions(+), 8 deletions(-) --- (DIR) diff --git a/webdump.1 b/webdump.1 @@ -1,4 +1,4 @@ -.Dd September 7, 2023 +.Dd September 8, 2023 .Dt WEBDUMP 1 .Os .Sh NAME @@ -36,12 +36,16 @@ not enabled. .It Fl r Toggle if line-wrapping mode is enabled, by default it is not enabled. .It Fl s -CSS-like selectors, this sets a reader mode to hide content -matching the selector, for example: "main" or "main#id" or "main.class". +CSS-like selectors, this sets a reader mode to hide content matching the +selector, see the section +.Sx SELECTOR SYNTAX +for the syntax. Multiple selectors can be specified by separating them with a comma. .It Fl u -CSS-like selectors, this sets a reader mode to hide content -matching the selector, for example: "main" or "main#id" or "main.class". +CSS-like selectors, this sets a reader mode to hide content matching the +selector, see the section +.Sx SELECTOR SYNTAX +for the syntax. Multiple selectors can be specified by separating them with a comma. .It Fl w Ar termwidth The terminal width. @@ -49,6 +53,30 @@ The default is 77 characters. .It Fl x Write resources as TAB-separated lines to file descriptor 3. .El +.Sh SELECTOR SYNTAX +The syntax has some inspiration from CSS, but it is more limited. +Some examples: +.Bl -item +.It +"main" would match on the "main" tags. +.It +"#someid" would match on any tag which has the id attribute set to "someid". +.It +".someclass" would match on any tag which has the class attribute set to +"someclass". +.It +"main#someid" would match on the "main" tag which has the id attribute set to +"someid". +.It +"main.someclass" would match on the "main" tags which has the class +attribute set to "someclass". +.It +"ul li" would match on any "li" tag which also has a parent "ul" tag. +.It +"li@0" would match on any "li" tag which is also the first child element of its +parent container. +Note that this differs from filtering on a collection of "li" elements. +.El .Sh EXIT STATUS .Ex -std .Sh EXAMPLES (DIR) diff --git a/webdump.c b/webdump.c @@ -123,6 +123,7 @@ struct node { struct selectornode { char tagname[256]; + long index; /* index of node to match on: -1 if not matching on index */ /* attributes */ char id[256]; char classnames[1024]; @@ -1073,11 +1074,13 @@ int compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes) { int depth = 0, len; + long l; const char *s, *start; char tmp[256]; int nameset = 0; memset(&nodes[0], 0, sizeof(nodes[0])); + nodes[0].index = -1; s = sel; for (; *s && ISSPACE((unsigned char)*s); s++) @@ -1087,7 +1090,7 @@ compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes) for (; ; s++) { /* end of tag */ if (!nameset && - (*s == '#' || *s == '.' || *s == '[' || + (*s == '#' || *s == '.' || *s == '@' || *s == '\0' || ISSPACE((unsigned char)*s))) { nameset = 1; len = s - start; /* tag name */ @@ -1111,15 +1114,32 @@ compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes) nameset = 0; memset(&nodes[depth], 0, sizeof(nodes[depth])); + nodes[depth].index = -1; /* end of selector */ if (*s == '\0') break; } + /* index */ + if (*s == '@') { + len = strcspn(s + 1, ".#@ \t\n"); + if (len >= sizeof(tmp)) + return 0; + memcpy(tmp, s + 1, len); + tmp[len] = '\0'; + + l = strtol(tmp, NULL, 10); + if (l >= 0) + nodes[depth].index = l; + s += len; + start = s + 1; + continue; + } + /* id */ if (*s == '#') { - len = strcspn(s + 1, ".#[ \t\n"); + len = strcspn(s + 1, ".#@ \t\n"); if (len >= sizeof(tmp)) return 0; memcpy(tmp, s + 1, len); @@ -1132,7 +1152,7 @@ compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes) /* class */ if (*s == '.') { - len = strcspn(s + 1, ".#[ \t\n"); + len = strcspn(s + 1, ".#@ \t\n"); if (len >= sizeof(tmp)) return 0; memcpy(tmp, s + 1, len); @@ -1225,6 +1245,13 @@ iscssmatch(struct selector *sel, struct node *root, int maxdepth) !isclassmatch(root[d].classnames, sel->nodes[md].classnames)) continue; /* no */ + /* index matched */ + if (sel->nodes[md].index != -1 && + (d == 0 || + root[d - 1].nchildren == 0 || + sel->nodes[md].index != root[d - 1].nchildren - 1)) + continue; + md++; /* all matched of one selector */ if (md == sel->depth)