selector syntax: document it and add feature to filter on a specific nth node - webdump - HTML to plain-text converter for webpages
 (HTM) git clone git://git.codemadness.org/webdump
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 56ec7ea6c49d79cc3aaf301d2e6040e15d17785a
 (DIR) parent 94f0ad42fcfbe17b01d9e573a786435d1acc0232
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Fri,  8 Sep 2023 11:07:57 +0200
       
       selector syntax: document it and add feature to filter on a specific nth node
       
       Diffstat:
         M webdump.1                           |      38 +++++++++++++++++++++++++++----
         M webdump.c                           |      33 ++++++++++++++++++++++++++++---
       
       2 files changed, 63 insertions(+), 8 deletions(-)
       ---
 (DIR) diff --git a/webdump.1 b/webdump.1
       @@ -1,4 +1,4 @@
       -.Dd September 7, 2023
       +.Dd September 8, 2023
        .Dt WEBDUMP 1
        .Os
        .Sh NAME
       @@ -36,12 +36,16 @@ not enabled.
        .It Fl r
        Toggle if line-wrapping mode is enabled, by default it is not enabled.
        .It Fl s
       -CSS-like selectors, this sets a reader mode to hide content
       -matching the selector, for example: "main" or "main#id" or "main.class".
       +CSS-like selectors, this sets a reader mode to hide all content except
       +content matching the selector, see the section
       +.Sx SELECTOR SYNTAX
       +for the syntax.
        Multiple selectors can be specified by separating them with a comma.
        .It Fl u
       -CSS-like selectors, this sets a reader mode to hide content
       -matching the selector, for example: "main" or "main#id" or "main.class".
       +CSS-like selectors, this sets a reader mode to hide content matching the
       +selector, see the section
       +.Sx SELECTOR SYNTAX
       +for the syntax.
        Multiple selectors can be specified by separating them with a comma.
        .It Fl w Ar termwidth
        The terminal width.
       @@ -49,6 +53,30 @@ The default is 77 characters.
        .It Fl x
        Write resources as TAB-separated lines to file descriptor 3.
        .El
       +.Sh SELECTOR SYNTAX
       +The syntax has some inspiration from CSS, but it is more limited.
       +Some examples:
       +.Bl -item
       +.It
       +"main" would match on the "main" tags.
       +.It
       +"#someid" would match on any tag which has the id attribute set to "someid".
       +.It
       +".someclass" would match on any tag which has the class attribute set to
       +"someclass".
       +.It
       +"main#someid" would match on the "main" tag which has the id attribute set to
       +"someid".
       +.It
       +"main.someclass" would match on the "main" tags which have the class
       +attribute set to "someclass".
       +.It
       +"ul li" would match on any "li" tag which also has a parent "ul" tag.
       +.It
       +"li@0" would match on any "li" tag which is also the first child element of its
       +parent container.
       +Note that this differs from filtering on a collection of "li" elements.
       +.El
        .Sh EXIT STATUS
        .Ex -std
        .Sh EXAMPLES
 (DIR) diff --git a/webdump.c b/webdump.c
       @@ -123,6 +123,7 @@ struct node {
        
        struct selectornode {
                char tagname[256];
       +        long index; /* index of node to match on: -1 if not matching on index */
                /* attributes */
                char id[256];
                char classnames[1024];
       @@ -1073,11 +1074,13 @@ int
        compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes)
        {
                int depth = 0, len;
       +        long l;
                const char *s, *start;
                char tmp[256];
                int nameset = 0;
        
                memset(&nodes[0], 0, sizeof(nodes[0]));
       +        nodes[0].index = -1;
        
                s = sel;
                for (; *s && ISSPACE((unsigned char)*s); s++)
       @@ -1087,7 +1090,7 @@ compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes)
                for (; ; s++) {
                        /* end of tag */
                        if (!nameset &&
       -                    (*s == '#' || *s == '.' || *s == '[' ||
       +                    (*s == '#' || *s == '.' || *s == '@' ||
                             *s == '\0' || ISSPACE((unsigned char)*s))) {
                                nameset = 1;
                                len = s - start; /* tag name */
       @@ -1111,15 +1114,32 @@ compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes)
        
                                nameset = 0;
                                memset(&nodes[depth], 0, sizeof(nodes[depth]));
       +                        nodes[depth].index = -1;
        
                                /* end of selector */
                                if (*s == '\0')
                                        break;
                        }
        
       +                /* index */
       +                if (*s == '@') {
       +                        len = strcspn(s + 1, ".#@ \t\n");
       +                        if (len >= sizeof(tmp))
       +                                return 0;
       +                        memcpy(tmp, s + 1, len);
       +                        tmp[len] = '\0';
       +
       +                        l = strtol(tmp, NULL, 10);
       +                        if (l >= 0)
       +                                nodes[depth].index = l;
       +                        s += len;
       +                        start = s + 1;
       +                        continue;
       +                }
       +
                        /* id */
                        if (*s == '#') {
       -                        len = strcspn(s + 1, ".#[ \t\n");
       +                        len = strcspn(s + 1, ".#@ \t\n");
                                if (len >= sizeof(tmp))
                                        return 0;
                                memcpy(tmp, s + 1, len);
       @@ -1132,7 +1152,7 @@ compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes)
        
                        /* class */
                        if (*s == '.') {
       -                        len = strcspn(s + 1, ".#[ \t\n");
       +                        len = strcspn(s + 1, ".#@ \t\n");
                                if (len >= sizeof(tmp))
                                        return 0;
                                memcpy(tmp, s + 1, len);
       @@ -1225,6 +1245,13 @@ iscssmatch(struct selector *sel, struct node *root, int maxdepth)
                            !isclassmatch(root[d].classnames, sel->nodes[md].classnames))
                                continue; /* no */
        
       +                /* index matched */
       +                if (sel->nodes[md].index != -1 &&
       +                    (d == 0 ||
       +                    root[d - 1].nchildren == 0 ||
       +                    sel->nodes[md].index != root[d - 1].nchildren - 1))
       +                        continue;
       +
                        md++;
                        /* all matched of one selector */
                        if (md == sel->depth)