improvements - surf-adblock - Surf adblock web extension
 (HTM) git clone git://git.codemadness.org/surf-adblock
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit d781090ae7718310fb13c83c1a8406be46a613b8
 (DIR) parent b6cc76e9fcac3112086f2d2348ef53b16b59da9d
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sat,  3 Jun 2017 21:54:05 +0200
       
       improvements
       
       - WIP: faster matching for simple rules. This is just a test at the moment, but
         it went from ~50ms max to ~20ms on my machine.
       - add support for exception rules.
       - debug.sh: add a debug script for testing; it compiles surf-adblock.c as a
         standalone program with main().
       
       Diffstat:
         M TODO                                |      14 +++++++++++++-
         A debug.sh                            |      18 ++++++++++++++++++
         M surf-adblock.c                      |     279 +++++++++++++++++++++++++------
       
       3 files changed, 255 insertions(+), 56 deletions(-)
       ---
 (DIR) diff --git a/TODO b/TODO
       @@ -1,5 +1,15 @@
        - fix tweakers.net popup / rule.
       -- benchmark rule matching (timing).
       +        this is in an exception rule...
       +
       +        make sure exception rules are always below in the list? modify awk script?
       +
       +- performance:
       +  - benchmark rule matching (timing).
       +  - bloom filters? some kind of cache?
       +  - optimize simple filter case.
       +
       +- support separator "^" = [/\?]?
       +  - test it better.
        
        ===
        
       @@ -23,6 +33,8 @@ Docs:
                  and matchbegin or matchend set.
                - make less CPU intensive.
                - maybe even include it statically?
       +        - optimize CSS rule matching (only per site?).
       +
        - optimize memory allocation.
        - optimize: pregenerate one global stylesheet that applies to all sites?
        - separate adblocker into daemon? not sure.
 (DIR) diff --git a/debug.sh b/debug.sh
       @@ -0,0 +1,18 @@
       +#!/bin/sh
       +# ugly debug script: compile as standalone program for testing.
       +
       +cc -std=c99 -pedantic -Wall -Os -I. -I/usr/include -I/usr/X11R6/include \
       +        `pkg-config --cflags gtk+-3.0 webkit2gtk-4.0  webkit2gtk-web-extension-4.0` \
       +        -DVERSION=\"0.1\" -DWEBEXTDIR=\"/usr/local/lib/surf\"  -D_DEFAULT_SOURCE \
       +        -DWEBEXTDIR=\"/usr/local/lib/surf\" \
       +        `pkg-config --cflags gtk+-3.0 webkit2gtk-4.0  webkit2gtk-web-extension-4.0` \
       +        -DDEBUG \
       +        -c surf-adblock.c
       +cc  -s -L/usr/lib -lc -L/usr/X11R6/lib -lX11 \
       +        `pkg-config --libs gtk+-3.0 webkit2gtk-4.0  webkit2gtk-web-extension-4.0` -lgthread-2.0 -module -avoid-version -o surf-adblock \
       +        surf-adblock.o
       +
       +chmod +x surf-adblock
       +# NOTE: need to copy because of W^X.
       +doas cp surf-adblock /usr/local/bin
       +/usr/local/bin/surf-adblock
 (DIR) diff --git a/surf-adblock.c b/surf-adblock.c
       @@ -305,11 +305,11 @@ match(const char *pat, const char *str, int fcase)
                                break;
                        default:
                                k = str_next(str, n, &sinc);
       +                        /* TODO: write a test-case */
       +                        if (c == CARET && (k == '?' || k == '/' || k <= 0))
       +                                return 1;
                                if (k <= 0)
                                        return (c==END) ? 0 : 1;
       -                        if (c == CARET && (iswdigit(k) || iswalpha(k) ||
       -                            strchr("_-.%", k)))
       -                                return 1;
                                str += sinc;
                                n -= sinc;
                                kfold = fcase ? casefold(k) : k;
       @@ -410,7 +410,6 @@ match(const char *pat, const char *str, int fcase)
                return 0;
        }
        
       -
        /*
        domain=...   if domain is prefixed with ~, ignore.
        multiple domains can be separated with |
       @@ -521,28 +520,63 @@ matchrule(struct filterrule *f, const char *uri, const char *type,
                /* NOTE: order matters, see FilterType enum values */
                struct filterdomain *d;
                char pat[1024];
       -        int r;
       -
       -        /* ignore exception rules for now, these are usually paid
       -         * for by sites to allow advertisements. */
       -        if (f->isexception)
       -                return 0;
       +        int r, m;
        
       -        if (f->css) {
       -                r = f->domains ? 0 : 1;
       -                for (d = f->domains; d; d = d->next) {
       -                        if (matchdomain(d->domain, domain)) {
       -                                if (r && d->inverse)
       -                                        r = 0;
       -                                else if (!r && !d->inverse)
       -                                        r = 1;
       -                        } else if (r && !d->inverse) {
       +        r = f->domains ? 0 : 1;
       +        for (d = f->domains; d; d = d->next) {
       +                if (matchdomain(d->domain, domain)) {
       +                        if (r && d->inverse)
                                        r = 0;
       -                        }
       +                        else if (!r && !d->inverse)
       +                                r = 1;
       +                } else if (r && !d->inverse) {
       +                        r = 0;
                        }
       +        }
       +        if (f->css) {
       +                /* DEBUG */
       +                if (f->isexception)
       +                        printf("DEBUG, exception rule, CSS: %s, match? %d\n",
       +                        f->css, r);
                        return r;
                }
        
       +#if 1
       +        /* skip allow rule, TODO: inverse? */
       +        if (!r)
       +                return 0;
       +#endif
       +
       +#if 1
       +        /* DEBUG: test, match if it is a simple pattern */
       +        char *p;
       +        p = strchr(f->uri, '*');
       +        if (!p)
       +                p = strchr(f->uri, '^');
       +        if (!p) {
       +                /* TODO: write a test-case */
       +                if (f->block & FilterTypeMatchCase) {
       +                        if (f->matchbegin)
       +                                m = strncmp(uri, f->uri, strlen(f->uri)) == 0;
       +                        else if (f->matchend)
       +                                m = strlen(f->uri) <= strlen(uri) &&
       +                                        strcmp(&uri[strlen(uri) - strlen(f->uri)], f->uri) == 0;
       +                        else
       +                                m = strstr(uri, f->uri) ? 1 : 0;
       +                } else {
       +                        if (f->matchbegin)
       +                                m = strncasecmp(uri, f->uri, strlen(f->uri)) == 0;
       +                        else if (f->matchend)
       +                                m = strlen(f->uri) <= strlen(uri) &&
       +                                        strcasecmp(&uri[strlen(uri) - strlen(f->uri)], f->uri) == 0;
       +                        else
       +                                m = strcasestr(uri, f->uri) ? 1 : 0;
       +                }
       +                /*m = r ? !m : m;*/
       +                return m;
       +        }
       +#endif
       +
                r = snprintf(pat, sizeof(pat), "%s%s%s",
                        f->matchbegin ? "" : "*",
                        f->uri,
       @@ -552,19 +586,8 @@ matchrule(struct filterrule *f, const char *uri, const char *type,
                        return 0;
                }
        
       -        r = f->domains ? 0 : 1;
       -        for (d = f->domains; d; d = d->next) {
       -                if (matchdomain(d->domain, domain)) {
       -                        if (r && d->inverse)
       -                                r = 0;
       -                        else if (!r && !d->inverse)
       -                                r = 1;
       -                } else if (r && !d->inverse) {
       -                        r = 0;
       -                }
       -        }
       -
       -        if (r && !match(pat, uri, (f->block & FilterTypeMatchCase) ? 0 : 1)) {
       +        m = 0;
       +        if (!match(pat, uri, (f->block & FilterTypeMatchCase) ? 0 : 1)) {
        #if 0
                        for (; *type; type++) {
                                for (i = 0; blockstr[i]; i++) {
       @@ -575,11 +598,13 @@ matchrule(struct filterrule *f, const char *uri, const char *type,
                                        }
                                }
                        }
       +
                        return 0;
        #endif
       -                return 1;
       +                m = 1;
                }
       -        return 0;
       +        /*m = r ? !m : m;*/
       +        return m;
        }
        
        static int
       @@ -695,6 +720,7 @@ end:
                return 1;
        }
        
       +#if 0
        static void
        debugrule(struct filterrule *r)
        {
       @@ -702,6 +728,7 @@ debugrule(struct filterrule *r)
                       "%lu\n===\n", r->uri ? r->uri : "", r->css ? r->css : "",
                       r->isexception, r->block);
        }
       +#endif
        
        static int
        loadrules(FILE *fp)
       @@ -775,6 +802,12 @@ documentloaded(WebKitWebPage *wp, Page *p)
                printf("uri: %s\n", uri);
                printf("domain: %s\n", domain);
        
       +        /* DEBUG: timing */
       +        struct timespec tp_start, tp_end, tp_diff;
       +        if (clock_gettime(CLOCK_MONOTONIC, &tp_start) == -1) {
       +                fprintf(stderr, "clock_gettime: %s\n", strerror(errno));
       +        }
       +
                /* site-specific CSS */
                memset(&sitecss, 0, sizeof(sitecss));
                for (r = rules; r; r = r->next) {
       @@ -783,11 +816,38 @@ documentloaded(WebKitWebPage *wp, Page *p)
                        len = strlen(r->css);
                        if (string_append(&sitecss, r->css, len) < len)
                                return;
       -                len = sizeof("{display:none;}") -1;
       -                if (string_append(&sitecss, "{display:none;}", len) < len)
       -                        return;
       +
       +                if (r->isexception) {
       +                        len = sizeof("{display:initial;}") -1;
       +                        if (string_append(&sitecss, "{display:initial;}", len) < len)
       +                                return;
       +                } else {
       +                        len = sizeof("{display:none;}") -1;
       +                        if (string_append(&sitecss, "{display:none;}", len) < len)
       +                                return;
       +                }
       +        }
       +/*        printf("sitecss: %s\n", sitecss.data ? sitecss.data : "<empty>");*/
       +
       +        /* DEBUG: timing */
       +        if (clock_gettime(CLOCK_MONOTONIC, &tp_end) == -1) {
       +                fprintf(stderr, "clock_gettime: %s\n", strerror(errno));
                }
       -        printf("sitecss: %s\n", sitecss.data ? sitecss.data : "<empty>");
       +
       +        tp_diff.tv_sec = tp_end.tv_sec - tp_start.tv_sec;
       +        tp_diff.tv_nsec = tp_end.tv_nsec - tp_start.tv_nsec;
       +        if (tp_diff.tv_nsec < 0) {
       +                tp_diff.tv_sec--;
       +                tp_diff.tv_nsec += 1000000000L;
       +        }
       +
       +        printf("timing: %zu sec, %.3f ms\n",
       +                tp_diff.tv_sec, (float)tp_diff.tv_nsec / 1000000.0f);
       +
       +        if (globalcss.data)
       +                printf("global CSS length in bytes: %zu\n", strlen(globalcss.data));
       +        if (sitecss.data)
       +                printf("site CSS length in bytes: %zu\n", strlen(sitecss.data));
        
                p->view = webkit_dom_document_get_default_view(doc);
        
       @@ -819,6 +879,7 @@ sendrequest(WebKitWebPage *wp, WebKitURIRequest *req,
                const char *s, *uri = webkit_web_page_get_uri(p->webpage),
                           *requri = webkit_uri_request_get_uri(req);
                size_t len;
       +        gboolean status = FALSE;
        
                if (!uri || !strcmp(requri, uri) ||
                    (strncmp(uri, "http://", sizeof("http://") - 1) &&
       @@ -830,6 +891,12 @@ sendrequest(WebKitWebPage *wp, WebKitURIRequest *req,
                memcpy(domain, s, len);
                domain[len] = '\0';
        
       +        /* DEBUG: timing */
       +        struct timespec tp_start, tp_end, tp_diff;
       +        if (clock_gettime(CLOCK_MONOTONIC, &tp_start) == -1) {
       +                fprintf(stderr, "clock_gettime: %s\n", strerror(errno));
       +        }
       +
                /* match rules */
                for (r = rules; r; r = r->next) {
                        if (!r->css && matchrule(r, requri, "csio^", domain)) {
       @@ -839,30 +906,32 @@ sendrequest(WebKitWebPage *wp, WebKitURIRequest *req,
        
                                fprintf(stderr, "blocked: %s, %s\n", domain, requri);
        
       -                        return TRUE;
       +                        status = TRUE;
       +                        goto end;
                        }
                }
        
       -        return FALSE;
       -}
       -
       -static void
       -webpagecreated(WebKitWebExtension *e, WebKitWebPage *p, gpointer unused)
       -{
       -        Page *np;
       +end:
       +        /* DEBUG: timing */
       +        if (clock_gettime(CLOCK_MONOTONIC, &tp_end) == -1) {
       +                fprintf(stderr, "clock_gettime: %s\n", strerror(errno));
       +        }
        
       -        if (!(np = newpage(p))) {
       -                weprintf("cannot associate webext with new page: %s\n",
       -                         strerror(errno));
       -                return;
       +        tp_diff.tv_sec = tp_end.tv_sec - tp_start.tv_sec;
       +        tp_diff.tv_nsec = tp_end.tv_nsec - tp_start.tv_nsec;
       +        if (tp_diff.tv_nsec < 0) {
       +                tp_diff.tv_sec--;
       +                tp_diff.tv_nsec += 1000000000L;
                }
        
       -        g_signal_connect(p, "document-loaded", G_CALLBACK(documentloaded), np);
       -        g_signal_connect(p, "send-request", G_CALLBACK(sendrequest), np);
       +        printf("%s [%s] timing: %zu sec, %.3f ms\n",
       +                requri, uri, tp_diff.tv_sec, (float)tp_diff.tv_nsec / 1000000.0f);
       +
       +        return status;
        }
        
       -G_MODULE_EXPORT void
       -webkit_web_extension_initialize(WebKitWebExtension *ext)
       +void
       +init(void)
        {
                struct filterrule *r;
                FILE *fp;
       @@ -922,6 +991,106 @@ webkit_web_extension_initialize(WebKitWebExtension *ext)
                                return;
                        }
                }
       +}
       +
       +static void
       +webpagecreated(WebKitWebExtension *e, WebKitWebPage *p, gpointer unused)
       +{
       +        Page *np;
       +        /* "page-created" handler: create the per-page state with newpage() */
       +        if (!(np = newpage(p))) {
       +                weprintf("cannot associate webext with new page: %s\n",
       +                         strerror(errno));
       +                return;
       +        }
       +        /* np is handed to both per-page signal handlers as their user data */
       +        g_signal_connect(p, "document-loaded", G_CALLBACK(documentloaded), np);
       +        g_signal_connect(p, "send-request", G_CALLBACK(sendrequest), np);
       +}
        
       +G_MODULE_EXPORT void
       +webkit_web_extension_initialize(WebKitWebExtension *ext)
       +{
       +        init(); /* setup lives in init() so the DEBUG main() can call it too */
                g_signal_connect(ext, "page-created", G_CALLBACK(webpagecreated), NULL);
        }
       +
       +#ifdef DEBUG /* standalone test driver: debug.sh builds this file with -DDEBUG */
       +int
       +main(void)
       +{
       +        char domain[256];
       +        String sitecss;
       +        struct filterrule *r;
       +        const char *s, *uri;
       +        size_t len;
       +
       +        /* TEST: hard-coded page URI; returns 0 on success, 1 on any failure */
       +        uri = "https://tweakers.net/";
       +
       +        if (!uri || (strncmp(uri, "http://", sizeof("http://") - 1) &&
       +            strncmp(uri, "https://", sizeof("https://") - 1)))
       +                return 1;
       +
       +        init();
       +        /* domain = text between "://" and the next '/' of uri */
       +        s = strstr(uri, "://") + sizeof("://") - 1;
       +        len = strcspn(s, "/");
       +        memcpy(domain, s, len);
       +        domain[len] = '\0';
       +
       +        printf("uri: %s\n", uri);
       +        printf("domain: %s\n", domain);
       +
       +        /* DEBUG: timing */
       +        struct timespec tp_start, tp_end, tp_diff;
       +        if (clock_gettime(CLOCK_MONOTONIC, &tp_start) == -1) {
       +                fprintf(stderr, "clock_gettime: %s\n", strerror(errno));
       +        }
       +
       +        /* site-specific CSS: for CSS rules matchrule() only checks the domain */
       +        memset(&sitecss, 0, sizeof(sitecss));
       +        for (r = rules; r; r = r->next) {
       +                if (!r->css || !r->domains || !matchrule(r, "", "", domain))
       +                        continue;
       +                len = strlen(r->css);
       +                if (string_append(&sitecss, r->css, len) < len)
       +                        return 1;
       +                if (r->isexception) {
       +                        len = sizeof("{display:initial;}") -1;
       +                        if (string_append(&sitecss, "{display:initial;}", len) < len)
       +                                return 1;
       +                } else {
       +                        len = sizeof("{display:none;}") -1;
       +                        if (string_append(&sitecss, "{display:none;}", len) < len)
       +                                return 1;
       +                }
       +        }
       +        printf("sitecss: %s\n", sitecss.data ? sitecss.data : "<empty>");
       +
       +        /* DEBUG: timing */
       +        if (clock_gettime(CLOCK_MONOTONIC, &tp_end) == -1) {
       +                fprintf(stderr, "clock_gettime: %s\n", strerror(errno));
       +        }
       +
       +        tp_diff.tv_sec = tp_end.tv_sec - tp_start.tv_sec;
       +        tp_diff.tv_nsec = tp_end.tv_nsec - tp_start.tv_nsec;
       +        if (tp_diff.tv_nsec < 0) {
       +                tp_diff.tv_sec--;
       +                tp_diff.tv_nsec += 1000000000L;
       +        }
       +        /* tv_sec is a time_t, not a size_t: cast it for a portable format */
       +        printf("timing: %lld sec, %.3f ms\n",
       +                (long long)tp_diff.tv_sec, tp_diff.tv_nsec / 1000000.0);
       +
       +        if (globalcss.data)
       +                printf("global CSS length in bytes: %zu\n", strlen(globalcss.data));
       +        if (sitecss.data)
       +                printf("site CSS length in bytes: %zu\n", strlen(sitecss.data));
       +
       +        free(sitecss.data);
       +        cleanup();
       +
       +        return 0;
       +}
       +#endif /* DEBUG */