improvements - surf-adblock - Surf adblock web extension (HTM) git clone git://git.codemadness.org/surf-adblock (DIR) Log (DIR) Files (DIR) Refs (DIR) README (DIR) LICENSE --- (DIR) commit d781090ae7718310fb13c83c1a8406be46a613b8 (DIR) parent b6cc76e9fcac3112086f2d2348ef53b16b59da9d (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org> Date: Sat, 3 Jun 2017 21:54:05 +0200 improvements - WIP: faster matching for simple rules, just a test atm, but ~50ms max to ~20ms on my machine. - add support for exception rules. - debug.sh add debug script for testing: compile as main(). Diffstat: M TODO | 14 +++++++++++++- A debug.sh | 18 ++++++++++++++++++ M surf-adblock.c | 279 +++++++++++++++++++++++++------ 3 files changed, 255 insertions(+), 56 deletions(-) --- (DIR) diff --git a/TODO b/TODO @@ -1,5 +1,15 @@ - fix tweakers.net popup / rule. -- benchmark rule matching (timing). + this is in an exception rule... + + make sure exception rules are always below in the list? modify awk script? + +- performance: + - benchmark rule matching (timing). + - bloom filters? some kind of cache? + - optimize simple filter case. + +- support separator "^" = [/\?]? + - test it better. === @@ -23,6 +33,8 @@ Docs: and matchbegin or matchend set. - make less CPU intensive. - maybe even include it statically? + - optimize CSS rule matching (only per site?). + - optimize memory allocation. - optimize: pregenerate one global stylesheet that applies to all sites? - separate adblocker into daemon? not sure. (DIR) diff --git a/debug.sh b/debug.sh @@ -0,0 +1,18 @@ +#!/bin/sh +# ugly debug script: compile as standalone program for testing. + +cc -std=c99 -pedantic -Wall -Os -I. -I/usr/include -I/usr/X11R6/include \ + `pkg-config --cflags gtk+-3.0 webkit2gtk-4.0 webkit2gtk-web-extension-4.0` \ + -DVERSION=\"0.1\" -DWEBEXTDIR=\"/usr/local/lib/surf\" -D_DEFAULT_SOURCE \ + -DWEBEXTDIR=\"/usr/local/lib/surf\" \ + `pkg-config --cflags gtk+-3.0 webkit2gtk-4.0 webkit2gtk-web-extension-4.0` \ + -DDEBUG \ + -c surf-adblock.c +cc -s -L/usr/lib -lc -L/usr/X11R6/lib -lX11 \ + `pkg-config --libs gtk+-3.0 webkit2gtk-4.0 webkit2gtk-web-extension-4.0` -lgthread-2.0 -module -avoid-version -o surf-adblock \ + surf-adblock.o + +chmod +x surf-adblock +# NOTE: need to copy because of W^X. +doas cp surf-adblock /usr/local/bin +/usr/local/bin/surf-adblock (DIR) diff --git a/surf-adblock.c b/surf-adblock.c @@ -305,11 +305,11 @@ match(const char *pat, const char *str, int fcase) break; default: k = str_next(str, n, &sinc); + /* TODO: write a test-case */ + if (c == CARET && (k == '?' || k == '/' || k <= 0)) + return 1; if (k <= 0) return (c==END) ? 0 : 1; - if (c == CARET && (iswdigit(k) || iswalpha(k) || - strchr("_-.%", k))) - return 1; str += sinc; n -= sinc; kfold = fcase ? casefold(k) : k; @@ -410,7 +410,6 @@ match(const char *pat, const char *str, int fcase) return 0; } - /* domain=... if domain is prefixed with ~, ignore. multiple domains can be separated with | @@ -521,28 +520,63 @@ matchrule(struct filterrule *f, const char *uri, const char *type, /* NOTE: order matters, see FilterType enum values */ struct filterdomain *d; char pat[1024]; - int r; - - /* ignore exception rules for now, these are usually paid - * for by sites to allow advertisements. */ - if (f->isexception) - return 0; + int r, m; - if (f->css) { - r = f->domains ? 0 : 1; - for (d = f->domains; d; d = d->next) { - if (matchdomain(d->domain, domain)) { - if (r && d->inverse) - r = 0; - else if (!r && !d->inverse) - r = 1; - } else if (r && !d->inverse) { + r = f->domains ? 0 : 1; + for (d = f->domains; d; d = d->next) { + if (matchdomain(d->domain, domain)) { + if (r && d->inverse) r = 0; - } + else if (!r && !d->inverse) + r = 1; + } else if (r && !d->inverse) { + r = 0; } + } + if (f->css) { + /* DEBUG */ + if (f->isexception) + printf("DEBUG, exception rule, CSS: %s, match? %d\n", + f->css, r); return r; } +#if 1 + /* skip allow rule, TODO: inverse? */ + if (!r) + return 0; +#endif + +#if 1 + /* DEBUG: test, match if it is a simple pattern */ + char *p; + p = strchr(f->uri, '*'); + if (!p) + p = strchr(f->uri, '^'); + if (!p) { + /* TODO: write a test-case */ + if (f->block & FilterTypeMatchCase) { + if (f->matchbegin) + m = strncmp(uri, f->uri, strlen(f->uri)) == 0; + else if (f->matchend) + m = strlen(f->uri) <= strlen(uri) && + strcmp(&uri[strlen(uri) - strlen(f->uri)], f->uri) == 0; + else + m = strstr(uri, f->uri) ? 1 : 0; + } else { + if (f->matchbegin) + m = strncasecmp(uri, f->uri, strlen(f->uri)) == 0; + else if (f->matchend) + m = strlen(f->uri) <= strlen(uri) && + strcasecmp(&uri[strlen(uri) - strlen(f->uri)], f->uri) == 0; + else + m = strcasestr(uri, f->uri) ? 1 : 0; + } + /*m = r ? !m : m;*/ + return m; + } +#endif + r = snprintf(pat, sizeof(pat), "%s%s%s", f->matchbegin ? "" : "*", f->uri, @@ -552,19 +586,8 @@ matchrule(struct filterrule *f, const char *uri, const char *type, return 0; } - r = f->domains ? 0 : 1; - for (d = f->domains; d; d = d->next) { - if (matchdomain(d->domain, domain)) { - if (r && d->inverse) - r = 0; - else if (!r && !d->inverse) - r = 1; - } else if (r && !d->inverse) { - r = 0; - } - } - - if (r && !match(pat, uri, (f->block & FilterTypeMatchCase) ? 0 : 1)) { + m = 0; + if (!match(pat, uri, (f->block & FilterTypeMatchCase) ? 0 : 1)) { #if 0 for (; *type; type++) { for (i = 0; blockstr[i]; i++) { @@ -575,11 +598,13 @@ matchrule(struct filterrule *f, const char *uri, const char *type, } } } + return 0; #endif - return 1; + m = 1; } - return 0; + /*m = r ? !m : m;*/ + return m; } static int @@ -695,6 +720,7 @@ end: return 1; } +#if 0 static void debugrule(struct filterrule *r) { @@ -702,6 +728,7 @@ debugrule(struct filterrule *r) "%lu\n===\n", r->uri ? r->uri : "", r->css ? r->css : "", r->isexception, r->block); } +#endif static int loadrules(FILE *fp) @@ -775,6 +802,12 @@ documentloaded(WebKitWebPage *wp, Page *p) printf("uri: %s\n", uri); printf("domain: %s\n", domain); + /* DEBUG: timing */ + struct timespec tp_start, tp_end, tp_diff; + if (clock_gettime(CLOCK_MONOTONIC, &tp_start) == -1) { + fprintf(stderr, "clock_gettime: %s\n", strerror(errno)); + } + /* site-specific CSS */ memset(&sitecss, 0, sizeof(sitecss)); for (r = rules; r; r = r->next) { @@ -783,11 +816,38 @@ documentloaded(WebKitWebPage *wp, Page *p) len = strlen(r->css); if (string_append(&sitecss, r->css, len) < len) return; - len = sizeof("{display:none;}") -1; - if (string_append(&sitecss, "{display:none;}", len) < len) - return; + + if (r->isexception) { + len = sizeof("{display:initial;}") -1; + if (string_append(&sitecss, "{display:initial;}", len) < len) + return; + } else { + len = sizeof("{display:none;}") -1; + if (string_append(&sitecss, "{display:none;}", len) < len) + return; + } + } +/* printf("sitecss: %s\n", sitecss.data ? sitecss.data : "<empty>");*/ + + /* DEBUG: timing */ + if (clock_gettime(CLOCK_MONOTONIC, &tp_end) == -1) { + fprintf(stderr, "clock_gettime: %s\n", strerror(errno)); } - printf("sitecss: %s\n", sitecss.data ? sitecss.data : "<empty>"); + + tp_diff.tv_sec = tp_end.tv_sec - tp_start.tv_sec; + tp_diff.tv_nsec = tp_end.tv_nsec - tp_start.tv_nsec; + if (tp_diff.tv_nsec < 0) { + tp_diff.tv_sec--; + tp_diff.tv_nsec += 1000000000L; + } + + printf("timing: %zu sec, %.3f ms\n", + tp_diff.tv_sec, (float)tp_diff.tv_nsec / 1000000.0f); + + if (globalcss.data) + printf("global CSS length in bytes: %zu\n", strlen(globalcss.data)); + if (sitecss.data) + printf("site CSS length in bytes: %zu\n", strlen(sitecss.data)); p->view = webkit_dom_document_get_default_view(doc); @@ -819,6 +879,7 @@ sendrequest(WebKitWebPage *wp, WebKitURIRequest *req, const char *s, *uri = webkit_web_page_get_uri(p->webpage), *requri = webkit_uri_request_get_uri(req); size_t len; + gboolean status = FALSE; if (!uri || !strcmp(requri, uri) || (strncmp(uri, "http://", sizeof("http://") - 1) && @@ -830,6 +891,12 @@ sendrequest(WebKitWebPage *wp, WebKitURIRequest *req, memcpy(domain, s, len); domain[len] = '\0'; + /* DEBUG: timing */ + struct timespec tp_start, tp_end, tp_diff; + if (clock_gettime(CLOCK_MONOTONIC, &tp_start) == -1) { + fprintf(stderr, "clock_gettime: %s\n", strerror(errno)); + } + /* match rules */ for (r = rules; r; r = r->next) { if (!r->css && matchrule(r, requri, "csio^", domain)) { @@ -839,30 +906,32 @@ sendrequest(WebKitWebPage *wp, WebKitURIRequest *req, fprintf(stderr, "blocked: %s, %s\n", domain, requri); - return TRUE; + status = TRUE; + goto end; } } - return FALSE; -} - -static void -webpagecreated(WebKitWebExtension *e, WebKitWebPage *p, gpointer unused) -{ - Page *np; +end: + /* DEBUG: timing */ + if (clock_gettime(CLOCK_MONOTONIC, &tp_end) == -1) { + fprintf(stderr, "clock_gettime: %s\n", strerror(errno)); + } - if (!(np = newpage(p))) { - weprintf("cannot associate webext with new page: %s\n", - strerror(errno)); - return; + tp_diff.tv_sec = tp_end.tv_sec - tp_start.tv_sec; + tp_diff.tv_nsec = tp_end.tv_nsec - tp_start.tv_nsec; + if (tp_diff.tv_nsec < 0) { + tp_diff.tv_sec--; + tp_diff.tv_nsec += 1000000000L; } - g_signal_connect(p, "document-loaded", G_CALLBACK(documentloaded), np); - g_signal_connect(p, "send-request", G_CALLBACK(sendrequest), np); + printf("%s [%s] timing: %zu sec, %.3f ms\n", + requri, uri, tp_diff.tv_sec, (float)tp_diff.tv_nsec / 1000000.0f); + + return status; } -G_MODULE_EXPORT void -webkit_web_extension_initialize(WebKitWebExtension *ext) +void +init(void) { struct filterrule *r; FILE *fp; @@ -922,6 +991,106 @@ webkit_web_extension_initialize(WebKitWebExtension *ext) return; } } +} + +static void +webpagecreated(WebKitWebExtension *e, WebKitWebPage *p, gpointer unused) +{ + Page *np; + + if (!(np = newpage(p))) { + weprintf("cannot associate webext with new page: %s\n", + strerror(errno)); + return; + } + + g_signal_connect(p, "document-loaded", G_CALLBACK(documentloaded), np); + g_signal_connect(p, "send-request", G_CALLBACK(sendrequest), np); +} +G_MODULE_EXPORT void +webkit_web_extension_initialize(WebKitWebExtension *ext) +{ + init(); g_signal_connect(ext, "page-created", G_CALLBACK(webpagecreated), NULL); } + +#ifdef DEBUG +int +main(void) +{ + char domain[256]; + String sitecss; + struct filterrule *r; + const char *s, *uri; + size_t len; + + /* TEST */ + uri = "https://tweakers.net/"; + + if (!uri || (strncmp(uri, "http://", sizeof("http://") - 1) && + strncmp(uri, "https://", sizeof("https://") - 1))) + return; + + init(); + + s = strstr(uri, "://") + sizeof("://") - 1; + len = strcspn(s, "/"); + memcpy(domain, s, len); + domain[len] = '\0'; + + printf("uri: %s\n", uri); + printf("domain: %s\n", domain); + + /* DEBUG: timing */ + struct timespec tp_start, tp_end, tp_diff; + if (clock_gettime(CLOCK_MONOTONIC, &tp_start) == -1) { + fprintf(stderr, "clock_gettime: %s\n", strerror(errno)); + } + + /* site-specific CSS */ + memset(&sitecss, 0, sizeof(sitecss)); + for (r = rules; r; r = r->next) { + if (!r->css || !r->domains || !matchrule(r, "", "", domain)) + continue; + len = strlen(r->css); + if (string_append(&sitecss, r->css, len) < len) + return; + if (r->isexception) { + len = sizeof("{display:initial;}") -1; + if (string_append(&sitecss, "{display:initial;}", len) < len) + return; + } else { + len = sizeof("{display:none;}") -1; + if (string_append(&sitecss, "{display:none;}", len) < len) + return; + } + } + printf("sitecss: %s\n", sitecss.data ? sitecss.data : "<empty>"); + + /* DEBUG: timing */ + if (clock_gettime(CLOCK_MONOTONIC, &tp_end) == -1) { + fprintf(stderr, "clock_gettime: %s\n", strerror(errno)); + } + + tp_diff.tv_sec = tp_end.tv_sec - tp_start.tv_sec; + tp_diff.tv_nsec = tp_end.tv_nsec - tp_start.tv_nsec; + if (tp_diff.tv_nsec < 0) { + tp_diff.tv_sec--; + tp_diff.tv_nsec += 1000000000L; + } + + printf("timing: %zu sec, %.3f ms\n", + tp_diff.tv_sec, (float)tp_diff.tv_nsec / 1000000.0f); + + if (globalcss.data) + printf("global CSS length in bytes: %zu\n", strlen(globalcss.data)); + if (sitecss.data) + printf("site CSS length in bytes: %zu\n", strlen(sitecss.data)); + + free(sitecss.data); + cleanup(); + + return 0; +} +#endif