codemadness.org

       youtube: fix using the new layout and JSON extraction - frontends - front-ends for some sites (experiment)
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit a9b8d9a25d11ec18fdee7fa98ad93db35325672a
 (DIR) parent 6f3fa93b7099d8bf5df5ba3fc04958aedd1bb099
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Thu,  3 Sep 2020 11:23:10 +0200
       
       youtube: fix using the new layout and JSON extraction
       
       Instead of scraping HTML from the site, it now extracts the initial JSON data
       and parses it.
       
       Diffstat:
         M youtube/youtube.c                   |     399 ++++++++++++-------------------
       
       1 file changed, 149 insertions(+), 250 deletions(-)
       ---
 (DIR) diff --git a/youtube/youtube.c b/youtube/youtube.c
       @@ -11,291 +11,192 @@
        #include <unistd.h>
        
        #include "https.h"
       +#include "json.h"
        #include "util.h"
        #include "youtube.h"
       -#include "xml.h"
       -
       -#define STRP(s) s,sizeof(s)-1
       -
       -/* temporary variables to copy for states */
       -static char id[256], userid[256];
       -
       -/* states */
       -static int metainfocount;
       -static enum ItemState {
       -        None  = 0,
       -        Item  = 1, Pager = 2,
       -        Metainfo = 4, Title = 8, User = 16, Videotime = 32,
       -} state;
       -
       -static struct item *videos;
       -static size_t nvideos;
        
        static char *
        youtube_request(const char *path)
        {
       -        return request("www.youtube.com", path,
       -                       "User-Agent: Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\r\n");
       -}
       -
       -static int
       -isclassmatch(const char *classes, const char *clss, size_t len)
       -{
       -        const char *p;
       -
       -        if (!(p = strstr(classes, clss)))
       -                return 0;
       -        return (p == classes || isspace((unsigned char)p[-1])) &&
       -                (isspace((unsigned char)p[len]) || !p[len]);
       -}
       -
       -/* XML/HTML entity conversion */
       -static const char *
       -entitytostr(const char *s)
       -{
       -        static char buf[16];
       -        ssize_t len;
       -
       -        if ((len = xml_entitytostr(s, buf, sizeof(buf))) > 0)
       -                return buf;
       -
       -        return s;
       +        return request("www.youtube.com", path, "");
        }
        
       -static void
       -xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
       -        const char *v, size_t vl)
       +static char *
       +request_search(const char *s, const char *page, const char *order)
        {
       -        /* grouped channel index, used for channelid and channel title */
       -        static int grouped = -1;
       -
       -        if (!strcmp(t, "div") && !strcmp(a, "class") && isclassmatch(v, STRP("search-pager"))) {
       -                /* last video */
       -                if (nvideos < MAX_VIDEOS && videos[nvideos].linktype) {
       -                        if (grouped != -1 && !videos[nvideos].channelid[0]) {
       -                                strlcpy(videos[nvideos].channelid, videos[grouped].channelid, sizeof(videos[nvideos].channelid));
       -                                strlcpy(videos[nvideos].channeltitle, videos[grouped].channeltitle, sizeof(videos[nvideos].channeltitle));
       -                        }
       -                        nvideos++;
       -                }
       -                state &= ~Item;
       -                state |= Pager;
       -        }
       -
       -        if (nvideos >= MAX_VIDEOS)
       -                return;
       +        char path[4096];
        
       -        if (!strcmp(t, "div") && !strcmp(a, "class") &&
       -                isclassmatch(v, STRP("yt-lockup"))) {
       -                state |= Item;
       -                if (videos[nvideos].linktype) {
       -                        if (videos[nvideos].channelid[0] || videos[nvideos].userid[0] ||
       -                            videos[nvideos].linktype != Video)
       -                                grouped = -1;
       -                        if (videos[nvideos].linktype == Channel)
       -                                grouped = nvideos;
       -                        if (grouped != -1 && !videos[nvideos].channelid[0]) {
       -                                strlcpy(videos[nvideos].channelid, videos[grouped].channelid, sizeof(videos[nvideos].channelid));
       -                                strlcpy(videos[nvideos].channeltitle, videos[grouped].channeltitle, sizeof(videos[nvideos].channeltitle));
       -                        }
       -                        nvideos++;
       -                }
       -                if (strstr(v, " yt-lockup-channel "))
       -                        videos[nvideos].linktype = Channel;
       -                else if (strstr(v, "yt-lockup-movie-"))
       -                        videos[nvideos].linktype = Movie;
       -                else if (strstr(v, " yt-lockup-playlist "))
       -                        videos[nvideos].linktype = Playlist;
       -                if (strstr(v, " yt-lockup-video "))
       -                        videos[nvideos].linktype = Video;
       -        }
       -        if (!(state & Item))
       -                return;
       +        snprintf(path, sizeof(path), "/results?search_query=%s", s);
        
       -        if (!strcmp(t, "span") && !strcmp(a, "class") && isclassmatch(v, STRP("video-time")))
       -                state |= Videotime;
       -        if (!strcmp(t, "ul") && !strcmp(a, "class") && isclassmatch(v, STRP("yt-lockup-meta-info"))) {
       -                state |= Metainfo;
       -                metainfocount = 0;
       +        if (page[0]) {
       +                strlcat(path, "&page=", sizeof(path));
       +                strlcat(path, page, sizeof(path));
                }
       -        if (!strcmp(t, "h3") && !strcmp(a, "class") && isclassmatch(v, STRP("yt-lockup-title")))
       -                state |= Title;
       -        if (!strcmp(t, "div") && !strcmp(a, "class") && isclassmatch(v, STRP("yt-lockup-byline")))
       -                state |= User;
        
       -        if ((state & Title) && !strcmp(t, "a") && !strcmp(a, "title")) {
       -                if (videos[nvideos].linktype == Channel)
       -                        strlcat(videos[nvideos].channeltitle, v, sizeof(videos[nvideos].channeltitle));
       -                else
       -                        strlcat(videos[nvideos].title, v, sizeof(videos[nvideos].title));
       +        if (order[0]) {
       +                strlcat(path, "&search_sort=", sizeof(path));
       +                if (!strcmp(order, "date"))
       +                        strlcat(path, "video_date_uploaded", sizeof(path));
       +                else if (!strcmp(order, "relevance"))
       +                        strlcat(path, "video_relevance", sizeof(path));
       +                else if (!strcmp(order, "views"))
       +                        strlcat(path, "video_view_count", sizeof(path));
       +                else if (!strcmp(order, "rating"))
       +                        strlcat(path, "video_avg_rating", sizeof(path));
                }
        
       -        if ((state & Title) && !strcmp(t, "a") && !strcmp(a, "href"))
       -                strlcat(id, v, sizeof(id));
       -
       -        if (!strcmp(t, "button") && !strcmp(a, "data-channel-external-id"))
       -                strlcat(videos[nvideos].channelid, v, sizeof(videos[nvideos].channelid));
       +        /* check if request is too long (truncation) */
       +        if (strlen(path) >= sizeof(path) - 1)
       +                return NULL;
        
       -        if ((state & User) && !strcmp(t, "a") && !strcmp(a, "href"))
       -                strlcat(userid, v, sizeof(userid));
       +        return youtube_request(path);
        }
        
       -static void
       -xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
       -              const char *v, size_t vl)
       +int
       +extractjson(const char *s, char **start, char **end)
        {
       -        const char *s;
       +        if (!(*start = strstr(s, "window[\"ytInitialData\"] = ")))
       +                return -1;
       +        if (!(*end = strstr(*start, "};\n")))
       +                return -1;
        
       -        if (!(state & Pager) && nvideos >= MAX_VIDEOS)
       -                return;
       +        (*start) += sizeof("window[\"ytInitialData\"] = ") - 1;
       +        (*end)++;
        
       -        s = entitytostr(v);
       -        xmlattr(x, t, tl, a, al, s, strlen(s));
       +        return 0;
        }
        
       -static void
       -xmldata(XMLParser *x, const char *d, size_t dl)
       +void
       +processnode(struct json_node *nodes, size_t depth, const char *value,
       +        void *pp)
        {
       -        if ((state & Pager))
       -                return;
       +        struct search_response *r = (struct search_response *)pp;
       +        static struct item *item;
        
       -        /* optimization: no need to process and must not process videos after this */
       -        if (!state || nvideos >= MAX_VIDEOS)
       +        if (r->nitems > MAX_VIDEOS)
                        return;
        
       -        /* use parsed link type for meta info since this metainfo differs per type like:
       -           channel, playlist, video */
       -        if ((state & Metainfo)) {
       -                switch (videos[nvideos].linktype) {
       -                case Playlist:
       -                        break; /* ignore */
       -                case Channel:
       -                        if (metainfocount == 1)
       -                                strlcat(videos[nvideos].channelvideos, d, sizeof(videos[nvideos].channelvideos));
       -                        break;
       -                default:
       -                        if (metainfocount == 1)
       -                                strlcat(videos[nvideos].publishedat, d, sizeof(videos[nvideos].publishedat));
       -                        else if (metainfocount == 2)
       -                                strlcat(videos[nvideos].viewcount, d, sizeof(videos[nvideos].viewcount));
       -                }
       +        /* new item, structures can be very deep, just check the end for:
       +           (items|contents)[].videoRenderer objects */
       +        if (depth >= 3 &&
       +            nodes[depth - 3].type == TYPE_ARRAY &&
       +            nodes[depth - 2].type == TYPE_OBJECT &&
       +            nodes[depth - 1].type == TYPE_OBJECT &&
       +            (!strcmp(nodes[depth - 3].name, "items") ||
       +             !strcmp(nodes[depth - 3].name, "contents")) &&
       +            !strcmp(nodes[depth - 1].name, "videoRenderer")) {
       +                r->nitems++;
       +                return;
                }
       -        if ((state & Videotime) && !strcmp(x->tag, "span"))
       -                strlcat(videos[nvideos].duration, d, sizeof(videos[nvideos].duration));
       -        if ((state & User) && !strcmp(x->tag, "a"))
       -                strlcat(videos[nvideos].channeltitle, d, sizeof(videos[nvideos].channeltitle));
       -}
       -
       -static void
       -xmldataentity(XMLParser *x, const char *d, size_t dl)
       -{
       -        const char *s;
        
       -        /* optimization: no need for entity conversion */
       -        if (!state || nvideos >= MAX_VIDEOS)
       +        if (r->nitems == 0)
                        return;
       -
       -        s = entitytostr(d);
       -        xmldata(x, s, strlen(s));
       -}
       -
       -static void
       -xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
       -{
       -        char *p;
       -
       -        if ((state & Metainfo) && !strcmp(t, "ul"))
       -                state &= ~Metainfo;
       -        if ((state & Title) && !strcmp(t, "h3")) {
       -                state &= ~Title;
       -
       -                if (nvideos >= MAX_VIDEOS)
       -                        return;
       -
       -                if (!strncmp(id, "/watch", sizeof("/watch") - 1)) {
       -                        if (!videos[nvideos].linktype)
       -                                videos[nvideos].linktype = Video;
       -                        if ((p = getparam(id, "v"))) {
       -                                if (decodeparam(videos[nvideos].id, sizeof(videos[nvideos].id), p) == -1)
       -                                        videos[nvideos].id[0] = '\0';
       -                        }
       -                }
       -
       -                id[0] = '\0';
       +        item = &(r->items[r->nitems - 1]);
       +
       +        if (depth >= 4 &&
       +            nodes[depth - 4].type == TYPE_ARRAY &&
       +            nodes[depth - 3].type == TYPE_OBJECT &&
       +            nodes[depth - 2].type == TYPE_OBJECT &&
       +            nodes[depth - 1].type == TYPE_STRING &&
       +            (!strcmp(nodes[depth - 4].name, "items") ||
       +             !strcmp(nodes[depth - 4].name, "contents")) &&
       +            !strcmp(nodes[depth - 2].name, "videoRenderer") &&
       +            !strcmp(nodes[depth - 1].name, "videoId")) {
       +                strlcpy(item->id, value, sizeof(item->id));
                }
       -        if ((state & User)) {
       -                state &= ~User;
        
       -                if (nvideos >= MAX_VIDEOS)
       -                        return;
       +        if (depth >= 7 &&
       +            nodes[depth - 7].type == TYPE_ARRAY &&
       +            nodes[depth - 6].type == TYPE_OBJECT &&
       +            nodes[depth - 5].type == TYPE_OBJECT &&
       +            nodes[depth - 4].type == TYPE_OBJECT &&
       +            nodes[depth - 3].type == TYPE_ARRAY &&
       +            nodes[depth - 2].type == TYPE_OBJECT &&
       +            nodes[depth - 1].type == TYPE_STRING &&
       +            (!strcmp(nodes[depth - 7].name, "items") ||
       +             !strcmp(nodes[depth - 7].name, "contents")) &&
       +            !strcmp(nodes[depth - 5].name, "videoRenderer") &&
       +            !strcmp(nodes[depth - 4].name, "title") &&
       +            !strcmp(nodes[depth - 3].name, "runs") &&
       +            !strcmp(nodes[depth - 1].name, "text") &&
       +                !item->title[0]) {
       +                strlcpy(item->title, value, sizeof(item->title));
       +        }
        
       -                /* can be user or channel */
       -                if (!strncmp(userid, "/channel/", sizeof("/channel/") - 1)) {
       -                        strlcpy(videos[nvideos].channelid,
       -                                userid + sizeof("/channel/") - 1,
       -                                sizeof(videos[nvideos].channelid));
       -                } else if (!strncmp(userid, "/user/", sizeof("/user/") - 1)) {
       -                        strlcpy(videos[nvideos].userid,
       -                                userid + sizeof("/user/") - 1,
       -                                sizeof(videos[nvideos].userid));
       +        if (depth >= 5 &&
       +            nodes[depth - 5].type == TYPE_ARRAY &&
       +            nodes[depth - 4].type == TYPE_OBJECT &&
       +            nodes[depth - 3].type == TYPE_OBJECT &&
       +            nodes[depth - 2].type == TYPE_OBJECT &&
       +            nodes[depth - 1].type == TYPE_STRING &&
       +            (!strcmp(nodes[depth - 5].name, "items") ||
       +             !strcmp(nodes[depth - 5].name, "contents")) &&
       +            !strcmp(nodes[depth - 3].name, "videoRenderer") &&
       +            !strcmp(nodes[depth - 1].name, "simpleText")) {
       +                if (!strcmp(nodes[depth - 2].name, "viewCountText") &&
       +                    !item->viewcount[0]) {
       +                        strlcpy(item->viewcount, value, sizeof(item->viewcount));
       +                } else if (!strcmp(nodes[depth - 2].name, "lengthText") &&
       +                    !item->duration[0]) {
       +                        strlcpy(item->duration, value, sizeof(item->duration));
       +                } else if (!strcmp(nodes[depth - 2].name, "publishedTimeText") &&
       +                    !item->publishedat[0]) {
       +                        strlcpy(item->publishedat, value, sizeof(item->publishedat));
                        }
       -
       -                userid[0] = '\0';
                }
       -        if ((state & Videotime))
       -                state &= ~Videotime;
       -}
        
       -static void
       -xmltagstart(XMLParser *x, const char *t, size_t tl)
       -{
       -        if ((state & Metainfo) && !strcmp(t, "li"))
       -                metainfocount++;
       -}
       -
       -static char *
       -request_search(const char *s, const char *page, const char *order)
       -{
       -        char path[4096];
       -
       -        snprintf(path, sizeof(path), "/results?search_query=%s", s);
       -        if (page[0]) {
       -                strlcat(path, "&page=", sizeof(path));
       -                strlcat(path, page, sizeof(path));
       +        if (depth >= 9 &&
       +            nodes[depth - 9].type == TYPE_ARRAY &&
       +            nodes[depth - 8].type == TYPE_OBJECT &&
       +            nodes[depth - 7].type == TYPE_OBJECT &&
       +            nodes[depth - 6].type == TYPE_OBJECT &&
       +            nodes[depth - 5].type == TYPE_ARRAY &&
       +            nodes[depth - 4].type == TYPE_OBJECT &&
       +            nodes[depth - 3].type == TYPE_OBJECT &&
       +            nodes[depth - 2].type == TYPE_OBJECT &&
       +            nodes[depth - 1].type == TYPE_STRING &&
       +            (!strcmp(nodes[depth - 9].name, "items") ||
       +             !strcmp(nodes[depth - 9].name, "contents")) &&
       +            !strcmp(nodes[depth - 7].name, "videoRenderer") &&
       +            !strcmp(nodes[depth - 6].name, "longBylineText") &&
       +            !strcmp(nodes[depth - 5].name, "runs") &&
       +            !strcmp(nodes[depth - 3].name, "navigationEndpoint") &&
       +            !strcmp(nodes[depth - 2].name, "browseEndpoint")) {
       +                if (!strcmp(nodes[depth - 1].name, "browseId")) {
       +                        strlcpy(item->channelid, value, sizeof(item->channelid));
       +                }
                }
        
       -        if (order[0]) {
       -                strlcat(path, "&search_sort=", sizeof(path));
       -                if (!strcmp(order, "date"))
       -                        strlcat(path, "video_date_uploaded", sizeof(path));
       -                else if (!strcmp(order, "relevance"))
       -                        strlcat(path, "video_relevance", sizeof(path));
       -                else if (!strcmp(order, "views"))
       -                        strlcat(path, "video_view_count", sizeof(path));
       -                else if (!strcmp(order, "rating"))
       -                        strlcat(path, "video_avg_rating", sizeof(path));
       +        if (depth >= 7 &&
       +            nodes[depth - 7].type == TYPE_ARRAY &&
       +            nodes[depth - 6].type == TYPE_OBJECT &&
       +            nodes[depth - 5].type == TYPE_OBJECT &&
       +            nodes[depth - 4].type == TYPE_OBJECT &&
       +            nodes[depth - 3].type == TYPE_ARRAY &&
       +            nodes[depth - 2].type == TYPE_OBJECT &&
       +            nodes[depth - 1].type == TYPE_STRING &&
       +            (!strcmp(nodes[depth - 7].name, "items") ||
       +             !strcmp(nodes[depth - 7].name, "contents")) &&
       +            !strcmp(nodes[depth - 5].name, "videoRenderer") &&
       +            !strcmp(nodes[depth - 4].name, "longBylineText") &&
       +            !strcmp(nodes[depth - 3].name, "runs")) {
       +                if (!strcmp(nodes[depth - 1].name, "text") &&
       +                    !item->channeltitle[0]) {
       +                        strlcpy(item->channeltitle, value, sizeof(item->channeltitle));
       +                }
                }
       -
       -        /* force older youtube layout, else youtube will try to randomly serve
       -           a new layout sometimes breaking the parsing */
       -        strlcat(path, "&disable_polymer=1", sizeof(path));
       -
       -        /* check if request is too long (truncation) */
       -        if (strlen(path) >= sizeof(path) - 1)
       -                return NULL;
       -
       -        return youtube_request(path);
        }
        
        struct search_response *
        youtube_search(const char *rawsearch, const char *page, const char *order)
        {
                struct search_response *r;
       -        XMLParser x = { 0 };
       -        char *data, *s;
       +        char *data, *s, *start, *end;
       +        int ret;
        
                if (!(data = request_search(rawsearch, page, order)))
                        return NULL;
       +
                if (!(s = strstr(data, "\r\n\r\n")))
                        return NULL; /* invalid response */
                /* skip header */
       @@ -304,20 +205,18 @@ youtube_search(const char *rawsearch, const char *page, const char *order)
                if (!(r = calloc(1, sizeof(*r))))
                        return NULL;
        
       -        nvideos = 0;
       -        videos = r->items;
       -
       -        x.xmlattr = xmlattr;
       -        x.xmlattrentity = xmlattrentity;
       -        x.xmldata = xmldata;
       -        x.xmldataentity = xmldataentity;
       -        x.xmltagend = xmltagend;
       -        x.xmltagstart = xmltagstart;
       -
       -        setxmldata(s, strlen(s));
       -        xml_parse(&x);
       +        if (extractjson(s, &start, &end) == -1) {
       +//                fprintf(stderr, "error extracting JSON");
       +                free(r);
       +                return NULL;
       +        }
        
       -        r->nitems = nvideos;
       +        ret = parsejson(start, end - start, processnode, r);
       +        if (ret < 0) {
       +//                fprintf(stderr, "error parsing JSON");
       +                free(r);
       +                return NULL;
       +        }
        
                return r;
        }