youtube: fix using the new layout and JSON extraction - frontends - front-ends for some sites (experiment) (DIR) Log (DIR) Files (DIR) Refs (DIR) README (DIR) LICENSE --- (DIR) commit a9b8d9a25d11ec18fdee7fa98ad93db35325672a (DIR) parent 6f3fa93b7099d8bf5df5ba3fc04958aedd1bb099 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org> Date: Thu, 3 Sep 2020 11:23:10 +0200 youtube: fix using the new layout and JSON extraction Instead of scraping HTML from the site it now extracts the initial JSON data and parses it. Diffstat: M youtube/youtube.c | 399 ++++++++++++------------------- 1 file changed, 149 insertions(+), 250 deletions(-) --- (DIR) diff --git a/youtube/youtube.c b/youtube/youtube.c @@ -11,291 +11,192 @@ #include <unistd.h> #include "https.h" +#include "json.h" #include "util.h" #include "youtube.h" -#include "xml.h" - -#define STRP(s) s,sizeof(s)-1 - -/* temporary variables to copy for states */ -static char id[256], userid[256]; - -/* states */ -static int metainfocount; -static enum ItemState { - None = 0, - Item = 1, Pager = 2, - Metainfo = 4, Title = 8, User = 16, Videotime = 32, -} state; - -static struct item *videos; -static size_t nvideos; static char * youtube_request(const char *path) { - return request("www.youtube.com", path, - "User-Agent: Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\r\n"); -} - -static int -isclassmatch(const char *classes, const char *clss, size_t len) -{ - const char *p; - - if (!(p = strstr(classes, clss))) - return 0; - return (p == classes || isspace((unsigned char)p[-1])) && - (isspace((unsigned char)p[len]) || !p[len]); -} - -/* XML/HTML entity conversion */ -static const char * -entitytostr(const char *s) -{ - static char buf[16]; - ssize_t len; - - if ((len = xml_entitytostr(s, buf, sizeof(buf))) > 0) - return buf; - - return s; + return request("www.youtube.com", path, ""); } -static void -xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al, - const char *v, size_t vl) +static char * +request_search(const char *s, const char *page, const char *order) { - /* grouped channel index, used for channelid and channel title */ - static int grouped = -1; - - if (!strcmp(t, "div") && !strcmp(a, "class") && isclassmatch(v, STRP("search-pager"))) { - /* last video */ - if (nvideos < MAX_VIDEOS && videos[nvideos].linktype) { - if (grouped != -1 && !videos[nvideos].channelid[0]) { - strlcpy(videos[nvideos].channelid, videos[grouped].channelid, sizeof(videos[nvideos].channelid)); - strlcpy(videos[nvideos].channeltitle, videos[grouped].channeltitle, sizeof(videos[nvideos].channeltitle)); - } - nvideos++; - } - state &= ~Item; - state |= Pager; - } - - if (nvideos >= MAX_VIDEOS) - return; + char path[4096]; - if (!strcmp(t, "div") && !strcmp(a, "class") && - isclassmatch(v, STRP("yt-lockup"))) { - state |= Item; - if (videos[nvideos].linktype) { - if (videos[nvideos].channelid[0] || videos[nvideos].userid[0] || - videos[nvideos].linktype != Video) - grouped = -1; - if (videos[nvideos].linktype == Channel) - grouped = nvideos; - if (grouped != -1 && !videos[nvideos].channelid[0]) { - strlcpy(videos[nvideos].channelid, videos[grouped].channelid, sizeof(videos[nvideos].channelid)); - strlcpy(videos[nvideos].channeltitle, videos[grouped].channeltitle, sizeof(videos[nvideos].channeltitle)); - } - nvideos++; - } - if (strstr(v, " yt-lockup-channel ")) - videos[nvideos].linktype = Channel; - else if (strstr(v, "yt-lockup-movie-")) - videos[nvideos].linktype = Movie; - else if (strstr(v, " yt-lockup-playlist ")) - videos[nvideos].linktype = Playlist; - if (strstr(v, " yt-lockup-video ")) - videos[nvideos].linktype = Video; - } - if (!(state & Item)) - return; + snprintf(path, sizeof(path), "/results?search_query=%s", s); - if (!strcmp(t, "span") && !strcmp(a, "class") && isclassmatch(v, STRP("video-time"))) - state |= Videotime; - if (!strcmp(t, "ul") && !strcmp(a, "class") && isclassmatch(v, STRP("yt-lockup-meta-info"))) { - state |= Metainfo; - metainfocount = 0; + if (page[0]) { + strlcat(path, "&page=", sizeof(path)); + strlcat(path, page, sizeof(path)); } - if (!strcmp(t, "h3") && !strcmp(a, "class") && isclassmatch(v, STRP("yt-lockup-title"))) - state |= Title; - if (!strcmp(t, "div") && !strcmp(a, "class") && isclassmatch(v, STRP("yt-lockup-byline"))) - state |= User; - if ((state & Title) && !strcmp(t, "a") && !strcmp(a, "title")) { - if (videos[nvideos].linktype == Channel) - strlcat(videos[nvideos].channeltitle, v, sizeof(videos[nvideos].channeltitle)); - else - strlcat(videos[nvideos].title, v, sizeof(videos[nvideos].title)); + if (order[0]) { + strlcat(path, "&search_sort=", sizeof(path)); + if (!strcmp(order, "date")) + strlcat(path, "video_date_uploaded", sizeof(path)); + else if (!strcmp(order, "relevance")) + strlcat(path, "video_relevance", sizeof(path)); + else if (!strcmp(order, "views")) + strlcat(path, "video_view_count", sizeof(path)); + else if (!strcmp(order, "rating")) + strlcat(path, "video_avg_rating", sizeof(path)); } - if ((state & Title) && !strcmp(t, "a") && !strcmp(a, "href")) - strlcat(id, v, sizeof(id)); - - if (!strcmp(t, "button") && !strcmp(a, "data-channel-external-id")) - strlcat(videos[nvideos].channelid, v, sizeof(videos[nvideos].channelid)); + /* check if request is too long (truncation) */ + if (strlen(path) >= sizeof(path) - 1) + return NULL; - if ((state & User) && !strcmp(t, "a") && !strcmp(a, "href")) - strlcat(userid, v, sizeof(userid)); + return youtube_request(path); } -static void -xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al, - const char *v, size_t vl) +int +extractjson(const char *s, char **start, char **end) { - const char *s; + if (!(*start = strstr(s, "window[\"ytInitialData\"] = "))) + return -1; + if (!(*end = strstr(*start, "};\n"))) + return -1; - if (!(state & Pager) && nvideos >= MAX_VIDEOS) - return; + (*start) += sizeof("window[\"ytInitialData\"] = ") - 1; + (*end)++; - s = entitytostr(v); - xmlattr(x, t, tl, a, al, s, strlen(s)); + return 0; } -static void -xmldata(XMLParser *x, const char *d, size_t dl) +void +processnode(struct json_node *nodes, size_t depth, const char *value, + void *pp) { - if ((state & Pager)) - return; + struct search_response *r = (struct search_response *)pp; + static struct item *item; - /* optimization: no need to process and must not process videos after this */ - if (!state || nvideos >= MAX_VIDEOS) + if (r->nitems > MAX_VIDEOS) return; - /* use parsed link type for meta info since this metainfo differs per type like: - channel, playlist, video */ - if ((state & Metainfo)) { - switch (videos[nvideos].linktype) { - case Playlist: - break; /* ignore */ - case Channel: - if (metainfocount == 1) - strlcat(videos[nvideos].channelvideos, d, sizeof(videos[nvideos].channelvideos)); - break; - default: - if (metainfocount == 1) - strlcat(videos[nvideos].publishedat, d, sizeof(videos[nvideos].publishedat)); - else if (metainfocount == 2) - strlcat(videos[nvideos].viewcount, d, sizeof(videos[nvideos].viewcount)); - } + /* new item, structures can be very deep, just check the end for: + (items|contents)[].videoRenderer objects */ + if (depth >= 3 && + nodes[depth - 3].type == TYPE_ARRAY && + nodes[depth - 2].type == TYPE_OBJECT && + nodes[depth - 1].type == TYPE_OBJECT && + (!strcmp(nodes[depth - 3].name, "items") || + !strcmp(nodes[depth - 3].name, "contents")) && + !strcmp(nodes[depth - 1].name, "videoRenderer")) { + r->nitems++; + return; } - if ((state & Videotime) && !strcmp(x->tag, "span")) - strlcat(videos[nvideos].duration, d, sizeof(videos[nvideos].duration)); - if ((state & User) && !strcmp(x->tag, "a")) - strlcat(videos[nvideos].channeltitle, d, sizeof(videos[nvideos].channeltitle)); -} - -static void -xmldataentity(XMLParser *x, const char *d, size_t dl) -{ - const char *s; - /* optimization: no need for entity conversion */ - if (!state || nvideos >= MAX_VIDEOS) + if (r->nitems == 0) return; - - s = entitytostr(d); - xmldata(x, s, strlen(s)); -} - -static void -xmltagend(XMLParser *x, const char *t, size_t tl, int isshort) -{ - char *p; - - if ((state & Metainfo) && !strcmp(t, "ul")) - state &= ~Metainfo; - if ((state & Title) && !strcmp(t, "h3")) { - state &= ~Title; - - if (nvideos >= MAX_VIDEOS) - return; - - if (!strncmp(id, "/watch", sizeof("/watch") - 1)) { - if (!videos[nvideos].linktype) - videos[nvideos].linktype = Video; - if ((p = getparam(id, "v"))) { - if (decodeparam(videos[nvideos].id, sizeof(videos[nvideos].id), p) == -1) - videos[nvideos].id[0] = '\0'; - } - } - - id[0] = '\0'; + item = &(r->items[r->nitems - 1]); + + if (depth >= 4 && + nodes[depth - 4].type == TYPE_ARRAY && + nodes[depth - 3].type == TYPE_OBJECT && + nodes[depth - 2].type == TYPE_OBJECT && + nodes[depth - 1].type == TYPE_STRING && + (!strcmp(nodes[depth - 4].name, "items") || + !strcmp(nodes[depth - 4].name, "contents")) && + !strcmp(nodes[depth - 2].name, "videoRenderer") && + !strcmp(nodes[depth - 1].name, "videoId")) { + strlcpy(item->id, value, sizeof(item->id)); } - if ((state & User)) { - state &= ~User; - if (nvideos >= MAX_VIDEOS) - return; + if (depth >= 7 && + nodes[depth - 7].type == TYPE_ARRAY && + nodes[depth - 6].type == TYPE_OBJECT && + nodes[depth - 5].type == TYPE_OBJECT && + nodes[depth - 4].type == TYPE_OBJECT && + nodes[depth - 3].type == TYPE_ARRAY && + nodes[depth - 2].type == TYPE_OBJECT && + nodes[depth - 1].type == TYPE_STRING && + (!strcmp(nodes[depth - 7].name, "items") || + !strcmp(nodes[depth - 7].name, "contents")) && + !strcmp(nodes[depth - 5].name, "videoRenderer") && + !strcmp(nodes[depth - 4].name, "title") && + !strcmp(nodes[depth - 3].name, "runs") && + !strcmp(nodes[depth - 1].name, "text") && + !item->title[0]) { + strlcpy(item->title, value, sizeof(item->title)); + } - /* can be user or channel */ - if (!strncmp(userid, "/channel/", sizeof("/channel/") - 1)) { - strlcpy(videos[nvideos].channelid, - userid + sizeof("/channel/") - 1, - sizeof(videos[nvideos].channelid)); - } else if (!strncmp(userid, "/user/", sizeof("/user/") - 1)) { - strlcpy(videos[nvideos].userid, - userid + sizeof("/user/") - 1, - sizeof(videos[nvideos].userid)); + if (depth >= 5 && + nodes[depth - 5].type == TYPE_ARRAY && + nodes[depth - 4].type == TYPE_OBJECT && + nodes[depth - 3].type == TYPE_OBJECT && + nodes[depth - 2].type == TYPE_OBJECT && + nodes[depth - 1].type == TYPE_STRING && + (!strcmp(nodes[depth - 5].name, "items") || + !strcmp(nodes[depth - 5].name, "contents")) && + !strcmp(nodes[depth - 3].name, "videoRenderer") && + !strcmp(nodes[depth - 1].name, "simpleText")) { + if (!strcmp(nodes[depth - 2].name, "viewCountText") && + !item->viewcount[0]) { + strlcpy(item->viewcount, value, sizeof(item->viewcount)); + } else if (!strcmp(nodes[depth - 2].name, "lengthText") && + !item->duration[0]) { + strlcpy(item->duration, value, sizeof(item->duration)); + } else if (!strcmp(nodes[depth - 2].name, "publishedTimeText") && + !item->publishedat[0]) { + strlcpy(item->publishedat, value, sizeof(item->publishedat)); } - - userid[0] = '\0'; } - if ((state & Videotime)) - state &= ~Videotime; -} -static void -xmltagstart(XMLParser *x, const char *t, size_t tl) -{ - if ((state & Metainfo) && !strcmp(t, "li")) - metainfocount++; -} - -static char * -request_search(const char *s, const char *page, const char *order) -{ - char path[4096]; - - snprintf(path, sizeof(path), "/results?search_query=%s", s); - if (page[0]) { - strlcat(path, "&page=", sizeof(path)); - strlcat(path, page, sizeof(path)); + if (depth >= 9 && + nodes[depth - 9].type == TYPE_ARRAY && + nodes[depth - 8].type == TYPE_OBJECT && + nodes[depth - 7].type == TYPE_OBJECT && + nodes[depth - 6].type == TYPE_OBJECT && + nodes[depth - 5].type == TYPE_ARRAY && + nodes[depth - 4].type == TYPE_OBJECT && + nodes[depth - 3].type == TYPE_OBJECT && + nodes[depth - 2].type == TYPE_OBJECT && + nodes[depth - 1].type == TYPE_STRING && + (!strcmp(nodes[depth - 9].name, "items") || + !strcmp(nodes[depth - 9].name, "contents")) && + !strcmp(nodes[depth - 7].name, "videoRenderer") && + !strcmp(nodes[depth - 6].name, "longBylineText") && + !strcmp(nodes[depth - 5].name, "runs") && + !strcmp(nodes[depth - 3].name, "navigationEndpoint") && + !strcmp(nodes[depth - 2].name, "browseEndpoint")) { + if (!strcmp(nodes[depth - 1].name, "browseId")) { + strlcpy(item->channelid, value, sizeof(item->channelid)); + } } - if (order[0]) { - strlcat(path, "&search_sort=", sizeof(path)); - if (!strcmp(order, "date")) - strlcat(path, "video_date_uploaded", sizeof(path)); - else if (!strcmp(order, "relevance")) - strlcat(path, "video_relevance", sizeof(path)); - else if (!strcmp(order, "views")) - strlcat(path, "video_view_count", sizeof(path)); - else if (!strcmp(order, "rating")) - strlcat(path, "video_avg_rating", sizeof(path)); + if (depth >= 7 && + nodes[depth - 7].type == TYPE_ARRAY && + nodes[depth - 6].type == TYPE_OBJECT && + nodes[depth - 5].type == TYPE_OBJECT && + nodes[depth - 4].type == TYPE_OBJECT && + nodes[depth - 3].type == TYPE_ARRAY && + nodes[depth - 2].type == TYPE_OBJECT && + nodes[depth - 1].type == TYPE_STRING && + (!strcmp(nodes[depth - 7].name, "items") || + !strcmp(nodes[depth - 7].name, "contents")) && + !strcmp(nodes[depth - 5].name, "videoRenderer") && + !strcmp(nodes[depth - 4].name, "longBylineText") && + !strcmp(nodes[depth - 3].name, "runs")) { + if (!strcmp(nodes[depth - 1].name, "text") && + !item->channeltitle[0]) { + strlcpy(item->channeltitle, value, sizeof(item->channeltitle)); + } } - - /* force older youtube layout, else youtube will try to randomly serve - a new layout sometimes breaking the parsing */ - strlcat(path, "&disable_polymer=1", sizeof(path)); - - /* check if request is too long (truncation) */ - if (strlen(path) >= sizeof(path) - 1) - return NULL; - - return youtube_request(path); } struct search_response * youtube_search(const char *rawsearch, const char *page, const char *order) { struct search_response *r; - XMLParser x = { 0 }; - char *data, *s; + char *data, *s, *start, *end; + int ret; if (!(data = request_search(rawsearch, page, order))) return NULL; + if (!(s = strstr(data, "\r\n\r\n"))) return NULL; /* invalid response */ /* skip header */ @@ -304,20 +205,18 @@ youtube_search(const char *rawsearch, const char *page, const char *order) if (!(r = calloc(1, sizeof(*r)))) return NULL; - nvideos = 0; - videos = r->items; - - x.xmlattr = xmlattr; - x.xmlattrentity = xmlattrentity; - x.xmldata = xmldata; - x.xmldataentity = xmldataentity; - x.xmltagend = xmltagend; - x.xmltagstart = xmltagstart; - - setxmldata(s, strlen(s)); - xml_parse(&x); + if (extractjson(s, &start, &end) == -1) { +// fprintf(stderr, "error extracting JSON"); + free(r); + return NULL; + } - r->nitems = nvideos; + ret = parsejson(start, end - start, processnode, r); + if (ret < 0) { +// fprintf(stderr, "error parsing JSON"); + free(r); + return NULL; + } return r; }