codemadness.org

       youtube: add channel2tsv output - frontends - front-ends for some sites (experiment)
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 2be30b4f834c64d4478e8cff231ee9b29601edc0
 (DIR) parent 0ddeddd9e7acba6abe47ccaf8563b712cf96a037
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sat, 11 Feb 2023 19:01:42 +0100
       
       youtube: add channel2tsv output
       
       * Make the parser a bit less strict so it can also parse the channel page
         with videos.
       * Add a function that can fetch the channel videos by channel ID.
       * Add a tool that outputs channel videos to a TAB-separated format.
       
       Diffstat:
         M Makefile                            |       6 +++++-
         A youtube/channel2tsv.c               |     108 +++++++++++++++++++++++++++++++
         M youtube/youtube.c                   |      93 +++++++++++++++++++++++--------
         M youtube/youtube.h                   |       3 +++
       
       4 files changed, 187 insertions(+), 23 deletions(-)
       ---
 (DIR) diff --git a/Makefile b/Makefile
       @@ -25,6 +25,7 @@ BIN = \
                reddit/cli \
                reddit/gopher \
                youtube/cgi \
       +        youtube/channel2tsv \
                youtube/cli \
                youtube/gopher
        
       @@ -97,11 +98,14 @@ twitch/cgi: ${LIB} twitch/twitch.o twitch/cgi.o
        twitch/gopher: ${LIB} twitch/twitch.o twitch/gopher.o
                ${CC} -o $@ twitch/gopher.o twitch/twitch.o ${LIB} ${LDFLAGS} ${LIBTLS_LDFLAGS_STATIC}
        
       -youtube: youtube/cgi youtube/cli youtube/gopher
       +youtube: youtube/cgi youtube/channel2tsv youtube/cli youtube/gopher
        
        youtube/cgi: ${LIB} youtube/youtube.o youtube/cgi.o
                ${CC} -o $@ youtube/cgi.o youtube/youtube.o ${LIB} ${LDFLAGS} ${LIBTLS_LDFLAGS_STATIC}
        
       +youtube/channel2tsv: ${LIB} youtube/youtube.o youtube/channel2tsv.o
       +        ${CC} -o $@ youtube/channel2tsv.o youtube/youtube.o ${LIB} ${LDFLAGS} ${LIBTLS_LDFLAGS}
       +
        youtube/cli: ${LIB} youtube/youtube.o youtube/cli.o
                ${CC} -o $@ youtube/cli.o youtube/youtube.o ${LIB} ${LDFLAGS} ${LIBTLS_LDFLAGS}
        
 (DIR) diff --git a/youtube/channel2tsv.c b/youtube/channel2tsv.c
       @@ -0,0 +1,108 @@
       +#include <sys/socket.h>
       +#include <sys/types.h>
       +
       +#include <ctype.h>
       +#include <errno.h>
       +#include <netdb.h>
       +#include <stdarg.h>
       +#include <stdio.h>
       +#include <stdlib.h>
       +#include <string.h>
       +#include <unistd.h>
       +
       +#include "https.h"
       +#include "util.h"
       +#include "youtube.h"
       +
       +#define OUT(s) fputs((s), stdout)
       +#define OUTESCAPE(s) printescape((s))
       +
       +/* write s to stdout, leaving out all control-characters */
       +void
       +printescape(const char *s)
       +{
       +        for (int c; (c = (unsigned char)*s) != '\0'; s++)
       +                if (!iscntrl(c))
       +                        fputc(c, stdout);
       +}
       +
       +int
       +render(struct search_response *r)
       +{
       +        struct item *v;
       +        size_t n;
       +
       +        /* from here on only the "stdio" promise is kept */
       +        if (pledge("stdio", NULL) == -1) {
       +                fprintf(stderr, "pledge: %s\n", strerror(errno));
       +                exit(1);
       +        }
       +
       +        /* one line per item: id, embed URL, title, publishedat, viewcount,
       +           duration; the fields are separated by a TAB */
       +        for (n = 0; n < r->nitems; n++) {
       +                v = &(r->items[n]);
       +
       +                if (v->linktype == Channel || v->linktype == Movie ||
       +                    v->linktype == Playlist)
       +                        continue;
       +
       +                OUTESCAPE(v->id);
       +                OUT("\t");
       +                if (v->id[0] != '\0') {
       +                        OUT("https://www.youtube.com/embed/");
       +                        OUTESCAPE(v->id);
       +                }
       +                OUT("\t");
       +                OUTESCAPE(v->title);
       +                OUT("\t");
       +                OUTESCAPE(v->publishedat);
       +                OUT("\t");
       +                OUTESCAPE(v->viewcount);
       +                OUT("\t");
       +                OUTESCAPE(v->duration);
       +                OUT("\n");
       +        }
       +
       +        return 0;
       +}
       +
       +static void
       +usage(const char *argv0)
       +{
       +        fprintf(stderr, "usage: %s <channelid>\n", argv0); /* synopsis goes to stderr */
       +        exit(1); /* never returns to the caller */
       +}
       +
       +int
       +main(int argc, char *argv[])
       +{
       +        char id[1024]; /* URI-encoded copy of argv[1] */
       +        struct search_response *res;
       +
       +        if (pledge("stdio dns inet rpath unveil", NULL) == -1) {
       +                fprintf(stderr, "pledge: %s\n", strerror(errno));
       +                exit(1);
       +        }
       +        /* make only the CA certificate file readable, then call
       +           unveil(NULL, NULL) so no further paths can be unveiled */
       +        if (unveil(TLS_CA_CERT_FILE, "r") == -1 ||
       +            unveil(NULL, NULL) == -1) {
       +                fprintf(stderr, "unveil: %s\n", strerror(errno));
       +                exit(1);
       +        }
       +
       +        /* a missing or empty channel ID, or one uriencode() rejects */
       +        if (argc < 2 || argv[1][0] == '\0' ||
       +            !uriencode(argv[1], id, sizeof(id)))
       +                usage(argv[0]);
       +
       +        /* no response or no items at all counts as failure */
       +        res = youtube_channel_videos(id);
       +        if (res == NULL || res->nitems == 0)
       +                exit(1);
       +
       +        render(res);
       +
       +        return 0;
       +}
 (DIR) diff --git a/youtube/youtube.c b/youtube/youtube.c
       @@ -22,6 +22,20 @@ youtube_request(const char *path)
        }
        
        static char *
       +request_channel_videos(const char *channelid)
       +{
       +        char path[4096];
       +        int len = snprintf(path, sizeof(path), "/channel/%s/videos", channelid);
       +
       +        /* a negative length is an snprintf() error, a length that does not
       +           fit in path means the request was truncated */
       +        if (len < 0 || sizeof(path) <= (size_t)len)
       +                return NULL;
       +
       +        return youtube_request(path);
       +}
       +
       +static char *
        request_search(const char *s, const char *page, const char *order)
        {
                char path[4096];
       @@ -90,11 +104,11 @@ processnode(struct json_node *nodes, size_t depth, const char *value,
                /* new item, structures can be very deep, just check the end for:
                   (items|contents)[].videoRenderer objects */
                if (depth >= 3 &&
       -            nodes[depth - 3].type == JSON_TYPE_ARRAY &&
       -            nodes[depth - 2].type == JSON_TYPE_OBJECT &&
       +//            nodes[depth - 3].type == JSON_TYPE_ARRAY &&
       +//            nodes[depth - 2].type == JSON_TYPE_OBJECT &&
                    nodes[depth - 1].type == JSON_TYPE_OBJECT &&
       -            (!strcmp(nodes[depth - 3].name, "items") ||
       -             !strcmp(nodes[depth - 3].name, "contents")) &&
       +//            (!strcmp(nodes[depth - 3].name, "items") ||
       +//             !strcmp(nodes[depth - 3].name, "contents")) &&
                    !strcmp(nodes[depth - 1].name, "videoRenderer")) {
                        r->nitems++;
                        return;
       @@ -105,27 +119,28 @@ processnode(struct json_node *nodes, size_t depth, const char *value,
                item = &(r->items[r->nitems - 1]);
        
                if (depth >= 4 &&
       -            nodes[depth - 4].type == JSON_TYPE_ARRAY &&
       -            nodes[depth - 3].type == JSON_TYPE_OBJECT &&
       -            nodes[depth - 2].type == JSON_TYPE_OBJECT &&
       +//            nodes[depth - 4].type == JSON_TYPE_ARRAY &&
       +//            nodes[depth - 3].type == JSON_TYPE_OBJECT &&
       +//            nodes[depth - 2].type == JSON_TYPE_OBJECT &&
                    nodes[depth - 1].type == JSON_TYPE_STRING &&
       -            (!strcmp(nodes[depth - 4].name, "items") ||
       -             !strcmp(nodes[depth - 4].name, "contents")) &&
       +//            (!strcmp(nodes[depth - 4].name, "items") ||
       +//             !strcmp(nodes[depth - 4].name, "contents")) &&
                    !strcmp(nodes[depth - 2].name, "videoRenderer") &&
                    !strcmp(nodes[depth - 1].name, "videoId")) {
                        strlcpy(item->id, value, sizeof(item->id));
                }
        
                if (depth >= 7 &&
       -            nodes[depth - 7].type == JSON_TYPE_ARRAY &&
       -            nodes[depth - 6].type == JSON_TYPE_OBJECT &&
       +//            nodes[depth - 7].type == JSON_TYPE_ARRAY &&
       +//            nodes[depth - 6].type == JSON_TYPE_OBJECT &&
                    nodes[depth - 5].type == JSON_TYPE_OBJECT &&
                    nodes[depth - 4].type == JSON_TYPE_OBJECT &&
                    nodes[depth - 3].type == JSON_TYPE_ARRAY &&
                    nodes[depth - 2].type == JSON_TYPE_OBJECT &&
                    nodes[depth - 1].type == JSON_TYPE_STRING &&
       -            (!strcmp(nodes[depth - 7].name, "items") ||
       -             !strcmp(nodes[depth - 7].name, "contents")) &&
       +//            (!strcmp(nodes[depth - 7].name, "items") ||
       +//             !strcmp(nodes[depth - 7].name, "contents")) &&
       +
                    !strcmp(nodes[depth - 5].name, "videoRenderer") &&
                    !strcmp(nodes[depth - 4].name, "title") &&
                    !strcmp(nodes[depth - 3].name, "runs") &&
       @@ -135,13 +150,13 @@ processnode(struct json_node *nodes, size_t depth, const char *value,
                }
        
                if (depth >= 5 &&
       -            nodes[depth - 5].type == JSON_TYPE_ARRAY &&
       +//            nodes[depth - 5].type == JSON_TYPE_ARRAY &&
                    nodes[depth - 4].type == JSON_TYPE_OBJECT &&
                    nodes[depth - 3].type == JSON_TYPE_OBJECT &&
                    nodes[depth - 2].type == JSON_TYPE_OBJECT &&
                    nodes[depth - 1].type == JSON_TYPE_STRING &&
       -            (!strcmp(nodes[depth - 5].name, "items") ||
       -             !strcmp(nodes[depth - 5].name, "contents")) &&
       +//            (!strcmp(nodes[depth - 5].name, "items") ||
       +//             !strcmp(nodes[depth - 5].name, "contents")) &&
                    !strcmp(nodes[depth - 3].name, "videoRenderer") &&
                    !strcmp(nodes[depth - 1].name, "simpleText")) {
                        if (!strcmp(nodes[depth - 2].name, "viewCountText") &&
       @@ -157,7 +172,7 @@ processnode(struct json_node *nodes, size_t depth, const char *value,
                }
        
                if (depth >= 9 &&
       -            nodes[depth - 9].type == JSON_TYPE_ARRAY &&
       +//            nodes[depth - 9].type == JSON_TYPE_ARRAY &&
                    nodes[depth - 8].type == JSON_TYPE_OBJECT &&
                    nodes[depth - 7].type == JSON_TYPE_OBJECT &&
                    nodes[depth - 6].type == JSON_TYPE_OBJECT &&
       @@ -166,8 +181,8 @@ processnode(struct json_node *nodes, size_t depth, const char *value,
                    nodes[depth - 3].type == JSON_TYPE_OBJECT &&
                    nodes[depth - 2].type == JSON_TYPE_OBJECT &&
                    nodes[depth - 1].type == JSON_TYPE_STRING &&
       -            (!strcmp(nodes[depth - 9].name, "items") ||
       -             !strcmp(nodes[depth - 9].name, "contents")) &&
       +//            (!strcmp(nodes[depth - 9].name, "items") ||
       +//             !strcmp(nodes[depth - 9].name, "contents")) &&
                    !strcmp(nodes[depth - 7].name, "videoRenderer") &&
                    !strcmp(nodes[depth - 6].name, "longBylineText") &&
                    !strcmp(nodes[depth - 5].name, "runs") &&
       @@ -179,15 +194,15 @@ processnode(struct json_node *nodes, size_t depth, const char *value,
                }
        
                if (depth >= 7 &&
       -            nodes[depth - 7].type == JSON_TYPE_ARRAY &&
       +//            nodes[depth - 7].type == JSON_TYPE_ARRAY &&
                    nodes[depth - 6].type == JSON_TYPE_OBJECT &&
                    nodes[depth - 5].type == JSON_TYPE_OBJECT &&
                    nodes[depth - 4].type == JSON_TYPE_OBJECT &&
                    nodes[depth - 3].type == JSON_TYPE_ARRAY &&
                    nodes[depth - 2].type == JSON_TYPE_OBJECT &&
                    nodes[depth - 1].type == JSON_TYPE_STRING &&
       -            (!strcmp(nodes[depth - 7].name, "items") ||
       -             !strcmp(nodes[depth - 7].name, "contents")) &&
       +//            (!strcmp(nodes[depth - 7].name, "items") ||
       +//             !strcmp(nodes[depth - 7].name, "contents")) &&
                    !strcmp(nodes[depth - 5].name, "videoRenderer") &&
                    !strcmp(nodes[depth - 4].name, "longBylineText") &&
                    !strcmp(nodes[depth - 3].name, "runs")) {
       @@ -231,3 +246,37 @@ youtube_search(const char *rawsearch, const char *page, const char *order)
        
                return r;
        }
       +
       +/* Fetch the videos page of channelid and collect its items via processnode().
       +   Returns a calloc()ed response, or NULL on any failure. */
       +struct search_response *
       +youtube_channel_videos(const char *channelid)
       +{
       +        struct search_response *r;
       +        char *data, *s, *start, *end;
       +
       +        if (!(data = request_channel_videos(channelid)))
       +                return NULL;
       +
       +        if (!(s = strstr(data, "\r\n\r\n")))
       +                return NULL; /* invalid response */
       +        /* skip header */
       +        s += strlen("\r\n\r\n");
       +
       +        if (!(r = calloc(1, sizeof(*r))))
       +                return NULL;
       +
       +        if (extractjson(s, &start, &end) == -1) {
       +                fprintf(stderr, "error extracting JSON\n");
       +                free(r);
       +                return NULL;
       +        }
       +
       +        if (parsejson(start, end - start, processnode, r) < 0) {
       +                /* NOTE(review): parse errors are not reported on stderr */
       +                free(r);
       +                return NULL;
       +        }
       +
       +        return r;
       +}
 (DIR) diff --git a/youtube/youtube.h b/youtube/youtube.h
       @@ -19,3 +19,6 @@ struct search_response {
        
        struct search_response *
        youtube_search(const char *rawsearch, const char *page, const char *order);
       +
       +struct search_response *
       +youtube_channel_videos(const char *channelid);