add initial version of youtube/feed - frontends - front-ends for some sites (experiment) (DIR) Log (DIR) Files (DIR) Refs (DIR) README (DIR) LICENSE --- (DIR) commit f5a6863b5397d1cc3ad31de291be11fae6256b5f (DIR) parent 7b18c287f2fcf98227ff2ec1fdd4eeb8050e8166 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org> Date: Wed, 10 May 2023 01:10:51 +0200 add initial version of youtube/feed This fetches the Youtube Atom feed and the channel videos and combines the data. It can output: - Atom - sfeed(5) - JSON / JSON Feed It can run in command-line and CGI mode. For now it only adds the video duration in the title and filters away Youtube shorts. The Atom parser is based on sfeed. Diffstat: M Makefile | 4 ++++ M util.h | 7 +++++++ A youtube/feed.c | 1001 +++++++++++++++++++++++++++++++ 3 files changed, 1012 insertions(+), 0 deletions(-) --- (DIR) diff --git a/Makefile b/Makefile @@ -22,6 +22,7 @@ LIBTLS_LDFLAGS_STATIC = -ltls -lssl -lcrypto -static BIN = \ youtube/cgi \ youtube/cli \ + youtube/feed \ youtube/gopher SRC = ${BIN:=.c} \ @@ -68,6 +69,9 @@ youtube/cgi: ${LIB} youtube/youtube.o youtube/cgi.o youtube/cli: ${LIB} youtube/youtube.o youtube/cli.o ${CC} -o $@ youtube/cli.o youtube/youtube.o ${LIB} ${LDFLAGS} ${LIBTLS_LDFLAGS} +youtube/feed: ${LIB} youtube/youtube.o youtube/feed.o + ${CC} -o $@ youtube/feed.o youtube/youtube.o ${LIB} ${LDFLAGS} ${LIBTLS_LDFLAGS_STATIC} + youtube/gopher: ${LIB} youtube/youtube.o youtube/gopher.o ${CC} -o $@ youtube/gopher.o youtube/youtube.o ${LIB} ${LDFLAGS} ${LIBTLS_LDFLAGS_STATIC} (DIR) diff --git a/util.h b/util.h @@ -3,6 +3,13 @@ #define unveil(p1,p2) 0 #endif +/* ctype-like macros, but always compatible with ASCII / UTF-8 */ +#define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26) +#define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f) +#define ISDIGIT(c) (((unsigned)c) - '0' < 10) +#define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5)) +#define TOLOWER(c) ((((unsigned)c) - 'A' < 26) ? ((c) | 32) : (c)) + #undef strlcat size_t strlcat(char *, const char *, size_t); #undef strlcpy (DIR) diff --git a/youtube/feed.c b/youtube/feed.c @@ -0,0 +1,1001 @@ +#include <err.h> +#include <errno.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <time.h> + +#include "https.h" +#include "util.h" +#include "youtube.h" +#include "xml.h" + +#define ISINCONTENT(ctx) ((ctx).iscontent && !((ctx).iscontenttag)) +#define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag) + +/* string and byte-length */ +#define STRP(s) s,sizeof(s)-1 + +enum FeedType { + FeedTypeNone = 0, + FeedTypeAtom = 2 +}; + +/* String data / memory pool */ +typedef struct string { + char *data; /* data */ + size_t len; /* string length */ + size_t bufsiz; /* allocated size */ +} String; + +/* NOTE: the order of these fields (content, date, author) indicate the + * priority to use them, from least important to high. */ +enum TagId { + TagUnknown = 0, + /* Atom */ + /* creation date has higher priority */ + AtomTagPublished, + AtomTagTitle, + AtomTagMediaDescription, + AtomTagId, + AtomTagLink, + AtomTagLinkAlternate, + AtomTagAuthor, AtomTagAuthorName, + TagYoutubeVideoId, + TagLast +}; + +typedef struct feedtag { + char *name; /* name of tag to match */ + size_t len; /* len of `name` */ + enum TagId id; /* unique ID */ +} FeedTag; + +typedef struct field { + String str; + enum TagId tagid; /* tagid set previously, used for tag priority */ +} FeedField; + +enum { + /* sfeed fields */ + FeedFieldTime = 0, FeedFieldTitle, FeedFieldLink, FeedFieldContent, + FeedFieldId, FeedFieldAuthor, FeedFieldEnclosure, FeedFieldCategory, + FeedFieldYoutubeId, /* yt:videoId */ + FeedFieldLast +}; + +typedef struct feedcontext { + String *field; /* current FeedItem field String */ + FeedField fields[FeedFieldLast]; /* data for current item */ + FeedTag tag; /* unique current parsed tag */ + int iscontent; /* in content data */ + int iscontenttag; /* in content tag */ + enum FeedType feedtype; +} FeedContext; + +static long long datetounix(long long, int, int, int, int, int); +static FeedTag * gettag(enum FeedType, const char *, size_t); +static long gettzoffset(const char *); +static int isattr(const char *, size_t, const char *, size_t); +static int istag(const char *, size_t, const char *, size_t); +static int parsetime(const char *, long long *); + +static void atom_header(void); +static void atom_item(void); +static void atom_footer(void); +static void json_header(void); +static void json_item(void); +static void json_footer(void); +static void sfeed_item(void); /* TSV / sfeed */ + +static void string_append(String *, const char *, size_t); +static void string_buffer_realloc(String *, size_t); +static void string_clear(String *); +static void string_print_encoded(String *); +static void string_print_timestamp(String *); +static void string_print(String *); +static void xmlattr(XMLParser *, const char *, size_t, const char *, size_t, + const char *, size_t); +static void xmlattrentity(XMLParser *, const char *, size_t, const char *, + size_t, const char *, size_t); +static void xmlattrstart(XMLParser *, const char *, size_t, const char *, + size_t); +static void xmldata(XMLParser *, const char *, size_t); +static void xmldataentity(XMLParser *, const char *, size_t); +static void xmltagend(XMLParser *, const char *, size_t, int); +static void xmltagstart(XMLParser *, const char *, size_t); +static void xmltagstartparsed(XMLParser *, const char *, size_t, int); + +/* Atom, must be alphabetical order */ +static const FeedTag atomtags[] = { + { STRP("author"), AtomTagAuthor }, + { STRP("id"), AtomTagId }, + /* Atom: <link href="" />, RSS has <link></link> */ + { STRP("link"), AtomTagLink }, + { STRP("media:description"), AtomTagMediaDescription }, + { STRP("published"), AtomTagPublished }, + { STRP("title"), AtomTagTitle }, + { STRP("yt:videoId"), TagYoutubeVideoId } +}; + +/* special case: nested <author><name> */ +static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor }; +static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName }; + +/* reference to no / unknown tag */ +static const FeedTag notag = { STRP(""), TagUnknown }; + +/* map TagId type to RSS/Atom field, all tags must be defined */ +static const int fieldmap[TagLast] = { + [TagUnknown] = -1, + /* Atom */ + [AtomTagPublished] = FeedFieldTime, + [AtomTagTitle] = FeedFieldTitle, + [AtomTagMediaDescription] = FeedFieldContent, + [AtomTagId] = FeedFieldId, + [AtomTagLink] = -1, + [AtomTagLinkAlternate] = FeedFieldLink, + [AtomTagAuthor] = -1, + [AtomTagAuthorName] = FeedFieldAuthor, + [TagYoutubeVideoId] = FeedFieldYoutubeId +}; + +static const int FieldSeparator = '\t'; + +static FeedContext ctx; +static XMLParser parser; /* XML parser state */ +static String attrrel, tmpstr; + +static struct search_response *search_res = NULL; +static void (*printfields)(void) = sfeed_item; +static int cgimode = 0; + +static int +tagcmp(const void *v1, const void *v2) +{ + return strcasecmp(((FeedTag *)v1)->name, ((FeedTag *)v2)->name); +} + +/* Unique tagid for parsed tag name. */ +static FeedTag * +gettag(enum FeedType feedtype, const char *name, size_t namelen) +{ + FeedTag f, *r = NULL; + + f.name = (char *)name; + + switch (feedtype) { + case FeedTypeAtom: + r = bsearch(&f, atomtags, sizeof(atomtags) / sizeof(atomtags[0]), + sizeof(atomtags[0]), tagcmp); + break; + default: + break; + } + + return r; +} + +/* Clear string only; don't free, prevents unnecessary reallocation. */ +static void +string_clear(String *s) +{ + if (s->data) + s->data[0] = '\0'; + s->len = 0; +} + +static void +string_buffer_realloc(String *s, size_t newlen) +{ + size_t alloclen; + + if (newlen > SIZE_MAX / 2) { + alloclen = SIZE_MAX; + } else { + for (alloclen = 64; alloclen <= newlen; alloclen *= 2) + ; + } + if (!(s->data = realloc(s->data, alloclen))) + err(1, "realloc"); + s->bufsiz = alloclen; +} + +/* Append data to String, s->data and data may not overlap. */ +static void +string_append(String *s, const char *data, size_t len) +{ + if (!len) + return; + + if (s->len >= SIZE_MAX - len) { + errno = ENOMEM; + err(1, "realloc"); + } + + /* check if allocation is necessary, never shrink the buffer. */ + if (s->len + len >= s->bufsiz) + string_buffer_realloc(s, s->len + len + 1); + memcpy(s->data + s->len, data, len); + s->len += len; + s->data[s->len] = '\0'; +} + +/* Print text, encode TABs, newlines and '\', remove other whitespace. + * Remove leading and trailing whitespace. */ +static void +string_print_encoded(String *s) +{ + const char *p, *e; + + if (!s->data || !s->len) + return; + + p = s->data; + e = p + strlen(p); + + for (; *p && p != e; p++) { + switch (*p) { + case '\n': putchar('\\'); putchar('n'); break; + case '\\': putchar('\\'); putchar('\\'); break; + case '\t': putchar('\\'); putchar('t'); break; + default: + /* ignore control chars */ + if (!ISCNTRL((unsigned char)*p)) + putchar(*p); + break; + } + } +} + +/* Print text, replace TABs, carriage return and other whitespace with ' '. + * Other control chars are removed. Remove leading and trailing whitespace. */ +static void +string_print(String *s) +{ + char *p, *e; + + if (!s->data || !s->len) + return; + + p = s->data; + e = p + s->len; + for (; *p && p != e; p++) { + if (ISSPACE((unsigned char)*p)) + putchar(' '); /* any whitespace to space */ + else if (!ISCNTRL((unsigned char)*p)) + /* ignore other control chars */ + putchar(*p); + } +} + +/* Print as UNIX timestamp, print nothing if the time is empty or invalid. */ +static void +string_print_timestamp(String *s) +{ + long long t; + + if (!s->data || !s->len) + return; + + if (parsetime(s->data, &t) != -1) + printf("%lld", t); +} + +/* Convert time fields. Returns a signed (at least) 64-bit UNIX timestamp. + Parameters should be passed as they are in a struct tm: + that is: year = year - 1900, month = month - 1. */ +static long long +datetounix(long long year, int mon, int day, int hour, int min, int sec) +{ + /* seconds in a month in a regular (non-leap) year */ + static const long secs_through_month[] = { + 0, 31 * 86400, 59 * 86400, 90 * 86400, + 120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400, + 243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 }; + int is_leap = 0, cycles, centuries = 0, leaps = 0, rem; + long long t; + + /* optimization: handle common range year 1902 up to and including 2038 */ + if (year - 2ULL <= 136) { + /* amount of leap days relative to 1970: every 4 years */ + leaps = (year - 68) >> 2; + if (!((year - 68) & 3)) { + leaps--; + is_leap = 1; + } else { + is_leap = 0; + } + t = 31536000 * (year - 70) + (86400 * leaps); /* 365 * 86400 = 31536000 */ + } else { + /* general leap year calculation: + leap years occur mostly every 4 years but every 100 years + a leap year is skipped unless the year is divisible by 400 */ + cycles = (year - 100) / 400; + rem = (year - 100) % 400; + if (rem < 0) { + cycles--; + rem += 400; + } + if (!rem) { + is_leap = 1; + } else { + if (rem >= 300) + centuries = 3, rem -= 300; + else if (rem >= 200) + centuries = 2, rem -= 200; + else if (rem >= 100) + centuries = 1, rem -= 100; + if (rem) { + leaps = rem / 4U; + rem %= 4U; + is_leap = !rem; + } + } + leaps += (97 * cycles) + (24 * centuries) - is_leap; + + /* adjust 8 leap days from 1970 up to and including 2000: + ((30 * 365) + 8) * 86400 = 946771200 */ + t = ((year - 100) * 31536000LL) + (leaps * 86400LL) + 946771200LL; + } + t += secs_through_month[mon]; + if (is_leap && mon >= 2) + t += 86400; + t += 86400LL * (day - 1); + t += 3600LL * hour; + t += 60LL * min; + t += sec; + + return t; +} + +/* Get timezone from string, return time offset in seconds from UTC. + * NOTE: only parses timezones in RFC-822, many other timezone names are + * ambiguous anyway. + * ANSI and military zones are defined wrong in RFC822 and are unsupported, + * see note on RFC2822 4.3 page 32. */ +static long +gettzoffset(const char *s) +{ + const char *p; + long tzhour = 0, tzmin = 0; + size_t i; + + switch (*s) { + case '-': /* offset */ + case '+': + for (i = 0, p = s + 1; i < 2 && ISDIGIT((unsigned char)*p); i++, p++) + tzhour = (tzhour * 10) + (*p - '0'); + if (*p == ':') + p++; + for (i = 0; i < 2 && ISDIGIT((unsigned char)*p); i++, p++) + tzmin = (tzmin * 10) + (*p - '0'); + return ((tzhour * 3600) + (tzmin * 60)) * (s[0] == '-' ? -1 : 1); + default: /* timezone name */ + break; + } + return 0; +} + +/* Parse time string `s` into the UNIX timestamp `tp`. + Returns 0 on success or -1 on failure. */ +static int +parsetime(const char *s, long long *tp) +{ + int va[6] = { 0 }, i, v, vi; + + /* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */ + if (!ISDIGIT((unsigned char)s[0]) || + !ISDIGIT((unsigned char)s[1]) || + !ISDIGIT((unsigned char)s[2]) || + !ISDIGIT((unsigned char)s[3])) + return -1; + + /* parse time parts (and possibly remaining date parts) */ + for (vi = 0; *s && vi < 6; vi++) { + for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) && + ISDIGIT((unsigned char)*s); s++, i++) { + v = (v * 10) + (*s - '0'); + } + va[vi] = v; + + if ((vi < 2 && *s == '-') || + (vi == 2 && (*s == 'T' || ISSPACE((unsigned char)*s))) || + (vi > 2 && *s == ':')) + s++; + } + + /* invalid range */ + if (va[0] < 0 || va[0] > 9999 || + va[1] < 1 || va[1] > 12 || + va[2] < 1 || va[2] > 31 || + va[3] < 0 || va[3] > 23 || + va[4] < 0 || va[4] > 59 || + va[5] < 0 || va[5] > 60) /* allow leap second */ + return -1; + + *tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) - + gettzoffset(s); + + return 0; +} + +static void +atom_header(void) +{ + fputs("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + "<feed xmlns=\"http://www.w3.org/2005/Atom\">\n" + "\t<title>Newsfeed</title>\n", stdout); +} + +static void +atom_footer(void) +{ + fputs("</feed>\n", stdout); +} + +static void +atom_item(void) +{ + struct item *v, *found = NULL; + size_t i; + + /* must have a video id */ + if (!ctx.fields[FeedFieldYoutubeId].str.len) + return; + + for (i = 0; i < search_res->nitems; i++) { + v = &(search_res->items[i]); + if (!strcmp(ctx.fields[FeedFieldYoutubeId].str.data, v->id)) + found = v; + } + /* Only print the video if it was found in the feed aswell. + This way it filters away shorts too. */ + if (!found) + return; + + fputs("<entry>\n\t<title>", stdout); + xmlencode(ctx.fields[FeedFieldTitle].str.data); + if (found->duration[0]) { + fputs(" [", stdout); + xmlencode(found->duration); + fputs("]", stdout); + } + fputs("</title>\n", stdout); + if (ctx.fields[FeedFieldLink].str.len) { + fputs("\t<link rel=\"alternate\" href=\"", stdout); + xmlencode(ctx.fields[FeedFieldLink].str.data); + fputs("\" />\n", stdout); + } + /* prefer link over id for Atom <id>. */ + fputs("\t<id>", stdout); + if (ctx.fields[FeedFieldLink].str.len) + xmlencode(ctx.fields[FeedFieldLink].str.data); + else if (ctx.fields[FeedFieldId].str.len) + xmlencode(ctx.fields[FeedFieldId].str.data); + fputs("</id>\n", stdout); + + /* just print the original timestamp, it should conform */ + fputs("\t<updated>", stdout); + string_print(&ctx.fields[FeedFieldTime].str); + fputs("</updated>\n", stdout); + + if (ctx.fields[FeedFieldAuthor].str.len) { + fputs("\t<author><name>", stdout); + xmlencode(ctx.fields[FeedFieldAuthor].str.data); + fputs("</name></author>\n", stdout); + } + if (ctx.fields[FeedFieldContent].str.len) { + fputs("\t<content>", stdout); + xmlencode(ctx.fields[FeedFieldContent].str.data); + fputs("</content>\n", stdout); + } + fputs("</entry>\n", stdout); +} + +static void +json_header(void) +{ + fputs("{\n" + "\"version\": \"https://jsonfeed.org/version/1.1\",\n" + "\"title\": \"Newsfeed\",\n" + "\"items\": [\n", stdout); +} + +static void +json_footer(void) +{ + fputs("]\n}\n", stdout); +} + +static void +json_printfield(const char *s) +{ + for (; *s; s++) { + if (*s == '\\') + fputs("\\\\", stdout); + else if (*s == '"') + fputs("\\\"", stdout); + else if (ISCNTRL((unsigned char)*s)) + printf("\\u00%02x", (unsigned char)*s); + else + putchar(*s); + } +} + +static void +json_item(void) +{ + static int json_firstitem = 1; + struct item *v, *found = NULL; + size_t i; + + /* must have a video id */ + if (!ctx.fields[FeedFieldYoutubeId].str.len) + return; + + for (i = 0; i < search_res->nitems; i++) { + v = &(search_res->items[i]); + if (!strcmp(ctx.fields[FeedFieldYoutubeId].str.data, v->id)) + found = v; + } + /* Only print the video if it was found in the feed aswell. + This way it filters away shorts too. */ + if (!found) + return; + + if (!json_firstitem) + fputs(",\n", stdout); + json_firstitem = 0; + + fputs("{\n\t\"id\": \"", stdout); + json_printfield(ctx.fields[FeedFieldId].str.data); + fputs("\"", stdout); + + /* just print the original timestamp, it should conform */ + fputs(",\n\t\"date_published\": \"", stdout); + string_print(&ctx.fields[FeedFieldTime].str); + fputs("\"", stdout); + + fputs(",\n\t\"title\": \"", stdout); + json_printfield(ctx.fields[FeedFieldTitle].str.data); + if (found->duration[0]) { + fputs(" [", stdout); + json_printfield(found->duration); + fputs("]", stdout); + } + fputs("\"", stdout); + + if (ctx.fields[FeedFieldLink].str.len) { + fputs(",\n\t\"url\": \"", stdout); + json_printfield(ctx.fields[FeedFieldLink].str.data); + fputs("\"", stdout); + } + + if (ctx.fields[FeedFieldAuthor].str.len) { + fputs(",\n\t\"authors\": [{\"name\": \"", stdout); + json_printfield(ctx.fields[FeedFieldAuthor].str.data); + fputs("\"}]", stdout); + } + + fputs(",\n\t\"content_text\": \"", stdout); + json_printfield(ctx.fields[FeedFieldContent].str.data); + fputs("\"\n}", stdout); +} + +static void +sfeed_item(void) +{ + struct item *v, *found = NULL; + size_t i; + + /* must have a video id */ + if (!ctx.fields[FeedFieldYoutubeId].str.len) + return; + + for (i = 0; i < search_res->nitems; i++) { + v = &(search_res->items[i]); + if (!strcmp(ctx.fields[FeedFieldYoutubeId].str.data, v->id)) + found = v; + } + /* Only print the video if it was found in the feed aswell. + This way it filters away shorts too. */ + if (!found) + return; + + string_print_timestamp(&ctx.fields[FeedFieldTime].str); + putchar(FieldSeparator); + string_print(&ctx.fields[FeedFieldTitle].str); + if (found->duration[0]) { + fputs(" [", stdout); + fputs(found->duration, stdout); + fputs("]", stdout); + } + putchar(FieldSeparator); + string_print(&ctx.fields[FeedFieldLink].str); + putchar(FieldSeparator); + string_print_encoded(&ctx.fields[FeedFieldContent].str); + putchar(FieldSeparator); + fputs("plain", stdout); + putchar(FieldSeparator); + string_print(&ctx.fields[FeedFieldId].str); + putchar(FieldSeparator); + string_print(&ctx.fields[FeedFieldAuthor].str); + putchar(FieldSeparator); + /* no/empty enclosure */ + putchar(FieldSeparator); + /* empty category */ + putchar('\n'); +} + +static int +istag(const char *name, size_t len, const char *name2, size_t len2) +{ + return (len == len2 && !strcasecmp(name, name2)); +} + +static int +isattr(const char *name, size_t len, const char *name2, size_t len2) +{ + return (len == len2 && !strcasecmp(name, name2)); +} + +static void +xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl, + const char *v, size_t vl) +{ + if (ISINCONTENT(ctx)) + return; + + if (!ctx.tag.id) + return; + + if (ISCONTENTTAG(ctx)) + return; + + if (ctx.tag.id == AtomTagLink) { + if (isattr(n, nl, STRP("rel"))) { + string_append(&attrrel, v, vl); + } else if (isattr(n, nl, STRP("href"))) { + string_append(&tmpstr, v, vl); + } + } +} + +static void +xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl, + const char *data, size_t datalen) +{ + char buf[8]; + int len; + + if (ISINCONTENT(ctx)) + return; + + if (!ctx.tag.id) + return; + + /* try to translate entity, else just pass as data to + * xmlattr handler. */ + if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0) + xmlattr(p, t, tl, n, nl, buf, (size_t)len); + else + xmlattr(p, t, tl, n, nl, data, datalen); +} + +static void +xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl) +{ + if (ISINCONTENT(ctx)) + return; + + if (attrrel.len && isattr(n, nl, STRP("rel"))) + string_clear(&attrrel); + else if (tmpstr.len && + (isattr(n, nl, STRP("href")) || + isattr(n, nl, STRP("url")))) + string_clear(&tmpstr); /* use the last value for multiple attribute values */ +} + +static void +xmldata(XMLParser *p, const char *s, size_t len) +{ + if (!ctx.field) + return; + + string_append(ctx.field, s, len); +} + +static void +xmldataentity(XMLParser *p, const char *data, size_t datalen) +{ + char buf[8]; + int len; + + if (!ctx.field) + return; + + /* try to translate entity, else just pass as data to + * xmldata handler. */ + if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0) + xmldata(p, buf, (size_t)len); + else + xmldata(p, data, datalen); +} + +static void +xmltagstart(XMLParser *p, const char *t, size_t tl) +{ + const FeedTag *f; + + if (ISINCONTENT(ctx)) + return; + + /* start of RSS or Atom item / entry */ + if (ctx.feedtype == FeedTypeNone) { + if (istag(t, tl, STRP("entry"))) + ctx.feedtype = FeedTypeAtom; + return; + } + + /* field tagid already set or nested tags. */ + if (ctx.tag.id) { + /* nested <author><name> for Atom */ + if (ctx.tag.id == AtomTagAuthor && + istag(t, tl, STRP("name"))) { + memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ctx.tag)); + } else { + return; /* other nested tags are not allowed: return */ + } + } + + /* in item */ + if (ctx.tag.id == TagUnknown) { + if (!(f = gettag(ctx.feedtype, t, tl))) + f = ¬ag; + memcpy(&(ctx.tag), f, sizeof(ctx.tag)); + } + + ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent); + string_clear(&attrrel); +} + +static void +xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) +{ + enum TagId tagid; + + if (ISINCONTENT(ctx)) + return; + + /* set tag type based on its attribute value */ + if (ctx.tag.id == AtomTagLink) { + /* empty or "alternate": other types could be + "enclosure", "related", "self" or "via" */ + if (!attrrel.len || isattr(attrrel.data, attrrel.len, STRP("alternate"))) + ctx.tag.id = AtomTagLinkAlternate; + else + ctx.tag.id = AtomTagLink; /* unknown */ + } + + tagid = ctx.tag.id; + + /* map tag type to field: unknown or lesser priority is ignored, + when tags of the same type are repeated only the first is used. */ + if (fieldmap[tagid] == -1 || + tagid <= ctx.fields[fieldmap[tagid]].tagid) { + return; + } + + if (ctx.iscontenttag) { + ctx.iscontent = 1; + ctx.iscontenttag = 0; + } + + ctx.field = &(ctx.fields[fieldmap[tagid]].str); + ctx.fields[fieldmap[tagid]].tagid = tagid; + + /* clear field if it is overwritten (with a priority order) for the new + value, if the field can have multiple values then do not clear it. */ + string_clear(ctx.field); +} + +static void +xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) +{ + size_t i; + + if (ctx.feedtype == FeedTypeNone) + return; + + if (ISINCONTENT(ctx)) { + /* not a closed content field */ + if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) + return; + } else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) { + /* matched tag end: close it */ + } else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom && + istag(t, tl, STRP("entry"))))) /* Atom */ + { + /* end of Atom entry */ + printfields(); + + /* clear strings */ + for (i = 0; i < FeedFieldLast; i++) { + string_clear(&ctx.fields[i].str); + ctx.fields[i].tagid = TagUnknown; + } + /* allow parsing of Atom and RSS concatenated in one XML stream. */ + ctx.feedtype = FeedTypeNone; + } else { + return; /* not end of field */ + } + + /* temporary string: for fields that cannot be processed + directly and need more context, for example by its tag + attributes, like the Atom link rel="alternate|enclosure". */ + if (tmpstr.len && ctx.field) { + string_clear(ctx.field); + string_append(ctx.field, tmpstr.data, tmpstr.len); + } + + /* close field */ + string_clear(&tmpstr); /* reuse and clear temporary string */ + + if (ctx.tag.id == AtomTagAuthorName) + memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* outer tag */ + else + memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag)); + + ctx.iscontent = 0; + ctx.field = NULL; +} + +static char * +request_channel_feed(const char *channelid) +{ + char path[2048]; + int r; + + r = snprintf(path, sizeof(path), "/feeds/videos.xml?channel_id=%s", channelid); + /* check if request is too long (truncation) */ + if (r < 0 || (size_t)r >= sizeof(path)) + return NULL; + + return request("www.youtube.com", path, ""); +} + +int +isvalidchannel(const char *s) +{ + size_t len; + + for (len = 0; *s; s++, len++) { + if (ISALPHA((unsigned char)*s) || + ISDIGIT((unsigned char)*s) || + *s == '-' || *s == '_') + continue; + return 0; + } + + return *s == '\0' && len == 24; +} + +void +usage(void) +{ + if (cgimode) { + fputs("Status: 400 Bad Request\r\n", stdout); + fputs("Content-Type: text/plain; charset=utf-8\r\n\r\n", stdout); + fputs("400 Bad Request\n", stdout); + exit(0); + } else { + fputs("usage: feed <channelid> [atom|json|tsv]\n", stderr); + exit(1); + } +} + +int +main(int argc, char *argv[]) +{ + char buf[256]; + const char *channelid = NULL; + char *data, *format = "tsv", *p, *requesturi, *tmp; + size_t i; + + if (pledge("stdio dns inet rpath unveil", NULL) == -1) + err(1, "pledge"); + + if ((tmp = getenv("REQUEST_URI"))) { + cgimode = 1; + + strlcpy(buf, tmp, sizeof(buf)); + requesturi = buf; + + if (!(p = strrchr(requesturi, '/'))) + usage(); + + channelid = p + 1; + if ((p = strrchr(channelid, '.'))) { + *p = '\0'; /* NULL terminate */ + format = p + 1; + } + } else { + if (argc <= 1) + usage(); + + channelid = argv[1]; + if (argc > 2) + format = argv[2]; + } + if (!channelid || !isvalidchannel(channelid)) + usage(); + + if (!strcmp(format, "atom") || !strcmp(format, "xml")) + printfields = atom_item; + else if (!strcmp(format, "json")) + printfields = json_item; + else if (!strcmp(format, "tsv") || !strcmp(format, "sfeed")) + printfields = sfeed_item; + else + usage(); + + search_res = youtube_channel_videos(channelid); + if (!search_res || search_res->nitems == 0) { + /* error or no videos found */ + return 0; + } + + if (!(data = request_channel_feed(channelid))) + return 1; /* error, no data at all */ + + if (pledge("stdio", NULL) == -1) + err(1, "pledge"); + + setxmldata(data, strlen(data)); + + memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag)); + + parser.xmlattr = xmlattr; + parser.xmlattrentity = xmlattrentity; + parser.xmlattrstart = xmlattrstart; + parser.xmlcdata = xmldata; + parser.xmldata = xmldata; + parser.xmldataentity = xmldataentity; + parser.xmltagend = xmltagend; + parser.xmltagstart = xmltagstart; + parser.xmltagstartparsed = xmltagstartparsed; + + /* init all fields, make sure it has a value */ + for (i = 0; i < FeedFieldLast; i++) { + string_append(&(ctx.fields[i].str), " ", 1); + string_clear(&(ctx.fields[i].str)); + } + + if (cgimode) { + fputs("Status: 200 OK\r\n", stdout); + if (!strcmp(format, "atom") || !strcmp(format, "xml")) + fputs("Content-Type: text/xml; charset=utf-8\r\n\r\n", stdout); + else if (!strcmp(format, "json")) + fputs("Content-Type: application/json; charset=utf-8\r\n\r\n", stdout); + else + fputs("Content-Type: text/plain; charset=utf-8\r\n\r\n", stdout); + } + + if (!strcmp(format, "atom") || !strcmp(format, "xml")) + atom_header(); + else if (!strcmp(format, "json")) + json_header(); + + /* NOTE: getnext is defined in xml.h for inline optimization */ + xml_parse(&parser); + + if (!strcmp(format, "atom")) + atom_footer(); + else if (!strcmp(format, "json")) + json_footer(); + + return 0; +}