sfeed.c - sfeed - RSS and Atom parser
 (HTM) git clone git://git.codemadness.org/sfeed
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       sfeed.c (30053B)
       ---
            1 #include <errno.h>
            2 #include <stdint.h>
            3 #include <stdio.h>
            4 #include <stdlib.h>
            5 #include <string.h>
            6 #include <strings.h>
            7 
            8 #include "util.h"
            9 #include "xml.h"
           10 
/* true when `ctx` is flagged as being in content data and not in the content
 * tag itself */
#define ISINCONTENT(ctx)  ((ctx).iscontent && !((ctx).iscontenttag))
/* true when `ctx` is flagged as being in the content tag and not in content
 * data */
#define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag)

/* these feed fields support multiple separated values */
#define ISFEEDFIELDMULTI(t) ((t) == FeedFieldCategory)

/* string and byte-length: expands to two comma-separated arguments, `s` and
 * sizeof(s)-1, so `s` must be a string literal or char array, not a pointer */
#define STRP(s)           s,sizeof(s)-1
           19 
/* feed format; gettag() only has tag tables for RSS and Atom and returns
 * NULL for any other value */
enum FeedType {
        FeedTypeNone = 0,
        FeedTypeRSS  = 1,
        FeedTypeAtom = 2
};
           25 
/* content-type of the content of an item */
enum ContentType {
        ContentTypeNone  = 0,
        ContentTypePlain = 1,
        ContentTypeHTML  = 2
};
/* content-type names, listed in the same order as the enum ContentType
 * values (presumably indexed by them; the use is outside this part of the
 * file) */
static const char *contenttypes[] = { "", "plain", "html" };
           32 
/* String data / memory pool.
 * string_append() keeps `data` NUL-terminated and only ever grows the buffer;
 * string_clear() empties the string without freeing it. */
typedef struct string {
        char   *data;   /* data */
        size_t  len;    /* string length, excluding the terminating NUL byte */
        size_t  bufsiz; /* allocated size */
} String;
           39 
/* NOTE: the order of these tags (content, date, author) indicates the
 *       priority in which they are used, from lowest to highest. */
enum TagId {
        TagUnknown = 0,
        /* RSS */
        RSSTagDcdate, RSSTagPubdate, /* creation date has higher priority */
        RSSTagTitle,
        RSSTagMediaDescription, RSSTagDescription, RSSTagContentEncoded,
        RSSTagGuid,
        RSSTagGuidPermalinkFalse,
        RSSTagGuidPermalinkTrue,
        /* must be defined after GUID, because it can be a link (isPermaLink) */
        RSSTagLink,
        RSSTagEnclosure,
        RSSTagAuthor, RSSTagDccreator,
        RSSTagCategory,
        /* Atom */
        /* creation date has higher priority */
        AtomTagModified, AtomTagUpdated, AtomTagIssued, AtomTagPublished,
        AtomTagTitle,
        AtomTagMediaDescription, AtomTagSummary, AtomTagContent,
        AtomTagId,
        AtomTagLink,
        AtomTagLinkAlternate,
        AtomTagLinkEnclosure,
        AtomTagAuthor, AtomTagAuthorName,
        AtomTagCategory,
        TagLast /* number of tag ids, sizes the fieldmap[] table */
};
           69 
/* entry that maps a tag name to its TagId, see rsstags[] and atomtags[] */
typedef struct feedtag {
        char       *name; /* name of tag to match */
        size_t      len;  /* len of `name` */
        enum TagId  id;   /* unique ID */
} FeedTag;
           75 
/* value of one output field of the current item */
typedef struct field {
        String     str;   /* collected field data */
        enum TagId tagid; /* tagid set previously, used for tag priority */
} FeedField;
           80 
/* indices into FeedContext.fields[]; these are also the values stored in
 * fieldmap[] */
enum {
        FeedFieldTime = 0, FeedFieldTitle, FeedFieldLink, FeedFieldContent,
        FeedFieldId, FeedFieldAuthor, FeedFieldEnclosure, FeedFieldCategory,
        FeedFieldLast /* number of fields */
};
           86 
/* parser state for the feed and the item currently being processed */
typedef struct feedcontext {
        String          *field;        /* current FeedItem field String */
        FeedField        fields[FeedFieldLast]; /* data for current item */
        FeedTag          tag;          /* unique current parsed tag */
        int              iscontent;    /* in content data */
        int              iscontenttag; /* in content tag */
        enum ContentType contenttype;  /* content-type for item */
        enum FeedType    feedtype;     /* feed format, see enum FeedType */
        int              attrcount;    /* count item HTML element attributes */
} FeedContext;
           97 
/* forward declarations of the file-local functions */
static long long datetounix(long long, int, int, int, int, int);
static FeedTag * gettag(enum FeedType, const char *, size_t);
static long gettzoffset(const char *);
static int  isattr(const char *, size_t, const char *, size_t);
static int  istag(const char *, size_t, const char *, size_t);
static int  parsetime(const char *, long long *);
static void printfields(void);
static void string_append(String *, const char *, size_t);
static void string_buffer_realloc(String *, size_t);
static void string_clear(String *);
static void string_print_encoded(String *);
static void string_print_timestamp(String *);
static void string_print_trimmed(String *);
static void string_print_trimmed_multi(String *);
static void string_print_uri(String *);
/* XML parser callbacks (names suggest one per parser event; their
 * definitions are outside this part of the file) */
static void xmlattr(XMLParser *, const char *, size_t, const char *, size_t,
                    const char *, size_t);
static void xmlattrentity(XMLParser *, const char *, size_t, const char *,
                          size_t, const char *, size_t);
static void xmlattrend(XMLParser *, const char *, size_t, const char *,
                       size_t);
static void xmlattrstart(XMLParser *, const char *, size_t, const char *,
                         size_t);
static void xmldata(XMLParser *, const char *, size_t);
static void xmldataentity(XMLParser *, const char *, size_t);
static void xmltagend(XMLParser *, const char *, size_t, int);
static void xmltagstart(XMLParser *, const char *, size_t);
static void xmltagstartparsed(XMLParser *, const char *, size_t, int);
          126 
/* map tag name to TagId type */
/* RSS, must be alphabetical order: gettag() looks names up with bsearch(3)
 * using tagcmp(), which compares case-insensitively */
static const FeedTag rsstags[] = {
        { STRP("author"),            RSSTagAuthor            },
        { STRP("category"),          RSSTagCategory          },
        { STRP("content:encoded"),   RSSTagContentEncoded    },
        { STRP("dc:creator"),        RSSTagDccreator         },
        { STRP("dc:date"),           RSSTagDcdate            },
        { STRP("description"),       RSSTagDescription       },
        /* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> */
        { STRP("enclosure"),         RSSTagEnclosure         },
        { STRP("guid"),              RSSTagGuid              },
        { STRP("link"),              RSSTagLink              },
        { STRP("media:description"), RSSTagMediaDescription  },
        { STRP("pubdate"),           RSSTagPubdate           },
        { STRP("title"),             RSSTagTitle             }
};
          144 
/* Atom, must be alphabetical order: gettag() looks names up with bsearch(3)
 * using tagcmp(), which compares case-insensitively */
static const FeedTag atomtags[] = {
        { STRP("author"),            AtomTagAuthor           },
        { STRP("category"),          AtomTagCategory         },
        { STRP("content"),           AtomTagContent          },
        { STRP("id"),                AtomTagId               },
        { STRP("issued"),            AtomTagIssued           }, /* Atom 0.3 */
        /* Atom: <link href="" />, RSS has <link></link> */
        { STRP("link"),              AtomTagLink             },
        { STRP("media:description"), AtomTagMediaDescription },
        { STRP("modified"),          AtomTagModified         }, /* Atom 0.3 */
        { STRP("published"),         AtomTagPublished        },
        { STRP("summary"),           AtomTagSummary          },
        { STRP("title"),             AtomTagTitle            },
        { STRP("updated"),           AtomTagUpdated          }
};
          161 
/* special case: nested <author><name>; "name" is not part of atomtags[], so
 * these entries are kept outside the sorted lookup table */
static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor };
static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName };

/* reference to no / unknown tag */
static const FeedTag notag = { STRP(""), TagUnknown };
          168 
/* map TagId type to RSS/Atom field, all tags must be defined.
 * The value is a FeedField* index, or -1 for a tag id that has no field
 * mapped to it. */
static const int fieldmap[TagLast] = {
        [TagUnknown]               = -1,
        /* RSS */
        [RSSTagDcdate]             = FeedFieldTime,
        [RSSTagPubdate]            = FeedFieldTime,
        [RSSTagTitle]              = FeedFieldTitle,
        [RSSTagMediaDescription]   = FeedFieldContent,
        [RSSTagDescription]        = FeedFieldContent,
        [RSSTagContentEncoded]     = FeedFieldContent,
        [RSSTagGuid]               = -1,
        [RSSTagGuidPermalinkFalse] = FeedFieldId,
        [RSSTagGuidPermalinkTrue]  = FeedFieldId, /* special-case: both a link and an id */
        [RSSTagLink]               = FeedFieldLink,
        [RSSTagEnclosure]          = FeedFieldEnclosure,
        [RSSTagAuthor]             = FeedFieldAuthor,
        [RSSTagDccreator]          = FeedFieldAuthor,
        [RSSTagCategory]           = FeedFieldCategory,
        /* Atom */
        [AtomTagModified]          = FeedFieldTime,
        [AtomTagUpdated]           = FeedFieldTime,
        [AtomTagIssued]            = FeedFieldTime,
        [AtomTagPublished]         = FeedFieldTime,
        [AtomTagTitle]             = FeedFieldTitle,
        [AtomTagMediaDescription]  = FeedFieldContent,
        [AtomTagSummary]           = FeedFieldContent,
        [AtomTagContent]           = FeedFieldContent,
        [AtomTagId]                = FeedFieldId,
        [AtomTagLink]              = -1,
        [AtomTagLinkAlternate]     = FeedFieldLink,
        [AtomTagLinkEnclosure]     = FeedFieldEnclosure,
        [AtomTagAuthor]            = -1,
        [AtomTagAuthorName]        = FeedFieldAuthor,
        [AtomTagCategory]          = FeedFieldCategory
};
          204 
/* separator written between the fields of an output line */
static const int FieldSeparator = '\t';
/* separator for multiple values in a field, separator should be 1 byte */
static const char FieldMultiSeparator[] = "|";
/* base used by printuri() to make relative links absolute: `baseurl` enables
 * the conversion when it is not NULL, `baseuri` is the URI passed to
 * uri_makeabs() (presumably the parsed form of `baseurl`; it is filled in
 * outside this part of the file) */
static struct uri baseuri;
static const char *baseurl;

static FeedContext ctx;
static XMLParser parser; /* XML parser state */
/* scratch strings (going by the names: values of the "ispermalink", "rel"
 * and "type" attributes plus a temporary; their use is outside this part of
 * the file) */
static String attrispermalink, attrrel, attrtype, tmpstr;
          214 
          215 static int
          216 tagcmp(const void *v1, const void *v2)
          217 {
          218         return strcasecmp(((FeedTag *)v1)->name, ((FeedTag *)v2)->name);
          219 }
          220 
          221 /* Unique tagid for parsed tag name. */
          222 static FeedTag *
          223 gettag(enum FeedType feedtype, const char *name, size_t namelen)
          224 {
          225         FeedTag f, *r = NULL;
          226 
          227         f.name = (char *)name;
          228 
          229         switch (feedtype) {
          230         case FeedTypeRSS:
          231                 r = bsearch(&f, rsstags, sizeof(rsstags) / sizeof(rsstags[0]),
          232                         sizeof(rsstags[0]), tagcmp);
          233                 break;
          234         case FeedTypeAtom:
          235                 r = bsearch(&f, atomtags, sizeof(atomtags) / sizeof(atomtags[0]),
          236                         sizeof(atomtags[0]), tagcmp);
          237                 break;
          238         default:
          239                 break;
          240         }
          241 
          242         return r;
          243 }
          244 
          245 static char *
          246 ltrim(const char *s)
          247 {
          248         for (; ISSPACE((unsigned char)*s); s++)
          249                 ;
          250         return (char *)s;
          251 }
          252 
          253 static char *
          254 rtrim(const char *s)
          255 {
          256         const char *e;
          257 
          258         for (e = s + strlen(s); e > s && ISSPACE((unsigned char)*(e - 1)); e--)
          259                 ;
          260         return (char *)e;
          261 }
          262 
          263 /* Clear string only; don't free, prevents unnecessary reallocation. */
          264 static void
          265 string_clear(String *s)
          266 {
          267         if (s->data)
          268                 s->data[0] = '\0';
          269         s->len = 0;
          270 }
          271 
          272 static void
          273 string_buffer_realloc(String *s, size_t newlen)
          274 {
          275         size_t alloclen;
          276 
          277         if (newlen > SIZE_MAX / 2) {
          278                 alloclen = SIZE_MAX;
          279         } else {
          280                 for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
          281                         ;
          282         }
          283         if (!(s->data = realloc(s->data, alloclen)))
          284                 err(1, "realloc");
          285         s->bufsiz = alloclen;
          286 }
          287 
          288 /* Append data to String, s->data and data may not overlap. */
          289 static void
          290 string_append(String *s, const char *data, size_t len)
          291 {
          292         if (!len)
          293                 return;
          294 
          295         if (s->len >= SIZE_MAX - len) {
          296                 errno = ENOMEM;
          297                 err(1, "realloc");
          298         }
          299 
          300         /* check if allocation is necessary, never shrink the buffer. */
          301         if (s->len + len >= s->bufsiz)
          302                 string_buffer_realloc(s, s->len + len + 1);
          303         memcpy(s->data + s->len, data, len);
          304         s->len += len;
          305         s->data[s->len] = '\0';
          306 }
          307 
          308 /* Print text, encode TABs, newlines and '\', remove other whitespace.
          309  * Remove leading and trailing whitespace. */
          310 static void
          311 string_print_encoded(String *s)
          312 {
          313         const char *p, *e;
          314 
          315         if (!s->data || !s->len)
          316                 return;
          317 
          318         p = ltrim(s->data);
          319         e = rtrim(p);
          320 
          321         for (; *p && p != e; p++) {
          322                 switch (*p) {
          323                 case '\n': putchar('\\'); putchar('n'); break;
          324                 case '\\': putchar('\\'); putchar('\\'); break;
          325                 case '\t': putchar('\\'); putchar('t'); break;
          326                 default:
          327                         /* ignore control chars */
          328                         if (!ISCNTRL((unsigned char)*p))
          329                                 putchar(*p);
          330                         break;
          331                 }
          332         }
          333 }
          334 
          335 static void
          336 printtrimmed(const char *s)
          337 {
          338         char *p, *e;
          339 
          340         p = ltrim(s);
          341         e = rtrim(p);
          342         for (; *p && p != e; p++) {
          343                 if (ISSPACE((unsigned char)*p))
          344                         putchar(' '); /* any whitespace to space */
          345                 else if (!ISCNTRL((unsigned char)*p))
          346                         /* ignore other control chars */
          347                         putchar(*p);
          348         }
          349 }
          350 
          351 /* Print text, replace TABs, carriage return and other whitespace with ' '.
          352  * Other control chars are removed. Remove leading and trailing whitespace. */
          353 static void
          354 string_print_trimmed(String *s)
          355 {
          356         if (!s->data || !s->len)
          357                 return;
          358 
          359         printtrimmed(s->data);
          360 }
          361 
          362 /* Print each field with trimmed whitespace, separated by '|'. */
          363 static void
          364 string_print_trimmed_multi(String *s)
          365 {
          366         char *p, *e;
          367         int c;
          368 
          369         if (!s->data || !s->len)
          370                 return;
          371 
          372         for (p = s->data; ; p = e + 1) {
          373                 if ((e = strstr(p, FieldMultiSeparator))) {
          374                         c = *e;
          375                         *e = '\0';
          376                         printtrimmed(p);
          377                         *e = c; /* restore NUL byte to original character */
          378                         fputs(FieldMultiSeparator, stdout);
          379                 } else {
          380                         printtrimmed(p);
          381                         break;
          382                 }
          383         }
          384 }
          385 
          386 /* Print URL, if it is a relative URL then it uses the global `baseurl`. */
          387 static void
          388 printuri(char *s)
          389 {
          390         char link[4096], *p, *e;
          391         struct uri newuri, olduri;
          392         int c, r = -1;
          393 
          394         p = ltrim(s);
          395         e = rtrim(p);
          396         c = *e;
          397         *e = '\0';
          398 
          399         if (baseurl && !uri_hasscheme(p) &&
          400             uri_parse(p, &olduri) != -1 && !olduri.proto[0] &&
          401             uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0])
          402                 r = uri_format(link, sizeof(link), &newuri);
          403 
          404         if (r >= 0 && (size_t)r < sizeof(link))
          405                 printtrimmed(link);
          406         else
          407                 printtrimmed(p);
          408 
          409         *e = c; /* restore NUL byte to original character */
          410 }
          411 
          412 /* Print URL, if it is a relative URL then it uses the global `baseurl`. */
          413 static void
          414 string_print_uri(String *s)
          415 {
          416         if (!s->data || !s->len)
          417                 return;
          418 
          419         printuri(s->data);
          420 }
          421 
          422 /* Print as UNIX timestamp, print nothing if the time is empty or invalid. */
          423 static void
          424 string_print_timestamp(String *s)
          425 {
          426         long long t;
          427 
          428         if (!s->data || !s->len)
          429                 return;
          430 
          431         if (parsetime(s->data, &t) != -1)
          432                 printf("%lld", t);
          433 }
          434 
          435 /* Convert time fields. Returns a signed (at least) 64-bit UNIX timestamp.
          436    Parameters should be passed as they are in a struct tm:
          437    that is: year = year - 1900, month = month - 1. */
          438 static long long
          439 datetounix(long long year, int mon, int day, int hour, int min, int sec)
          440 {
          441         /* seconds in a month in a regular (non-leap) year */
          442         static const long secs_through_month[] = {
          443                 0, 31 * 86400, 59 * 86400, 90 * 86400,
          444                 120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
          445                 243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
          446         int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
          447         long long t;
          448 
          449         /* optimization: handle common range year 1902 up to and including 2038 */
          450         if (year - 2ULL <= 136) {
          451                 /* amount of leap days relative to 1970: every 4 years */
          452                 leaps = (year - 68) >> 2;
          453                 if (!((year - 68) & 3)) {
          454                         leaps--;
          455                         is_leap = 1;
          456                 } else {
          457                         is_leap = 0;
          458                 }
          459                 t = 31536000 * (year - 70) + (86400 * leaps); /* 365 * 86400 = 31536000 */
          460         } else {
          461                 /* general leap year calculation:
          462                    leap years occur mostly every 4 years but every 100 years
          463                    a leap year is skipped unless the year is divisible by 400 */
          464                 cycles = (year - 100) / 400;
          465                 rem = (year - 100) % 400;
          466                 if (rem < 0) {
          467                         cycles--;
          468                         rem += 400;
          469                 }
          470                 if (!rem) {
          471                         is_leap = 1;
          472                 } else {
          473                         if (rem >= 300) {
          474                                 centuries = 3;
          475                                 rem -= 300;
          476                         } else if (rem >= 200) {
          477                                 centuries = 2;
          478                                 rem -= 200;
          479                         } else if (rem >= 100) {
          480                                 centuries = 1;
          481                                 rem -= 100;
          482                         }
          483                         if (rem) {
          484                                 leaps = rem / 4U;
          485                                 rem %= 4U;
          486                                 is_leap = !rem;
          487                         }
          488                 }
          489                 leaps += (97 * cycles) + (24 * centuries) - is_leap;
          490 
          491                 /* adjust 8 leap days from 1970 up to and including 2000:
          492                    ((30 * 365) + 8) * 86400 = 946771200 */
          493                 t = ((year - 100) * 31536000LL) + (leaps * 86400LL) + 946771200LL;
          494         }
          495         t += secs_through_month[mon];
          496         if (is_leap && mon >= 2)
          497                 t += 86400;
          498         t += 86400LL * (day - 1);
          499         t += 3600LL * hour;
          500         t += 60LL * min;
          501         t += sec;
          502 
          503         return t;
          504 }
          505 
          506 /* Get timezone from string, return time offset in seconds from UTC.
          507  * NOTE: only parses timezones in RFC 822, many other timezone names are
          508  * ambiguous anyway.
          509  * ANSI and military zones are defined wrong in RFC 822 and are unsupported,
          510  * see note on RFC 2822 4.3 page 32. */
          511 static long
          512 gettzoffset(const char *s)
          513 {
          514         static const struct {
          515                 char *name;
          516                 int offhour;
          517         } tzones[] = {
          518                 { "CDT", -5 * 3600 },
          519                 { "CST", -6 * 3600 },
          520                 { "EDT", -4 * 3600 },
          521                 { "EST", -5 * 3600 },
          522                 { "MDT", -6 * 3600 },
          523                 { "MST", -7 * 3600 },
          524                 { "PDT", -7 * 3600 },
          525                 { "PST", -8 * 3600 },
          526         };
          527         const char *p;
          528         long tzhour = 0, tzmin = 0;
          529         size_t i;
          530 
          531         for (; ISSPACE((unsigned char)*s); s++)
          532                 ;
          533         switch (*s) {
          534         case '-': /* offset */
          535         case '+':
          536                 for (i = 0, p = s + 1; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
          537                         tzhour = (tzhour * 10) + (*p - '0');
          538                 if (*p == ':')
          539                         p++;
          540                 for (i = 0; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
          541                         tzmin = (tzmin * 10) + (*p - '0');
          542                 return ((tzhour * 3600) + (tzmin * 60)) * (s[0] == '-' ? -1 : 1);
          543         default: /* timezone name */
          544                 for (i = 0; ISALPHA((unsigned char)s[i]); i++)
          545                         ;
          546                 if (i != 3)
          547                         return 0;
          548                 /* compare timezone and adjust offset relative to UTC */
          549                 for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) {
          550                         if (!memcmp(s, tzones[i].name, 3))
          551                                 return tzones[i].offhour;
          552                 }
          553         }
          554         return 0;
          555 }
          556 
          557 /* Parse time string `s` into the UNIX timestamp `tp`.
          558    Returns 0 on success or -1 on failure. */
          559 static int
          560 parsetime(const char *s, long long *tp)
          561 {
          562         static const struct {
          563                 char *name;
          564                 int len;
          565         } mons[] = {
          566                 { STRP("January"),   },
          567                 { STRP("February"),  },
          568                 { STRP("March"),     },
          569                 { STRP("April"),     },
          570                 { STRP("May"),       },
          571                 { STRP("June"),      },
          572                 { STRP("July"),      },
          573                 { STRP("August"),    },
          574                 { STRP("September"), },
          575                 { STRP("October"),   },
          576                 { STRP("November"),  },
          577                 { STRP("December"),  },
          578         };
          579         int va[6] = { 0 }, i, j, v, vi;
          580         size_t m;
          581 
          582         for (; ISSPACE((unsigned char)*s); s++)
          583                 ;
          584         if (!ISDIGIT((unsigned char)*s) && !ISALPHA((unsigned char)*s))
          585                 return -1;
          586 
          587         if (ISDIGIT((unsigned char)s[0]) &&
          588             ISDIGIT((unsigned char)s[1]) &&
          589             ISDIGIT((unsigned char)s[2]) &&
          590             ISDIGIT((unsigned char)s[3])) {
          591                 /* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */
          592                 vi = 0;
          593         } else {
          594                 /* format: "[%a, ]%d %b %Y %H:%M:%S" */
          595                 /* parse "[%a, ]%d %b %Y " part, then use time parsing as above */
          596                 for (; ISALPHA((unsigned char)*s); s++)
          597                         ;
          598                 for (; ISSPACE((unsigned char)*s); s++)
          599                         ;
          600                 if (*s == ',')
          601                         s++;
          602                 for (; ISSPACE((unsigned char)*s); s++)
          603                         ;
          604                 for (v = 0, i = 0; i < 2 && ISDIGIT((unsigned char)*s); s++, i++)
          605                         v = (v * 10) + (*s - '0');
          606                 va[2] = v; /* day */
          607                 for (; ISSPACE((unsigned char)*s); s++)
          608                         ;
          609                 /* end of word month */
          610                 for (j = 0; ISALPHA((unsigned char)s[j]); j++)
          611                         ;
          612                 /* check month name */
          613                 if (j < 3 || j > 9)
          614                         return -1; /* month cannot match */
          615                 for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) {
          616                         /* abbreviation (3 length) or long name */
          617                         if ((j == 3 || j == mons[m].len) &&
          618                             !strncasecmp(mons[m].name, s, j)) {
          619                                 va[1] = m + 1;
          620                                 s += j;
          621                                 break;
          622                         }
          623                 }
          624                 if (m >= 12)
          625                         return -1; /* no month found */
          626                 for (; ISSPACE((unsigned char)*s); s++)
          627                         ;
          628                 for (v = 0, i = 0; i < 4 && ISDIGIT((unsigned char)*s); s++, i++)
          629                         v = (v * 10) + (*s - '0');
          630                 /* obsolete short year: RFC 2822 4.3 */
          631                 if (i == 2 || i == 3)
          632                         v += (i == 2 && v >= 0 && v <= 49) ? 2000 : 1900;
          633                 va[0] = v; /* year */
          634                 for (; ISSPACE((unsigned char)*s); s++)
          635                         ;
          636                 /* parse only regular time part, see below */
          637                 vi = 3;
          638         }
          639 
          640         /* parse time parts (and possibly remaining date parts) */
          641         for (; *s && vi < 6; vi++) {
          642                 for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) &&
          643                                    ISDIGIT((unsigned char)*s); s++, i++) {
          644                         v = (v * 10) + (*s - '0');
          645                 }
          646                 va[vi] = v;
          647 
          648                 if ((vi < 2 && *s == '-') ||
          649                     (vi == 2 && (*s == 'T' || *s == 't' || ISSPACE((unsigned char)*s))) ||
          650                     (vi > 2 && *s == ':'))
          651                         s++;
          652         }
          653 
          654         /* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */
          655         if (*s == '.') {
          656                 for (s++; ISDIGIT((unsigned char)*s); s++)
          657                         ;
          658         }
          659 
          660         /* invalid range */
          661         if (va[0] < 0 || va[0] > 9999 ||
          662             va[1] < 1 || va[1] > 12 ||
          663             va[2] < 1 || va[2] > 31 ||
          664             va[3] < 0 || va[3] > 23 ||
          665             va[4] < 0 || va[4] > 59 ||
          666             va[5] < 0 || va[5] > 60) /* allow leap second */
          667                 return -1;
          668 
          669         *tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) -
          670               gettzoffset(s);
          671 
          672         return 0;
          673 }
          674 
          675 static void
          676 printfields(void)
          677 {
          678         string_print_timestamp(&ctx.fields[FeedFieldTime].str);
          679         putchar(FieldSeparator);
          680         string_print_trimmed(&ctx.fields[FeedFieldTitle].str);
          681         putchar(FieldSeparator);
          682         string_print_uri(&ctx.fields[FeedFieldLink].str);
          683         putchar(FieldSeparator);
          684         string_print_encoded(&ctx.fields[FeedFieldContent].str);
          685         putchar(FieldSeparator);
          686         fputs(contenttypes[ctx.contenttype], stdout);
          687         putchar(FieldSeparator);
          688         string_print_trimmed(&ctx.fields[FeedFieldId].str);
          689         putchar(FieldSeparator);
          690         string_print_trimmed(&ctx.fields[FeedFieldAuthor].str);
          691         putchar(FieldSeparator);
          692         string_print_uri(&ctx.fields[FeedFieldEnclosure].str);
          693         putchar(FieldSeparator);
          694         string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str);
          695         putchar('\n');
          696 
          697         if (ferror(stdout)) /* check for errors but do not flush */
          698                 checkfileerror(stdout, "<stdout>", 'w');
          699 }
          700 
          /*
           * Compare two tag names case-insensitively.
           * Returns 1 when both lengths are equal and the names match, else 0.
           * Both names are compared as NUL-terminated strings.
           */
          static int
          istag(const char *name, size_t len, const char *name2, size_t len2)
          {
                  /* different lengths can never match: skip the string compare */
                  if (len != len2)
                          return 0;

                  return strcasecmp(name, name2) == 0;
          }
          706 
          /*
           * Compare two attribute names (or values) case-insensitively.
           * Returns 1 when both lengths are equal and the strings match, else 0.
           * Both strings are compared as NUL-terminated strings.
           */
          static int
          isattr(const char *name, size_t len, const char *name2, size_t len2)
          {
                  /* the string compare is only done when the lengths are equal */
                  return (len == len2) ? (strcasecmp(name, name2) == 0) : 0;
          }
          712 
          713 static void
          714 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
          715         const char *v, size_t vl)
          716 {
          717         /* handles transforming inline XML to data */
          718         if (ISINCONTENT(ctx)) {
          719                 if (ctx.contenttype == ContentTypeHTML)
          720                         xmldata(p, v, vl);
          721                 return;
          722         }
          723 
          724         if (!ctx.tag.id)
          725                 return;
          726 
          727         /* content-type may be for Atom: text, xhtml, html or a mime-type.
          728            for MRSS (media:description): plain, html. */
          729         if (ISCONTENTTAG(ctx)) {
          730                 if (isattr(n, nl, STRP("type")))
          731                         string_append(&attrtype, v, vl);
          732                 return;
          733         }
          734 
          735         if (ctx.feedtype == FeedTypeRSS) {
          736                 if (ctx.tag.id == RSSTagEnclosure &&
          737                     isattr(n, nl, STRP("url"))) {
          738                         string_append(&tmpstr, v, vl);
          739                 } else if (ctx.tag.id == RSSTagGuid &&
          740                            isattr(n, nl, STRP("ispermalink"))) {
          741                         string_append(&attrispermalink, v, vl);
          742                 }
          743         } else if (ctx.feedtype == FeedTypeAtom) {
          744                 if (ctx.tag.id == AtomTagLink) {
          745                         if (isattr(n, nl, STRP("rel"))) {
          746                                 string_append(&attrrel, v, vl);
          747                         } else if (isattr(n, nl, STRP("href"))) {
          748                                 string_append(&tmpstr, v, vl);
          749                         }
          750                 } else if (ctx.tag.id == AtomTagCategory &&
          751                            isattr(n, nl, STRP("term"))) {
          752                         string_append(&tmpstr, v, vl);
          753                 }
          754         }
          755 }
          756 
          757 static void
          758 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
          759               const char *data, size_t datalen)
          760 {
          761         char buf[8];
          762         int len;
          763 
          764         /* handles transforming inline XML to data */
          765         if (ISINCONTENT(ctx)) {
          766                 if (ctx.contenttype == ContentTypeHTML)
          767                         xmldata(p, data, datalen);
          768                 return;
          769         }
          770 
          771         if (!ctx.tag.id)
          772                 return;
          773 
          774         /* try to translate entity, else just pass as data to
          775          * xmlattr handler. */
          776         if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
          777                 xmlattr(p, t, tl, n, nl, buf, (size_t)len);
          778         else
          779                 xmlattr(p, t, tl, n, nl, data, datalen);
          780 }
          781 
          782 static void
          783 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
          784 {
          785         if (ISINCONTENT(ctx)) {
          786                 if (ctx.contenttype == ContentTypeHTML) {
          787                         /* handles transforming inline XML to data */
          788                         xmldata(p, "\"", 1);
          789                         ctx.attrcount = 0;
          790                 }
          791                 return;
          792         }
          793 }
          794 
          795 static void
          796 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
          797 {
          798         if (ISINCONTENT(ctx)) {
          799                 if (ctx.contenttype == ContentTypeHTML) {
          800                         /* handles transforming inline XML to data */
          801                         if (!ctx.attrcount)
          802                                 xmldata(p, " ", 1);
          803                         ctx.attrcount++;
          804                         xmldata(p, n, nl);
          805                         xmldata(p, "=\"", 2);
          806                 }
          807                 return;
          808         }
          809 
          810         if (attrispermalink.len && isattr(n, nl, STRP("ispermalink")))
          811                 string_clear(&attrispermalink);
          812         else if (attrrel.len && isattr(n, nl, STRP("rel")))
          813                 string_clear(&attrrel);
          814         else if (attrtype.len && isattr(n, nl, STRP("type")))
          815                 string_clear(&attrtype);
          816         else if (tmpstr.len &&
          817             (isattr(n, nl, STRP("href")) ||
          818              isattr(n, nl, STRP("term")) ||
          819              isattr(n, nl, STRP("url"))))
          820                 string_clear(&tmpstr); /* use the last value for multiple attribute values */
          821 }
          822 
          823 static void
          824 xmldata(XMLParser *p, const char *s, size_t len)
          825 {
          826         if (!ctx.field)
          827                 return;
          828 
          829         if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
          830                 string_append(&tmpstr, s, len);
          831         else
          832                 string_append(ctx.field, s, len);
          833 }
          834 
          835 static void
          836 xmldataentity(XMLParser *p, const char *data, size_t datalen)
          837 {
          838         char buf[8];
          839         int len;
          840 
          841         if (!ctx.field)
          842                 return;
          843 
          844         /* try to translate entity, else just pass as data to
          845          * xmldata handler. */
          846         if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
          847                 xmldata(p, buf, (size_t)len);
          848         else
          849                 xmldata(p, data, datalen);
          850 }
          851 
          852 static void
          853 xmltagstart(XMLParser *p, const char *t, size_t tl)
          854 {
          855         const FeedTag *f;
          856 
          857         if (ISINCONTENT(ctx)) {
          858                 if (ctx.contenttype == ContentTypeHTML) {
          859                         ctx.attrcount = 0;
          860                         xmldata(p, "<", 1);
          861                         xmldata(p, t, tl);
          862                 }
          863                 return;
          864         }
          865 
          866         /* start of RSS or Atom item / entry */
          867         if (ctx.feedtype == FeedTypeNone) {
          868                 if (istag(t, tl, STRP("entry")))
          869                         ctx.feedtype = FeedTypeAtom;
          870                 else if (istag(t, tl, STRP("item")))
          871                         ctx.feedtype = FeedTypeRSS;
          872                 return;
          873         }
          874 
          875         /* field tagid already set or nested tags. */
          876         if (ctx.tag.id) {
          877                 /* nested <author><name> for Atom */
          878                 if (ctx.tag.id == AtomTagAuthor &&
          879                     istag(t, tl, STRP("name"))) {
          880                         memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ctx.tag));
          881                 } else {
          882                         return; /* other nested tags are not allowed: return */
          883                 }
          884         }
          885 
          886         /* in item */
          887         if (ctx.tag.id == TagUnknown) {
          888                 if (!(f = gettag(ctx.feedtype, t, tl)))
          889                         f = &notag;
          890                 memcpy(&(ctx.tag), f, sizeof(ctx.tag));
          891         }
          892 
          893         ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent);
          894         string_clear(&attrispermalink);
          895         string_clear(&attrrel);
          896         string_clear(&attrtype);
          897 }
          898 
          899 static void
          900 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
          901 {
          902         enum TagId tagid;
          903 
          904         if (ISINCONTENT(ctx)) {
          905                 if (ctx.contenttype == ContentTypeHTML) {
          906                         if (isshort)
          907                                 xmldata(p, "/>", 2);
          908                         else
          909                                 xmldata(p, ">", 1);
          910                 }
          911                 return;
          912         }
          913 
          914         /* set tag type based on its attribute value */
          915         if (ctx.tag.id == RSSTagGuid) {
          916                 /* if empty the default is "true" */
          917                 if (!attrispermalink.len ||
          918                     isattr(attrispermalink.data, attrispermalink.len, STRP("true")))
          919                         ctx.tag.id = RSSTagGuidPermalinkTrue;
          920                 else
          921                         ctx.tag.id = RSSTagGuidPermalinkFalse;
          922         } else if (ctx.tag.id == AtomTagLink) {
          923                 /* empty or "alternate": other types could be
          924                    "enclosure", "related", "self" or "via" */
          925                 if (!attrrel.len || isattr(attrrel.data, attrrel.len, STRP("alternate")))
          926                         ctx.tag.id = AtomTagLinkAlternate;
          927                 else if (isattr(attrrel.data, attrrel.len, STRP("enclosure")))
          928                         ctx.tag.id = AtomTagLinkEnclosure;
          929                 else
          930                         ctx.tag.id = AtomTagLink; /* unknown */
          931         }
          932 
          933         tagid = ctx.tag.id;
          934 
          935         /* map tag type to field: unknown or lesser priority is ignored,
          936            when tags of the same type are repeated only the first is used. */
          937         if (fieldmap[tagid] == -1 ||
          938             (!ISFEEDFIELDMULTI(fieldmap[tagid]) &&
          939              tagid <= ctx.fields[fieldmap[tagid]].tagid)) {
          940                 return;
          941         }
          942 
          943         if (ctx.iscontenttag) {
          944                 ctx.iscontent = 1;
          945                 ctx.iscontenttag = 0;
          946 
          947                 /* detect content-type based on type attribute */
          948                 if (attrtype.len) {
          949                         if (isattr(attrtype.data, attrtype.len, STRP("html")) ||
          950                             isattr(attrtype.data, attrtype.len, STRP("xhtml")) ||
          951                             isattr(attrtype.data, attrtype.len, STRP("text/html")) ||
          952                             isattr(attrtype.data, attrtype.len, STRP("text/xhtml")) ||
          953                             isattr(attrtype.data, attrtype.len, STRP("application/xhtml+xml")))
          954                                 ctx.contenttype = ContentTypeHTML;
          955                         else /* unknown: handle as base64 text data */
          956                                 ctx.contenttype = ContentTypePlain;
          957                 } else {
          958                         /* default content-type */
          959                         if (tagid == RSSTagContentEncoded || tagid == RSSTagDescription)
          960                                 ctx.contenttype = ContentTypeHTML;
          961                         else
          962                                 ctx.contenttype = ContentTypePlain;
          963                 }
          964         }
          965 
          966         ctx.field = &(ctx.fields[fieldmap[tagid]].str);
          967         ctx.fields[fieldmap[tagid]].tagid = tagid;
          968 
          969         /* clear field if it is overwritten (with a priority order) for the new
          970            value, if the field can have multiple values then do not clear it. */
          971         if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
          972                 string_clear(ctx.field);
          973 }
          974 
          975 static void
          976 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
          977 {
          978         size_t i;
          979 
          980         if (ctx.feedtype == FeedTypeNone)
          981                 return;
          982 
          983         if (ISINCONTENT(ctx)) {
          984                 /* not a closed content field */
          985                 if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) {
          986                         if (!isshort && ctx.contenttype == ContentTypeHTML) {
          987                                 xmldata(p, "</", 2);
          988                                 xmldata(p, t, tl);
          989                                 xmldata(p, ">", 1);
          990                         }
          991                         return;
          992                 }
          993         } else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) {
          994                 /* matched tag end: close it */
          995                 /* copy also to the link field if the attribute isPermaLink="true"
          996                    and it is not set by a tag with higher priority. */
          997                 if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field &&
          998                     ctx.tag.id > ctx.fields[FeedFieldLink].tagid) {
          999                         string_clear(&ctx.fields[FeedFieldLink].str);
         1000                         string_append(&ctx.fields[FeedFieldLink].str,
         1001                                       ctx.field->data, ctx.field->len);
         1002                         ctx.fields[FeedFieldLink].tagid = ctx.tag.id;
         1003                 }
         1004         } else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom &&
         1005            istag(t, tl, STRP("entry"))) || /* Atom */
         1006            (ctx.feedtype == FeedTypeRSS &&
         1007            istag(t, tl, STRP("item"))))) /* RSS */
         1008         {
         1009                 /* end of RSS or Atom entry / item */
         1010                 printfields();
         1011 
         1012                 /* clear strings */
         1013                 for (i = 0; i < FeedFieldLast; i++) {
         1014                         string_clear(&ctx.fields[i].str);
         1015                         ctx.fields[i].tagid = TagUnknown;
         1016                 }
         1017                 ctx.contenttype = ContentTypeNone;
         1018                 /* allow parsing of Atom and RSS concatenated in one XML stream. */
         1019                 ctx.feedtype = FeedTypeNone;
         1020         } else {
         1021                 return; /* not end of field */
         1022         }
         1023 
         1024         /* temporary string: for fields that cannot be processed
         1025            directly and need more context, for example by its tag
         1026            attributes, like the Atom link rel="alternate|enclosure". */
         1027         if (tmpstr.len && ctx.field) {
         1028                 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) {
         1029                         if (ctx.field->len)
         1030                                 string_append(ctx.field, FieldMultiSeparator, 1);
         1031                         string_append(ctx.field, tmpstr.data, tmpstr.len);
         1032                 } else {
         1033                         string_clear(ctx.field);
         1034                         string_append(ctx.field, tmpstr.data, tmpstr.len);
         1035                 }
         1036         }
         1037 
         1038         /* close field */
         1039         string_clear(&tmpstr); /* reuse and clear temporary string */
         1040 
         1041         if (ctx.tag.id == AtomTagAuthorName)
         1042                 memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* outer tag */
         1043         else
         1044                 memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
         1045 
         1046         ctx.iscontent = 0;
         1047         ctx.field = NULL;
         1048 }
         1049 
         1050 int
         1051 main(int argc, char *argv[])
         1052 {
         1053         if (pledge("stdio", NULL) == -1)
         1054                 err(1, "pledge");
         1055 
         1056         if (argc > 1) {
         1057                 if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[0])
         1058                         baseurl = argv[1];
         1059                 else
         1060                         errx(1, "baseurl incorrect or too long");
         1061         }
         1062 
         1063         memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
         1064 
         1065         parser.xmlattr = xmlattr;
         1066         parser.xmlattrentity = xmlattrentity;
         1067         parser.xmlattrend = xmlattrend;
         1068         parser.xmlattrstart = xmlattrstart;
         1069         parser.xmlcdata = xmldata;
         1070         parser.xmldata = xmldata;
         1071         parser.xmldataentity = xmldataentity;
         1072         parser.xmltagend = xmltagend;
         1073         parser.xmltagstart = xmltagstart;
         1074         parser.xmltagstartparsed = xmltagstartparsed;
         1075 
         1076         /* NOTE: GETNEXT is defined in xml.h for inline optimization */
         1077         xml_parse(&parser);
         1078 
         1079         checkfileerror(stdin, "<stdin>", 'r');
         1080         checkfileerror(stdout, "<stdout>", 'w');
         1081 
         1082         return 0;
         1083 }