rework URI handling - hurl - Gopher/HTTP/HTTPS file grabber
 (HTM) git clone git://git.codemadness.org/hurl
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit e8e1e1a7d09c614b57fac5070eb5c28822c948ba
 (DIR) parent 5a9951db80a5e9b9f2d5ad7ca1c6efebbd00e11f
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Fri, 12 Mar 2021 22:22:13 +0100
       
       rework URI handling
       
       - Parse the URI in a more correct way following the Gopher URI RFC 4266 and
          General URI RFC 3986 - Uniform Resource Identifier (URI): Generic Syntax.
       - A URI fragment is not sent to the server anymore.
       - A gopher type is now optional for an empty path or for example:
         "gopher://codemadness.org".
       
       Also, the use of strlcat() is removed and the code should now be more portable.
       
       Diffstat:
         M hurl.c                              |     175 ++++++++++++++++++++-----------
       
       1 file changed, 116 insertions(+), 59 deletions(-)
       ---
 (DIR) diff --git a/hurl.c b/hurl.c
       @@ -28,12 +28,15 @@
        #define TLS_CA_CERT_FILE "/etc/ssl/cert.pem"
        #endif
        
       -/* uri */
       +/* URI */
        struct uri {
       -        char proto[48];
       +        char proto[48];     /* scheme including ":" or "://" */
       +        char userinfo[256]; /* username [:password] */
                char host[256];
       -        char path[2048];
       -        char port[6];     /* numeric port */
       +        char port[6];       /* numeric port */
       +        char path[1024];
       +        char query[1024];
       +        char fragment[1024];
        };
        
        char *argv0;
       @@ -61,70 +64,115 @@ sighandler(int signo)
        }
        
        int
       -parseuri(const char *s, struct uri *u)
       +uri_parse(const char *s, struct uri *u)
        {
       -        const char *p = s, *b;
       -        char *endptr = NULL;
       +        const char *p = s;
       +        char *endptr;
                size_t i;
       -        unsigned long l;
       +        long l;
        
       -        u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0';
       -        if (!*p)
       -                return 0;
       +        u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0';
       +        u->path[0] = u->query[0] = u->fragment[0] = '\0';
        
       -        /* protocol part */
       -        for (p = s; *p && (isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
       -                       *p == '+' || *p == '-' || *p == '.'); p++)
       +        /* protocol-relative */
       +        if (*p == '/' && *(p + 1) == '/') {
       +                p += 2; /* skip "//" */
       +                goto parseauth;
       +        }
       +
       +        /* scheme / protocol part */
       +        for (; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
       +                       *p == '+' || *p == '-' || *p == '.'; p++)
                        ;
       -        if (!strncmp(p, "://", 3)) {
       +        /* scheme, except if empty and starts with ":" then it is a path */
       +        if (*p == ':' && p != s) {
       +                if (*(p + 1) == '/' && *(p + 2) == '/')
       +                        p += 3; /* skip "://" */
       +                else
       +                        p++; /* skip ":" */
       +
                        if ((size_t)(p - s) >= sizeof(u->proto))
                                return -1; /* protocol too long */
                        memcpy(u->proto, s, p - s);
                        u->proto[p - s] = '\0';
       -                p += 3; /* skip "://" */
       +
       +                if (*(p - 1) != '/')
       +                        goto parsepath;
                } else {
       -                return -1; /* no protocol specified */
       +                p = s; /* no scheme format, reset to start */
       +                goto parsepath;
       +        }
       +
       +parseauth:
       +        /* userinfo (username:password) */
       +        i = strcspn(p, "@/?#");
       +        if (p[i] == '@') {
       +                if (i >= sizeof(u->userinfo))
       +                        return -1; /* userinfo too long */
       +                memcpy(u->userinfo, p, i);
       +                u->userinfo[i] = '\0';
       +                p += i + 1;
                }
        
                /* IPv6 address */
                if (*p == '[') {
       -                /* bracket not found or host too long */
       -                if (!(b = strchr(p, ']')) || (size_t)(b - p) >= (ssize_t)sizeof(u->host))
       +                /* bracket not found, host too short or too long */
       +                i = strcspn(p, "]");
       +                if (p[i] != ']' || i < 3)
                                return -1;
       -                memcpy(u->host, p + 1, b - p - 1);
       -                u->host[b - p - 1] = '\0';
       -                p = b + 1;
       +                i++; /* including "]" */
                } else {
                        /* domain / host part, skip until port, path or end. */
       -                if ((i = strcspn(p, ":/")) >= sizeof(u->host))
       -                        return -1; /* host too long */
       -                memcpy(u->host, p, i);
       -                u->host[i] = '\0';
       -                p = &p[i];
       +                i = strcspn(p, ":/?#");
                }
       +        if (i >= sizeof(u->host))
       +                return -1; /* host too long */
       +        memcpy(u->host, p, i);
       +        u->host[i] = '\0';
       +        p += i;
       +
                /* port */
                if (*p == ':') {
       -                if ((i = strcspn(++p, "/")) >= sizeof(u->port))
       +                p++;
       +                if ((i = strcspn(p, "/?#")) >= sizeof(u->port))
                                return -1; /* port too long */
                        memcpy(u->port, p, i);
                        u->port[i] = '\0';
       -                /* check for valid port: range 1 - 65535 */
       +                /* check for valid port: range 1 - 65535, may be empty */
                        errno = 0;
       -                l = strtoul(u->port, &endptr, 10);
       -                if (errno || u->port[0] == '\0' || *endptr ||
       -                    !l || l > 65535)
       +                l = strtol(u->port, &endptr, 10);
       +                if (i && (errno || *endptr || l <= 0 || l > 65535))
                                return -1;
       -                p = &p[i];
       +                p += i;
                }
       -        if (u->host[0]) {
       -                p = &p[strspn(p, "/")];
       -                memcpy(u->path, "/", 2);
       -        } else {
       -                return -1;
       +
       +parsepath:
       +        /* path */
       +        if ((i = strcspn(p, "?#")) >= sizeof(u->path))
       +                return -1; /* path too long */
       +        memcpy(u->path, p, i);
       +        u->path[i] = '\0';
       +        p += i;
       +
       +        /* query */
       +        if (*p == '?') {
       +                p++;
       +                if ((i = strcspn(p, "#")) >= sizeof(u->query))
       +                        return -1; /* query too long */
       +                memcpy(u->query, p, i);
       +                u->query[i] = '\0';
       +                p += i;
                }
       -        /* treat truncation as an error */
       -        if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path))
       -                return -1;
       +
       +        /* fragment */
       +        if (*p == '#') {
       +                p++;
       +                if ((i = strlen(p)) >= sizeof(u->fragment))
       +                        return -1; /* fragment too long */
       +                memcpy(u->fragment, p, i);
       +                u->fragment[i] = '\0';
       +        }
       +
                return 0;
        }
        
       @@ -206,11 +254,14 @@ https_request(void)
        
                /* create and send HTTP header */
                r = snprintf(buf, sizeof(buf),
       -                "GET %s HTTP/1.0\r\n"
       +                "GET %s%s%s HTTP/1.0\r\n"
                        "Host: %s%s%s\r\n"
                        "Connection: close\r\n"
                        "%s%s"
       -                "\r\n", u.path, u.host,
       +                "\r\n",
       +                u.path[0] ? u.path : "/",
       +                u.query[0] ? "?" : "", u.query,
       +                u.host,
                        stdport ? "" : ":",
                        stdport ? "" : u.port,
                        config_headers, config_headers[0] ? "\r\n" : "");
       @@ -334,11 +385,14 @@ http_request(void)
        
                /* create and send HTTP header */
                r = snprintf(buf, sizeof(buf),
       -                "GET %s HTTP/1.0\r\n"
       +                "GET %s%s%s HTTP/1.0\r\n"
                        "Host: %s%s%s\r\n"
                        "Connection: close\r\n"
                        "%s%s"
       -                "\r\n", u.path, u.host,
       +                "\r\n",
       +                u.path[0] ? u.path : "/",
       +                u.query[0] ? "?" : "", u.query,
       +                u.host,
                        stdport ? "" : ":",
                        stdport ? "" : u.port,
                        config_headers, config_headers[0] ? "\r\n" : "");
       @@ -427,7 +481,7 @@ int
        gopher_request(void)
        {
                char buf[READ_BUF_SIZ], *p;
       -        const char *errstr;
       +        const char *errstr, *path;
                size_t len = 0;
                ssize_t r;
                int fd = -1, ret = 1;
       @@ -440,8 +494,13 @@ gopher_request(void)
                if (pledge("stdio", NULL) == -1)
                        err(1, "pledge");
        
       -        /* create and send path, skip type part */
       -        r = snprintf(buf, sizeof(buf), "%s\r\n", u.path + 2);
       +        /* create and send path, skip type part, empty path is allowed,
       +           see RFC 4266 The gopher URI Scheme - section 2.1 */
       +        path = u.path;
       +        if (*path == '/' && *path++)
       +                path++;
       +        r = snprintf(buf, sizeof(buf), "%s%s%s\r\n",
       +                path, u.query[0] ? "?" : "", u.query);
                if (r < 0 || (size_t)r >= sizeof(buf)) {
                        fprintf(stderr, "not writing header because it is truncated");
                        goto err;
       @@ -623,8 +682,10 @@ main(int argc, char **argv)
                        usage();
        
                url = argv[0];
       -        if (parseuri(url, &u) == -1)
       -                errx(1, "invalid url: %s", url);
       +        if (uri_parse(url, &u) == -1)
       +                errx(1, "invalid URL: %s", url);
       +        if (u.userinfo[0])
       +                errx(1, "userinfo field not supported in the URL: %s", url);
        
                if (config_timeout > 0) {
                        signal(SIGALRM, sighandler);
       @@ -632,7 +693,7 @@ main(int argc, char **argv)
                                err(1, "alarm");
                }
        
       -        if (!strcmp(u.proto, "https")) {
       +        if (!strcmp(u.proto, "https://")) {
                        if (tls_init())
                                errx(1, "tls_init failed");
                        if (!(tls_config = tls_config_new()))
       @@ -643,22 +704,18 @@ main(int argc, char **argv)
                                        errx(1, "tls set ciphers failed: %s",
                                             tls_config_error(tls_config));
                        }
       -                if (!u.port[0] && !strcmp(u.proto, "https"))
       +                if (!u.port[0])
                                memcpy(u.port, "443", 4);
                        statuscode = https_request();
       -        } else if (!strcmp(u.proto, "http")) {
       +        } else if (!strcmp(u.proto, "http://")) {
                        if (!u.port[0])
                                memcpy(u.port, "80", 3);
                        statuscode = http_request();
       -        } else if (!strcmp(u.proto, "gopher")) {
       +        } else if (!strcmp(u.proto, "gopher://")) {
                        if (!u.port[0])
                                memcpy(u.port, "70", 3);
       -
       -                if (u.path[0] != '/' || u.path[1] == '\0')
       -                        errx(1, "must specify type");
       -
                        statuscode = gopher_request();
       -        } else if (!strcmp(u.proto, "gophers")) {
       +        } else if (!strcmp(u.proto, "gophers://")) {
                        if (tls_init())
                                errx(1, "tls_init failed");
                        if (!(tls_config = tls_config_new()))