utf8pad: improve padded printing and printing invalid unicode characters - stagit-gopher - A git gopher frontend. (mirror)
 (HTM) git clone git://bitreich.org/stagit-gopher/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/stagit-gopher/
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) Tags
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 554a9fe2e9d12defd9d6253871d8261d3f3ef3c6
 (DIR) parent 7b93d02cd8f26ab9a25d967c72c359a22c91eb74
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sat,  9 Jan 2021 14:56:51 +0100
       
       utf8pad: improve padded printing and printing invalid unicode characters
       
       - Use unicode replacement character (codepoint 0xfffd) when a codepoint is
         invalid and proceed printing the rest of the characters.
       
       - When a codepoint is invalid reset the internal state of mbtowc(3), from the
         OpenBSD man page:
       
         "  If a call to mbtowc() resulted in an undefined internal state, mbtowc()
            must be called with s set to NULL to reset the internal state before it
            can safely be used again."
       
       - Make the function return 0 when `len` is 0 (this should not be an error).
       
       Diffstat:
         M stagit-gopher-index.c               |      59 ++++++++++++++++++++++---------
         M stagit-gopher.c                     |      58 ++++++++++++++++++++++---------
       
       2 files changed, 83 insertions(+), 34 deletions(-)
       ---
 (DIR) diff --git a/stagit-gopher-index.c b/stagit-gopher-index.c
       @@ -10,6 +10,9 @@
        
        #include <git2.h>
        
       +#define PAD_TRUNCATE_SYMBOL    "\xe2\x80\xa6" /* symbol: "ellipsis" */
       +#define UTF_INVALID_SYMBOL     "\xef\xbf\xbd" /* symbol: "replacement" */
       +
        static git_repository *repo;
        
        static const char *relpath = "";
       @@ -17,40 +20,62 @@ static const char *relpath = "";
        static char description[255] = "Repositories";
        static char *name = "";
        
       -/* format `len' columns of characters. If string is shorter pad the rest
       +/* Format `len' columns of characters. If string is shorter pad the rest
         * with characters `pad`. */
        int
        utf8pad(char *buf, size_t bufsiz, const char *s, size_t len, int pad)
        {
                wchar_t wc;
                size_t col = 0, i, slen, siz = 0;
       -        int rl, w;
       +        int inc, rl, w;
        
       -        if (!len)
       +        if (!bufsiz)
                        return -1;
       +        if (!len) {
       +                buf[0] = '\0';
       +                return 0;
       +        }
        
                slen = strlen(s);
       -        for (i = 0; i < slen; i += rl) {
       -                if ((rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4)) <= 0)
       -                        break;
       -                if ((w = wcwidth(wc)) == -1)
       +        for (i = 0; i < slen; i += inc) {
       +                inc = 1;
       +                if ((unsigned char)s[i] < 32)
                                continue;
       -                if (col + w > len || (col + w == len && s[i + rl])) {
       +
       +                rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4);
       +                if (rl < 0) {
       +                        mbtowc(NULL, NULL, 0); /* reset state */
       +                        inc = 1; /* next byte */
       +                        w = 1; /* replacement char is one width */
       +                } else if ((w = wcwidth(wc)) == -1) {
       +                        continue;
       +                } else {
       +                        inc = rl;
       +                }
       +
       +                if (col + w > len || (col + w == len && s[i + inc])) {
                                if (siz + 4 >= bufsiz)
                                        return -1;
       -                        memcpy(&buf[siz], "\xe2\x80\xa6", 3);
       -                        siz += 3;
       -                        if (col + w == len && w > 1)
       -                                buf[siz++] = pad;
       +                        memcpy(&buf[siz], PAD_TRUNCATE_SYMBOL, sizeof(PAD_TRUNCATE_SYMBOL) - 1);
       +                        siz += sizeof(PAD_TRUNCATE_SYMBOL) - 1;
                                buf[siz] = '\0';
       -                        return 0;
       +                        col++;
       +                        break;
       +                } else if (rl < 0) {
       +                        if (siz + 4 >= bufsiz)
       +                                return -1;
       +                        memcpy(&buf[siz], UTF_INVALID_SYMBOL, sizeof(UTF_INVALID_SYMBOL) - 1);
       +                        siz += sizeof(UTF_INVALID_SYMBOL) - 1;
       +                        buf[siz] = '\0';
       +                        col++;
       +                        continue;
                        }
       -                if (siz + rl + 1 >= bufsiz)
       +                if (siz + inc + 1 >= bufsiz)
                                return -1;
       -                memcpy(&buf[siz], &s[i], rl);
       -                col += w;
       -                siz += rl;
       +                memcpy(&buf[siz], &s[i], inc);
       +                siz += inc;
                        buf[siz] = '\0';
       +                col += w;
                }
        
                len -= col;
 (DIR) diff --git a/stagit-gopher.c b/stagit-gopher.c
       @@ -19,6 +19,8 @@
        #include "compat.h"
        
        #define LEN(s)    (sizeof(s)/sizeof(*s))
       +#define PAD_TRUNCATE_SYMBOL    "\xe2\x80\xa6" /* symbol: "ellipsis" */
       +#define UTF_INVALID_SYMBOL     "\xef\xbf\xbd" /* symbol: "replacement" */
        
        struct deltainfo {
                git_patch *patch;
       @@ -80,40 +82,62 @@ static char lastoidstr[GIT_OID_HEXSZ + 2]; /* id + newline + NUL byte */
        static FILE *rcachefp, *wcachefp;
        static const char *cachefile;
        
       -/* format `len' columns of characters. If string is shorter pad the rest
       +/* Format `len' columns of characters. If string is shorter pad the rest
         * with characters `pad`. */
        int
        utf8pad(char *buf, size_t bufsiz, const char *s, size_t len, int pad)
        {
                wchar_t wc;
                size_t col = 0, i, slen, siz = 0;
       -        int rl, w;
       +        int inc, rl, w;
        
       -        if (!len)
       +        if (!bufsiz)
                        return -1;
       +        if (!len) {
       +                buf[0] = '\0';
       +                return 0;
       +        }
        
                slen = strlen(s);
       -        for (i = 0; i < slen; i += rl) {
       -                if ((rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4)) <= 0)
       -                        break;
       -                if ((w = wcwidth(wc)) == -1)
       +        for (i = 0; i < slen; i += inc) {
       +                inc = 1;
       +                if ((unsigned char)s[i] < 32)
       +                        continue;
       +
       +                rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4);
       +                if (rl < 0) {
       +                        mbtowc(NULL, NULL, 0); /* reset state */
       +                        inc = 1; /* next byte */
       +                        w = 1; /* replacement char is one width */
       +                } else if ((w = wcwidth(wc)) == -1) {
                                continue;
       -                if (col + w > len || (col + w == len && s[i + rl])) {
       +                } else {
       +                        inc = rl;
       +                }
       +
       +                if (col + w > len || (col + w == len && s[i + inc])) {
                                if (siz + 4 >= bufsiz)
                                        return -1;
       -                        memcpy(&buf[siz], "\xe2\x80\xa6", 3);
       -                        siz += 3;
       -                        if (col + w == len && w > 1)
       -                                buf[siz++] = pad;
       +                        memcpy(&buf[siz], PAD_TRUNCATE_SYMBOL, sizeof(PAD_TRUNCATE_SYMBOL) - 1);
       +                        siz += sizeof(PAD_TRUNCATE_SYMBOL) - 1;
                                buf[siz] = '\0';
       -                        return 0;
       +                        col++;
       +                        break;
       +                } else if (rl < 0) {
       +                        if (siz + 4 >= bufsiz)
       +                                return -1;
       +                        memcpy(&buf[siz], UTF_INVALID_SYMBOL, sizeof(UTF_INVALID_SYMBOL) - 1);
       +                        siz += sizeof(UTF_INVALID_SYMBOL) - 1;
       +                        buf[siz] = '\0';
       +                        col++;
       +                        continue;
                        }
       -                if (siz + rl + 1 >= bufsiz)
       +                if (siz + inc + 1 >= bufsiz)
                                return -1;
       -                memcpy(&buf[siz], &s[i], rl);
       -                col += w;
       -                siz += rl;
       +                memcpy(&buf[siz], &s[i], inc);
       +                siz += inc;
                        buf[siz] = '\0';
       +                col += w;
                }
        
                len -= col;