convert to 4-byte UTF-8 and 32-bit Rune - plan9port - [fork] Plan 9 from user space
 (HTM) git clone git://src.adamsgaard.dk/plan9port
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 0cadb4301d18724e7513d7489cb5bebd262c82f1
 (DIR) parent 4dbf255619efac4f0a00e4216d6c999128910df2
 (HTM) Author: Russ Cox <russcox@gmail.com>
       Date:   Fri, 11 Sep 2009 17:03:06 -0400
       
       convert to 4-byte UTF-8 and 32-bit Rune
       
       http://codereview.appspot.com/116075
       
       Diffstat:
         M include/fmt.h                       |       2 +-
         M include/utf.h                       |       7 ++++---
         M src/cmd/9term/wind.c                |       2 +-
         M src/cmd/acme/regx.c                 |       4 ++--
         M src/cmd/sam/cmd.c                   |       2 +-
         M src/cmd/sam/regexp.c                |       4 ++--
         M src/cmd/sed.c                       |       4 ++--
         M src/cmd/tr.c                        |       2 +-
         M src/cmd/troff/mbwc.c                |       4 ++--
         M src/cmd/unicode.c                   |       6 +++---
         M src/lib9/fmt/dofmt.c                |      11 ++++++-----
         M src/lib9/utf/rune.c                 |      78 +++++++++++++++++++++++--------
         M src/libbio/bgetrune.c               |       2 +-
         M src/libbio/bputrune.c               |       2 +-
       
       14 files changed, 86 insertions(+), 44 deletions(-)
       ---
 (DIR) diff --git a/include/fmt.h b/include/fmt.h
       t@@ -30,7 +30,7 @@ struct Fmt{
                void        *farg;                        /* to make flush a closure */
                int        nfmt;                        /* num chars formatted so far */
                va_list        args;                        /* args passed to dofmt */
       -        int        r;                        /* % format Rune */
       +        Rune        r;                        /* % format Rune */
                int        width;
                int        prec;
                unsigned long        flags;
 (DIR) diff --git a/include/utf.h b/include/utf.h
       t@@ -4,14 +4,15 @@
        extern "C" { 
        #endif
        
       -typedef unsigned short Rune;        /* 16 bits */
       +typedef unsigned int Rune;        /* 32 bits */
        
        enum
        {
       -        UTFmax                = 3,                /* maximum bytes per rune */
       +        UTFmax                = 4,                /* maximum bytes per rune */
                Runesync        = 0x80,                /* cannot represent part of a UTF sequence (<) */
                Runeself        = 0x80,                /* rune and UTF sequences are the same (<) */
       -        Runeerror        = 0xFFFD                /* decoding error in UTF */
       +        Runeerror        = 0xFFFD,        /* decoding error in UTF */
       +        Runemax = 0x10FFFF        /* maximum rune value */
        };
        
        /* Edit .+1,/^$/ | cfn $PLAN9/src/lib9/utf/?*.c | grep -v static |grep -v __ */
 (DIR) diff --git a/src/cmd/9term/wind.c b/src/cmd/9term/wind.c
       t@@ -193,7 +193,7 @@ winctl(void *arg)
                Rune *rp, *bp, *up, *kbdr;
                uint qh;
                int nr, nb, c, wid, i, npart, initial, lastb;
       -        char *s, *t, part[3];
       +        char *s, *t, part[UTFmax];
                Window *w;
                Mousestate *mp, m;
                enum { WKey, WMouse, WMouseread, WCtl, WCwrite, WCread, WWread, NWALT };
 (DIR) diff --git a/src/cmd/acme/regx.c b/src/cmd/acme/regx.c
       t@@ -488,7 +488,7 @@ bldcclass(void)
                                exprp++;        /* eat '-' */
                                if((c2 = nextrec()) == ']')
                                        goto Error;
       -                        classp[n+0] = 0xFFFF;
       +                        classp[n+0] = Runemax;
                                classp[n+1] = c1;
                                classp[n+2] = c2;
                                n += 3;
       t@@ -510,7 +510,7 @@ classmatch(int classno, int c, int negate)
        
                p = class[classno];
                while(*p){
       -                if(*p == 0xFFFF){
       +                if(*p == Runemax){
                                if(p[1]<=c && c<=p[2])
                                        return !negate;
                                p += 3;
 (DIR) diff --git a/src/cmd/sam/cmd.c b/src/cmd/sam/cmd.c
       t@@ -71,7 +71,7 @@ int
        inputc(void)
        {
                int n, nbuf;
       -        char buf[3];
       +        char buf[UTFmax];
                Rune r;
        
            Again:
 (DIR) diff --git a/src/cmd/sam/regexp.c b/src/cmd/sam/regexp.c
       t@@ -494,7 +494,7 @@ bldcclass(void)
                                exprp++;        /* eat '-' */
                                if((c2 = nextrec()) == ']')
                                        goto Error;
       -                        classp[n+0] = 0xFFFF;
       +                        classp[n+0] = Runemax;
                                classp[n+1] = c1;
                                classp[n+2] = c2;
                                n += 3;
       t@@ -516,7 +516,7 @@ classmatch(int classno, int c, int negate)
        
                p = class[classno];
                while(*p){
       -                if(*p == 0xFFFF){
       +                if(*p == Runemax){
                                if(p[1]<=c && c<=p[2])
                                        return !negate;
                                p += 3;
 (DIR) diff --git a/src/cmd/sed.c b/src/cmd/sed.c
       t@@ -615,7 +615,7 @@ compsub(Rune *rhs, Rune *end)
                while ((r = *cp++) != '\0') {
                        if(r == '\\') {
                                if (rhs < end)
       -                                *rhs++ = 0xFFFF;
       +                                *rhs++ = Runemax;
                                else
                                        return 0;
                                r = *cp++;
       t@@ -1050,7 +1050,7 @@ dosub(Rune *rhsbuf)
                                sp = place(sp, loc1, loc2);
                                continue;
                        }
       -                if (c == 0xFFFF && (c = *rp++) >= '1' && c < MAXSUB+'0') {
       +                if (c == Runemax && (c = *rp++) >= '1' && c < MAXSUB+'0') {
                                n = c-'0';
                                if (subexp[n].s.rsp && subexp[n].e.rep) {
                                        sp = place(sp, subexp[n].s.rsp, subexp[n].e.rep);
 (DIR) diff --git a/src/cmd/tr.c b/src/cmd/tr.c
       t@@ -15,7 +15,7 @@ uchar        bits[] = { 1, 2, 4, 8, 16, 32, 64, 128 };
        #define        CLEARBIT(a,c)                ((a)[(c)/8] &= ~bits[(c)&07])
        #define        BITSET(a,c)                ((a)[(c)/8] & bits[(c)&07])
        
       -#define        MAXRUNE        0xFFFF
       +#define        MAXRUNE        Runemax
        
        uchar        f[(MAXRUNE+1)/8];
        uchar        t[(MAXRUNE+1)/8];
 (DIR) diff --git a/src/cmd/troff/mbwc.c b/src/cmd/troff/mbwc.c
       t@@ -152,9 +152,9 @@ wcstombs(char *s, const wchar_t *pwcs, size_t n)
                        if(p+d <= pe+3) {
                                *p++ = buf[0];
                                if(d > 1) {
       -                                *p++ = buf[2];
       +                                *p++ = buf[1];
                                        if(d > 2)
       -                                        *p++ = buf[3];
       +                                        *p++ = buf[2];
                                }
                        }
                        if(c == 0)
 (DIR) diff --git a/src/cmd/unicode.c b/src/cmd/unicode.c
       t@@ -51,13 +51,13 @@ range(char *argv[])
                                return "bad range";
                        }
                        min = strtoul(q, &q, 16);
       -                if(min<0 || min>0xFFFF || *q!='-')
       +                if(min<0 || min>Runemax || *q!='-')
                                goto err;
                        q++;
                        if(strchr(hex, *q) == 0)
                                goto err;
                        max = strtoul(q, &q, 16);
       -                if(max<0 || max>0xFFFF || max<min || *q!=0)
       +                if(max<0 || max>Runemax || max<min || *q!=0)
                                goto err;
                        i = 0;
                        do{
       t@@ -111,7 +111,7 @@ chars(char *argv[])
                                return "bad char";
                        }
                        m = strtoul(q, &q, 16);
       -                if(m<0 || m>0xFFFF || *q!=0)
       +                if(m<0 || m>Runemax || *q!=0)
                                goto err;
                        Bprint(&bout, "%C", m);
                        if(!text)
 (DIR) diff --git a/src/lib9/fmt/dofmt.c b/src/lib9/fmt/dofmt.c
       t@@ -605,12 +605,13 @@ __flagfmt(Fmt *f)
        int
        __badfmt(Fmt *f)
        {
       -        char x[3];
       +        char x[2+UTFmax];
       +        int n;
        
                x[0] = '%';
       -        x[1] = f->r;
       -        x[2] = '%';
       -        f->prec = 3;
       -        __fmtcpy(f, (const void*)x, 3, 3);
       +        n = 1 + runetochar(x+1, &f->r);
       +        x[n++] = '%';
       +        f->prec = n;
       +        __fmtcpy(f, (const void*)x, n, n);
                return 0;
        }
 (DIR) diff --git a/src/lib9/utf/rune.c b/src/lib9/utf/rune.c
       t@@ -23,16 +23,19 @@ enum
                Bit2        = 5,
                Bit3        = 4,
                Bit4        = 3,
       +        Bit5        = 2,
        
                T1        = ((1<<(Bit1+1))-1) ^ 0xFF,        /* 0000 0000 */
                Tx        = ((1<<(Bitx+1))-1) ^ 0xFF,        /* 1000 0000 */
                T2        = ((1<<(Bit2+1))-1) ^ 0xFF,        /* 1100 0000 */
                T3        = ((1<<(Bit3+1))-1) ^ 0xFF,        /* 1110 0000 */
                T4        = ((1<<(Bit4+1))-1) ^ 0xFF,        /* 1111 0000 */
       +        T5        = ((1<<(Bit5+1))-1) ^ 0xFF,        /* 1111 1000 */
        
       -        Rune1        = (1<<(Bit1+0*Bitx))-1,                /* 0000 0000 0111 1111 */
       -        Rune2        = (1<<(Bit2+1*Bitx))-1,                /* 0000 0111 1111 1111 */
       -        Rune3        = (1<<(Bit3+2*Bitx))-1,                /* 1111 1111 1111 1111 */
       +        Rune1        = (1<<(Bit1+0*Bitx))-1,                /* 0000 0000 0000 0000 0111 1111 */
       +        Rune2        = (1<<(Bit2+1*Bitx))-1,                /* 0000 0000 0000 0111 1111 1111 */
       +        Rune3        = (1<<(Bit3+2*Bitx))-1,                /* 0000 0000 1111 1111 1111 1111 */
       +        Rune4        = (1<<(Bit4+3*Bitx))-1,                /* 0001 1111 1111 1111 1111 1111 */
        
                Maskx        = (1<<Bitx)-1,                        /* 0011 1111 */
                Testx        = Maskx ^ 0xFF,                        /* 1100 0000 */
       t@@ -43,7 +46,7 @@ enum
        int
        chartorune(Rune *rune, char *str)
        {
       -        int c, c1, c2;
       +        int c, c1, c2, c3;
                long l;
        
                /*
       t@@ -89,6 +92,25 @@ chartorune(Rune *rune, char *str)
                }
        
                /*
       +         * four character sequence
       +         *        10000-10FFFF => T4 Tx Tx Tx
       +         */
       +        if(UTFmax >= 4) {
       +                c3 = *(uchar*)(str+3) ^ Tx;
       +                if(c3 & Testx)
       +                        goto bad;
       +                if(c < T5) {
       +                        l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
       +                        if(l <= Rune3)
       +                                goto bad;
       +                        if(l > Runemax)
       +                                goto bad;
       +                        *rune = l;
       +                        return 4;
       +                }
       +        }
       +
       +        /*
                 * bad decoding
                 */
        bad:
       t@@ -113,7 +135,7 @@ runetochar(char *str, Rune *rune)
        
                /*
                 * two character sequence
       -         *        0080-07FF => T2 Tx
       +         *        00080-007FF => T2 Tx
                 */
                if(c <= Rune2) {
                        str[0] = T2 | (c >> 1*Bitx);
       t@@ -123,12 +145,26 @@ runetochar(char *str, Rune *rune)
        
                /*
                 * three character sequence
       -         *        0800-FFFF => T3 Tx Tx
       +         *        00800-0FFFF => T3 Tx Tx
                 */
       -        str[0] = T3 |  (c >> 2*Bitx);
       -        str[1] = Tx | ((c >> 1*Bitx) & Maskx);
       -        str[2] = Tx |  (c & Maskx);
       -        return 3;
       +        if(c > Runemax)
       +                c = Runeerror;
       +        if(c <= Rune3) {
       +                str[0] = T3 |  (c >> 2*Bitx);
       +                str[1] = Tx | ((c >> 1*Bitx) & Maskx);
       +                str[2] = Tx |  (c & Maskx);
       +                return 3;
       +        }
       +        
       +        /*
       +         * four character sequence
       +         *        010000-1FFFFF => T4 Tx Tx Tx
       +         */
       +        str[0] = T4 |  (c >> 3*Bitx);
       +        str[1] = Tx | ((c >> 2*Bitx) & Maskx);
       +        str[2] = Tx | ((c >> 1*Bitx) & Maskx);
       +        str[3] = Tx |  (c & Maskx);
       +        return 4;
        }
        
        int
       t@@ -155,7 +191,10 @@ runenlen(Rune *r, int nrune)
                        if(c <= Rune2)
                                nb += 2;
                        else
       +                if(c <= Rune3 || c > Runemax)
                                nb += 3;
       +                else
       +                        nb += 4;
                }
                return nb;
        }
       t@@ -165,13 +204,14 @@ fullrune(char *str, int n)
        {
                int c;
        
       -        if(n > 0) {
       -                c = *(uchar*)str;
       -                if(c < Tx)
       -                        return 1;
       -                if(n > 1)
       -                        if(c < T3 || n > 2)
       -                                return 1;
       -        }
       -        return 0;
       +        if(n <= 0)
       +                return 0;
       +        c = *(uchar*)str;
       +        if(c < Tx)
       +                return 1;
       +        if(c < T3)
       +                return n >= 2;
       +        if(UTFmax == 3 || c < T4)
       +                return n >= 3;
       +        return n >= 4;
        }
 (DIR) diff --git a/src/libbio/bgetrune.c b/src/libbio/bgetrune.c
       t@@ -7,7 +7,7 @@ Bgetrune(Biobuf *bp)
        {
                int c, i;
                Rune rune;
       -        char str[4];
       +        char str[UTFmax];
        
                c = Bgetc(bp);
                if(c < Runeself) {                /* one char */
 (DIR) diff --git a/src/libbio/bputrune.c b/src/libbio/bputrune.c
       t@@ -6,7 +6,7 @@ int
        Bputrune(Biobuf *bp, long c)
        {
                Rune rune;
       -        char str[4];
       +        char str[UTFmax];
                int n;
        
                rune = c;