codemadness.org

       add support for high-low surrogates and UTF-16 decoding - json2tsv - JSON to TSV converter
 (HTM) git clone git://git.codemadness.org/json2tsv
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 933582372d81a911193fb1da7c86b6b960432535
 (DIR) parent 922491e0343ab6f440024803921daf843b0e9cf5
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sun, 13 Oct 2019 21:31:31 +0200
       
       add support for high-low surrogates and UTF-16 decoding
       
       Seen in the wild in a Reddit JSON file that encodes emojis as UTF-16
       surrogate pairs. This is also described in RFC 7159, section 7 (Strings).
       
       Diffstat:
         M json2tsv.c                          |      25 ++++++++++++++++++++++++-
       
       1 file changed, 24 insertions(+), 1 deletion(-)
       ---
 (DIR) diff --git a/json2tsv.c b/json2tsv.c
       @@ -107,7 +107,7 @@ parsejson(void (*cb)(struct json_node *, size_t, const char *), const char **err
        {
                struct json_node nodes[JSON_MAX_NODE_DEPTH] = { 0 };
                size_t depth = 0, v = 0, vz = 0;
       -        long cp;
       +        long cp, hi, lo;
                int c, i, escape, ret = -1;
                char *value = NULL;
        
       @@ -164,6 +164,29 @@ parsejson(void (*cb)(struct json_node *, size_t, const char *), const char **err
                                                                }
                                                                cp |= (hexdigit(c) << i);
                                                        }
       +                                                /* See also:
       +                                                 * RFC7159 - 7. Strings and
       +                                                 * https://unicode.org/faq/utf_bom.html#utf8-4
       +                                                 * 0xd800 - 0xdb7f - high surrogates (no private use range) */
       +                                                if (cp >= 0xd800 && cp <= 0xdb7f) {
       +                                                        if (GETNEXT() != '\\' || GETNEXT() != 'u') {
       +                                                                *errstr = "invalid codepoint";
       +                                                                goto end;
       +                                                        }
       +                                                        for (hi = cp, i = 12, lo = 0; i >= 0; i -= 4) {
       +                                                                if ((c = GETNEXT()) == EOF || !isxdigit(c)) {
       +                                                                        *errstr = "invalid codepoint";
       +                                                                        goto end;
       +                                                                }
       +                                                                lo |= (hexdigit(c) << i);
       +                                                        }
       +                                                        /* 0xdc00 - 0xdfff - low surrogates: must follow after high surrogate */
       +                                                        if (!(lo >= 0xdc00 && lo <= 0xdfff)) {
       +                                                                *errstr = "invalid codepoint";
       +                                                                goto end;
       +                                                        }
       +                                                        cp = (hi << 10) + (0xDC00 + (lo & 0x3FF)) - 56613888;
       +                                                }
                                                        if (capacity(&value, &vz, v, 5) == -1)
                                                                goto end;
                                                        v += codepointtoutf8(cp, &value[v]);