diff options
author | Thomas Lively <tlively@google.com> | 2024-02-21 12:10:01 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-21 20:10:01 +0000 |
commit | 39ae6cf367fced5aad3224c6bffa086b0cd0d393 (patch) | |
tree | 3a46e5f0ebc3c9d0773d4409f67f5b8e4fed3246 /src | |
parent | a2fa5598104f048bce744db57d4d9407473bcd14 (diff) | |
download | binaryen-39ae6cf367fced5aad3224c6bffa086b0cd0d393.tar.gz binaryen-39ae6cf367fced5aad3224c6bffa086b0cd0d393.tar.bz2 binaryen-39ae6cf367fced5aad3224c6bffa086b0cd0d393.zip |
Improve JSON string encoding (#6328)
Catch and report all kinds of WTF-8 encoding errors in the source strings,
including invalid leading bytes, invalid trailing bytes, unexpected ends of
strings, and invalid surrogate sequences. Insert replacement characters into the
output as necessary. Add a TODO about minimizing size by escaping only those
code points mandated to be escaped by the JSON spec. Generally improve
readability of the code.
Diffstat (limited to 'src')
-rw-r--r-- | src/support/string.cpp | 172 |
1 files changed, 103 insertions, 69 deletions
diff --git a/src/support/string.cpp b/src/support/string.cpp index 75dd1716c..c3a9ce4e4 100644 --- a/src/support/string.cpp +++ b/src/support/string.cpp @@ -142,11 +142,77 @@ std::ostream& printEscaped(std::ostream& os, const std::string_view str) { std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) { os << '"'; - for (size_t i = 0; i < str.size(); i++) { - int u0 = str[i]; - switch (u0) { - case '\t': - os << "\\t"; + constexpr uint32_t replacementCharacter = 0xFFFD; + bool lastWasLeadingSurrogate = false; + for (size_t i = 0; i < str.size();) { + // Decode from WTF-8 into a unicode code point. + uint8_t leading = str[i]; + size_t trailingBytes; + uint32_t u; + if ((leading & 0b10000000) == 0b00000000) { + // 0xxxxxxx + trailingBytes = 0; + u = leading; + } else if ((leading & 0b11100000) == 0b11000000) { + // 110xxxxx 10xxxxxx + trailingBytes = 1; + u = (leading & 0b00011111) << 6; + } else if ((leading & 0b11110000) == 0b11100000) { + // 1110xxxx 10xxxxxx 10xxxxxx + trailingBytes = 2; + u = (leading & 0b00001111) << 12; + } else if ((leading & 0b11111000) == 0b11110000) { + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + trailingBytes = 3; + u = (leading & 0b00000111) << 18; + } else { + std::cerr << "warning: Bad WTF-8 leading byte (" << std::hex + << int(leading) << std::dec << "). Replacing.\n"; + trailingBytes = 0; + u = replacementCharacter; + } + + ++i; + + if (i + trailingBytes > str.size()) { + std::cerr << "warning: Unexpected end of string. Replacing.\n"; + u = replacementCharacter; + } else { + for (size_t j = 0; j < trailingBytes; ++j) { + uint8_t trailing = str[i + j]; + if ((trailing & 0b11000000) != 0b10000000) { + std::cerr << "warning: Bad WTF-8 trailing byte (" << std::hex + << int(trailing) << std::dec << "). Replacing.\n"; + u = replacementCharacter; + break; + } + // Shift 6 bits for every remaining trailing byte after this one. + u |= (trailing & 0b00111111) << (6 * (trailingBytes - j - 1)); + } + } + + i += trailingBytes; + + bool isLeadingSurrogate = 0xD800 <= u && u <= 0xDBFF; + bool isTrailingSurrogate = 0xDC00 <= u && u <= 0xDFFF; + if (lastWasLeadingSurrogate && isTrailingSurrogate) { + std::cerr << "warning: Invalid surrogate sequence in WTF-8.\n"; + } + lastWasLeadingSurrogate = isLeadingSurrogate; + + // Encode unicode code point into JSON. + switch (u) { + case '"': + os << "\\\""; + continue; + case '\\': + os << "\\\\"; + continue; + case '\b': + os << "\\b"; + continue; + case '\f': + os << "\\f"; continue; case '\n': os << "\\n"; @@ -154,73 +220,41 @@ std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) { case '\r': os << "\\r"; continue; - case '"': - os << "\\\""; - continue; - case '\'': - os << "'"; - continue; - case '\\': - os << "\\\\"; + case '\t': + os << "\\t"; continue; - default: { - // Emit something like \u006e, the JSON escaping of a 16-bit number. - auto uEscape = [&](uint32_t v) { - if (v > 0xffff) { - std::cerr << "warning: Bad 16-bit escapee " << int(u0) << '\n'; - } - os << std::hex; - os << "\\u"; - os << ((v >> 12) & 0xf); - os << ((v >> 8) & 0xf); - os << ((v >> 4) & 0xf); - os << (v & 0xf); - os << std::dec; - }; - - // Based off of - // https://github.com/emscripten-core/emscripten/blob/59e6b8f1262d75585d8416b728e8cbb3db176fe2/src/library_strings.js#L72-L91 - if (!(u0 & 0x80)) { - if (u0 >= 32 && u0 < 127) { - // This requires no escaping at all. - os << char(u0); - } else { - uEscape(u0); - } - continue; - } - - // This uses 2 bytes. - i++; - int u1 = str[i] & 63; - if ((u0 & 0xE0) == 0xC0) { - uEscape((((u0 & 31) << 6) | u1)); - continue; - } + default: + break; + } - // This uses 3 bytes. - i++; - int u2 = str[i] & 63; - if ((u0 & 0xF0) == 0xE0) { - u0 = ((u0 & 15) << 12) | (u1 << 6) | u2; - } else { - // This uses 4 bytes. - if ((u0 & 0xF8) != 0xF0) { - std::cerr << "warning: Bad UTF-8 leading byte " << int(u0) << '\n'; - } - i++; - u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | (str[i] & 63); - } + // TODO: To minimize size, consider additionally escaping only other control + // characters (u <= 0x1F) and surrogates, emitting everything else directly + // assuming a UTF-8 encoding of the JSON text. We don't do this now because + // Print.cpp would consider the contents unprintable, messing up our test. + bool isNaivelyPrintable = 32 <= u && u < 127; + if (isNaivelyPrintable) { + assert(u < 0x80 && "need additional logic to emit valid UTF-8"); + os << uint8_t(u); + continue; + } - if (u0 < 0x10000) { - uEscape(u0); - } else { - // There are two separate code points here. - auto ch = u0 - 0x10000; - uEscape(0xD800 | (ch >> 10)); - uEscape(0xDC00 | (ch & 0x3FF)); - } - } + // Escape as '\uXXXX` for code points less than 0x10000 or as a + // '\uXXXX\uYYYY' surrogate pair otherwise. + auto printEscape = [&os](uint32_t codePoint) { + assert(codePoint < 0x10000); + os << std::hex << "\\u"; + os << ((codePoint & 0xF000) >> 12); + os << ((codePoint & 0x0F00) >> 8); + os << ((codePoint & 0x00F0) >> 4); + os << (codePoint & 0x000F); + os << std::dec; + }; + if (u < 0x10000) { + printEscape(u); + } else { + assert(u <= 0x10FFFF && "unexpectedly high code point"); + printEscape(0xD800 + ((u - 0x10000) >> 10)); + printEscape(0xDC00 + ((u - 0x10000) & 0x3FF)); } } return os << '"'; |