diff options
-rw-r--r-- | src/support/string.cpp | 172 | ||||
-rw-r--r-- | test/lit/passes/string-lowering.wast | 21 |
2 files changed, 119 insertions, 74 deletions
diff --git a/src/support/string.cpp b/src/support/string.cpp index 75dd1716c..c3a9ce4e4 100644 --- a/src/support/string.cpp +++ b/src/support/string.cpp @@ -142,11 +142,77 @@ std::ostream& printEscaped(std::ostream& os, const std::string_view str) { std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) { os << '"'; - for (size_t i = 0; i < str.size(); i++) { - int u0 = str[i]; - switch (u0) { - case '\t': - os << "\\t"; + constexpr uint32_t replacementCharacter = 0xFFFD; + bool lastWasLeadingSurrogate = false; + for (size_t i = 0; i < str.size();) { + // Decode from WTF-8 into a unicode code point. + uint8_t leading = str[i]; + size_t trailingBytes; + uint32_t u; + if ((leading & 0b10000000) == 0b00000000) { + // 0xxxxxxx + trailingBytes = 0; + u = leading; + } else if ((leading & 0b11100000) == 0b11000000) { + // 110xxxxx 10xxxxxx + trailingBytes = 1; + u = (leading & 0b00011111) << 6; + } else if ((leading & 0b11110000) == 0b11100000) { + // 1110xxxx 10xxxxxx 10xxxxxx + trailingBytes = 2; + u = (leading & 0b00001111) << 12; + } else if ((leading & 0b11111000) == 0b11110000) { + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + trailingBytes = 3; + u = (leading & 0b00000111) << 18; + } else { + std::cerr << "warning: Bad WTF-8 leading byte (" << std::hex + << int(leading) << std::dec << "). Replacing.\n"; + trailingBytes = 0; + u = replacementCharacter; + } + + ++i; + + if (i + trailingBytes > str.size()) { + std::cerr << "warning: Unexpected end of string. Replacing.\n"; + u = replacementCharacter; + } else { + for (size_t j = 0; j < trailingBytes; ++j) { + uint8_t trailing = str[i + j]; + if ((trailing & 0b11000000) != 0b10000000) { + std::cerr << "warning: Bad WTF-8 trailing byte (" << std::hex + << int(trailing) << std::dec << "). Replacing.\n"; + u = replacementCharacter; + break; + } + // Shift 6 bits for every remaining trailing byte after this one. + u |= (trailing & 0b00111111) << (6 * (trailingBytes - j - 1)); + } + } + + i += trailingBytes; + + bool isLeadingSurrogate = 0xD800 <= u && u <= 0xDBFF; + bool isTrailingSurrogate = 0xDC00 <= u && u <= 0xDFFF; + if (lastWasLeadingSurrogate && isTrailingSurrogate) { + std::cerr << "warning: Invalid surrogate sequence in WTF-8.\n"; + } + lastWasLeadingSurrogate = isLeadingSurrogate; + + // Encode unicode code point into JSON. + switch (u) { + case '"': + os << "\\\""; + continue; + case '\\': + os << "\\\\"; + continue; + case '\b': + os << "\\b"; + continue; + case '\f': + os << "\\f"; continue; case '\n': os << "\\n"; @@ -154,73 +220,41 @@ std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) { case '\r': os << "\\r"; continue; - case '"': - os << "\\\""; - continue; - case '\'': - os << "'"; - continue; - case '\\': - os << "\\\\"; + case '\t': + os << "\\t"; continue; - default: { - // Emit something like \u006e, the JSON escaping of a 16-bit number. - auto uEscape = [&](uint32_t v) { - if (v > 0xffff) { - std::cerr << "warning: Bad 16-bit escapee " << int(u0) << '\n'; - } - os << std::hex; - os << "\\u"; - os << ((v >> 12) & 0xf); - os << ((v >> 8) & 0xf); - os << ((v >> 4) & 0xf); - os << (v & 0xf); - os << std::dec; - }; - - // Based off of - // https://github.com/emscripten-core/emscripten/blob/59e6b8f1262d75585d8416b728e8cbb3db176fe2/src/library_strings.js#L72-L91 - if (!(u0 & 0x80)) { - if (u0 >= 32 && u0 < 127) { - // This requires no escaping at all. - os << char(u0); - } else { - uEscape(u0); - } - continue; - } - - // This uses 2 bytes. - i++; - int u1 = str[i] & 63; - if ((u0 & 0xE0) == 0xC0) { - uEscape((((u0 & 31) << 6) | u1)); - continue; - } + default: + break; + } - // This uses 3 bytes. - i++; - int u2 = str[i] & 63; - if ((u0 & 0xF0) == 0xE0) { - u0 = ((u0 & 15) << 12) | (u1 << 6) | u2; - } else { - // This uses 4 bytes. - if ((u0 & 0xF8) != 0xF0) { - std::cerr << "warning: Bad UTF-8 leading byte " << int(u0) << '\n'; - } - i++; - u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | (str[i] & 63); - } + // TODO: To minimize size, consider additionally escaping only other control + // characters (u <= 0x1F) and surrogates, emitting everything else directly + // assuming a UTF-8 encoding of the JSON text. We don't do this now because + // Print.cpp would consider the contents unprintable, messing up our test. + bool isNaivelyPrintable = 32 <= u && u < 127; + if (isNaivelyPrintable) { + assert(u < 0x80 && "need additional logic to emit valid UTF-8"); + os << uint8_t(u); + continue; + } - if (u0 < 0x10000) { - uEscape(u0); - } else { - // There are two separate code points here. - auto ch = u0 - 0x10000; - uEscape(0xD800 | (ch >> 10)); - uEscape(0xDC00 | (ch & 0x3FF)); - } - } + // Escape as '\uXXXX` for code points less than 0x10000 or as a + // '\uXXXX\uYYYY' surrogate pair otherwise. + auto printEscape = [&os](uint32_t codePoint) { + assert(codePoint < 0x10000); + os << std::hex << "\\u"; + os << ((codePoint & 0xF000) >> 12); + os << ((codePoint & 0x0F00) >> 8); + os << ((codePoint & 0x00F0) >> 4); + os << (codePoint & 0x000F); + os << std::dec; + }; + if (u < 0x10000) { + printEscape(u); + } else { + assert(u <= 0x10FFFF && "unexpectedly high code point"); + printEscape(0xD800 + ((u - 0x10000) >> 10)); + printEscape(0xDC00 + ((u - 0x10000) & 0x3FF)); } } return os << '"'; diff --git a/test/lit/passes/string-lowering.wast b/test/lit/passes/string-lowering.wast index 43d30fc64..f7f47871b 100644 --- a/test/lit/passes/string-lowering.wast +++ b/test/lit/passes/string-lowering.wast @@ -14,7 +14,19 @@ (string.const "foo") ) (drop - (string.const "needs\tescaping\00.'#%\"- .\r\n\\.ꙮ") + (string.const "needs\tescaping\00.'#%\"- .\r\n\\08\0C\0A\0D\09.ꙮ") + ) + (drop + (string.const "invalid WTF-8 leading byte \FF") + ) + (drop + (string.const "invalid trailing byte \C0\00") + ) + (drop + (string.const "unexpected end \C0") + ) + (drop + (string.const "invalid surrogate sequence \ED\A0\81\ED\B0\B7") ) ) ) @@ -24,7 +36,7 @@ ;; ;; RUN: wasm-opt %s --string-lowering -all -S -o - | filecheck %s ;; -;; CHECK: custom section "string.consts", size 59, contents: "[\"bar\",\"foo\",\"needs\\tescaping\\u0000.'#%\\\"- .\\r\\n\\\\.\\ua66e\"]" +;; CHECK: custom section "string.consts", size 202, contents: "[\"bar\",\"foo\",\"invalid WTF-8 leading byte \\ufffd\",\"invalid surrogate sequence \\ud801\\udc37\",\"invalid trailing byte \\ufffd\",\"needs\\tescaping\\u0000.'#%\\\"- .\\r\\n\\\\08\\f\\n\\r\\t.\\ua66e\",\"unexpected end \\ufffd\"]" ;; The custom section should parse OK using JSON.parse from node. ;; (Note we run --remove-unused-module-elements to remove externref-using @@ -33,6 +45,5 @@ ;; RUN: wasm-opt %s --string-lowering --remove-unused-module-elements -all -o %t.wasm ;; RUN: node %S/string-lowering.js %t.wasm | filecheck %s --check-prefix=CHECK-JS ;; -;; CHECK-JS: string: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\.\ua66e"] -;; CHECK-JS: JSON: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\.ꙮ"] - +;; CHECK-JS: string: ["bar","foo","invalid WTF-8 leading byte \ufffd","invalid surrogate sequence \ud801\udc37","invalid trailing byte \ufffd","needs\tescaping\x00.'#%\"- .\r\n\\08\f\n\r\t.\ua66e","unexpected end \ufffd"] +;; CHECK-JS: JSON: ["bar","foo","invalid WTF-8 leading byte �","invalid surrogate sequence 𐐷","invalid trailing byte �","needs\tescaping\x00.'#%\"- .\r\n\\08\f\n\r\t.ꙮ","unexpected end �"] |