summaryrefslogtreecommitdiff
path: root/src/support/string.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/support/string.cpp')
-rw-r--r--src/support/string.cpp172
1 files changed, 103 insertions, 69 deletions
diff --git a/src/support/string.cpp b/src/support/string.cpp
index 75dd1716c..c3a9ce4e4 100644
--- a/src/support/string.cpp
+++ b/src/support/string.cpp
@@ -142,11 +142,77 @@ std::ostream& printEscaped(std::ostream& os, const std::string_view str) {
std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) {
os << '"';
- for (size_t i = 0; i < str.size(); i++) {
- int u0 = str[i];
- switch (u0) {
- case '\t':
- os << "\\t";
+ constexpr uint32_t replacementCharacter = 0xFFFD;
+ bool lastWasLeadingSurrogate = false;
+ for (size_t i = 0; i < str.size();) {
+ // Decode from WTF-8 into a unicode code point.
+ uint8_t leading = str[i];
+ size_t trailingBytes;
+ uint32_t u;
+ if ((leading & 0b10000000) == 0b00000000) {
+ // 0xxxxxxx
+ trailingBytes = 0;
+ u = leading;
+ } else if ((leading & 0b11100000) == 0b11000000) {
+ // 110xxxxx 10xxxxxx
+ trailingBytes = 1;
+ u = (leading & 0b00011111) << 6;
+ } else if ((leading & 0b11110000) == 0b11100000) {
+ // 1110xxxx 10xxxxxx 10xxxxxx
+ trailingBytes = 2;
+ u = (leading & 0b00001111) << 12;
+ } else if ((leading & 0b11111000) == 0b11110000) {
+ // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ trailingBytes = 3;
+ u = (leading & 0b00000111) << 18;
+ } else {
+ std::cerr << "warning: Bad WTF-8 leading byte (" << std::hex
+ << int(leading) << std::dec << "). Replacing.\n";
+ trailingBytes = 0;
+ u = replacementCharacter;
+ }
+
+ ++i;
+
+ if (i + trailingBytes > str.size()) {
+ std::cerr << "warning: Unexpected end of string. Replacing.\n";
+ u = replacementCharacter;
+ } else {
+ for (size_t j = 0; j < trailingBytes; ++j) {
+ uint8_t trailing = str[i + j];
+ if ((trailing & 0b11000000) != 0b10000000) {
+ std::cerr << "warning: Bad WTF-8 trailing byte (" << std::hex
+ << int(trailing) << std::dec << "). Replacing.\n";
+ u = replacementCharacter;
+ break;
+ }
+ // Shift 6 bits for every remaining trailing byte after this one.
+ u |= (trailing & 0b00111111) << (6 * (trailingBytes - j - 1));
+ }
+ }
+
+ i += trailingBytes;
+
+ bool isLeadingSurrogate = 0xD800 <= u && u <= 0xDBFF;
+ bool isTrailingSurrogate = 0xDC00 <= u && u <= 0xDFFF;
+ if (lastWasLeadingSurrogate && isTrailingSurrogate) {
+ std::cerr << "warning: Invalid surrogate sequence in WTF-8.\n";
+ }
+ lastWasLeadingSurrogate = isLeadingSurrogate;
+
+ // Encode unicode code point into JSON.
+ switch (u) {
+ case '"':
+ os << "\\\"";
+ continue;
+ case '\\':
+ os << "\\\\";
+ continue;
+ case '\b':
+ os << "\\b";
+ continue;
+ case '\f':
+ os << "\\f";
continue;
case '\n':
os << "\\n";
@@ -154,73 +220,41 @@ std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) {
case '\r':
os << "\\r";
continue;
- case '"':
- os << "\\\"";
- continue;
- case '\'':
- os << "'";
- continue;
- case '\\':
- os << "\\\\";
+ case '\t':
+ os << "\\t";
continue;
- default: {
- // Emit something like \u006e, the JSON escaping of a 16-bit number.
- auto uEscape = [&](uint32_t v) {
- if (v > 0xffff) {
- std::cerr << "warning: Bad 16-bit escapee " << int(u0) << '\n';
- }
- os << std::hex;
- os << "\\u";
- os << ((v >> 12) & 0xf);
- os << ((v >> 8) & 0xf);
- os << ((v >> 4) & 0xf);
- os << (v & 0xf);
- os << std::dec;
- };
-
- // Based off of
- // https://github.com/emscripten-core/emscripten/blob/59e6b8f1262d75585d8416b728e8cbb3db176fe2/src/library_strings.js#L72-L91
- if (!(u0 & 0x80)) {
- if (u0 >= 32 && u0 < 127) {
- // This requires no escaping at all.
- os << char(u0);
- } else {
- uEscape(u0);
- }
- continue;
- }
-
- // This uses 2 bytes.
- i++;
- int u1 = str[i] & 63;
- if ((u0 & 0xE0) == 0xC0) {
- uEscape((((u0 & 31) << 6) | u1));
- continue;
- }
+ default:
+ break;
+ }
- // This uses 3 bytes.
- i++;
- int u2 = str[i] & 63;
- if ((u0 & 0xF0) == 0xE0) {
- u0 = ((u0 & 15) << 12) | (u1 << 6) | u2;
- } else {
- // This uses 4 bytes.
- if ((u0 & 0xF8) != 0xF0) {
- std::cerr << "warning: Bad UTF-8 leading byte " << int(u0) << '\n';
- }
- i++;
- u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | (str[i] & 63);
- }
+ // TODO: To minimize size, consider additionally escaping only other control
+ // characters (u <= 0x1F) and surrogates, emitting everything else directly
+ // assuming a UTF-8 encoding of the JSON text. We don't do this now because
+ // Print.cpp would consider the contents unprintable, messing up our test.
+ bool isNaivelyPrintable = 32 <= u && u < 127;
+ if (isNaivelyPrintable) {
+ assert(u < 0x80 && "need additional logic to emit valid UTF-8");
+ os << uint8_t(u);
+ continue;
+ }
- if (u0 < 0x10000) {
- uEscape(u0);
- } else {
- // There are two separate code points here.
- auto ch = u0 - 0x10000;
- uEscape(0xD800 | (ch >> 10));
- uEscape(0xDC00 | (ch & 0x3FF));
- }
- }
+ // Escape as '\uXXXX` for code points less than 0x10000 or as a
+ // '\uXXXX\uYYYY' surrogate pair otherwise.
+ auto printEscape = [&os](uint32_t codePoint) {
+ assert(codePoint < 0x10000);
+ os << std::hex << "\\u";
+ os << ((codePoint & 0xF000) >> 12);
+ os << ((codePoint & 0x0F00) >> 8);
+ os << ((codePoint & 0x00F0) >> 4);
+ os << (codePoint & 0x000F);
+ os << std::dec;
+ };
+ if (u < 0x10000) {
+ printEscape(u);
+ } else {
+ assert(u <= 0x10FFFF && "unexpectedly high code point");
+ printEscape(0xD800 + ((u - 0x10000) >> 10));
+ printEscape(0xDC00 + ((u - 0x10000) & 0x3FF));
}
}
return os << '"';