Improve JSON string encoding (#6328)

Catch and report all kinds of WTF-8 encoding errors in the source strings, including invalid leading bytes, invalid trailing bytes, unexpected ends of strings, and invalid surrogate sequences. Insert replacement characters into the output as necessary. Add a TODO about minimizing size by escaping only those code points mandated to be escaped by the JSON spec. Generally improve readability of the code.
author: Thomas Lively <tlively@google.com> 2024-02-21 12:10:01 -0800
committer: GitHub <noreply@github.com> 2024-02-21 20:10:01 +0000
commit: 39ae6cf367fced5aad3224c6bffa086b0cd0d393 (patch)
tree: 3a46e5f0ebc3c9d0773d4409f67f5b8e4fed3246 /src
parent: a2fa5598104f048bce744db57d4d9407473bcd14 (diff)
download: binaryen-39ae6cf367fced5aad3224c6bffa086b0cd0d393.tar.gz
binaryen-39ae6cf367fced5aad3224c6bffa086b0cd0d393.tar.bz2
binaryen-39ae6cf367fced5aad3224c6bffa086b0cd0d393.zip
1 file changed, 103 insertions, 69 deletions
diff --git a/src/support/string.cpp b/src/support/string.cpp
index 75dd1716c..c3a9ce4e4 100644
--- a/src/support/string.cpp
+++ b/src/support/string.cpp
@@ -142,11 +142,77 @@ std::ostream& printEscaped(std::ostream& os, const std::string_view str) {
 
 std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) {
   os << '"';
-  for (size_t i = 0; i < str.size(); i++) {
-    int u0 = str[i];
-    switch (u0) {
-      case '\t':
-        os << "\\t";
+  constexpr uint32_t replacementCharacter = 0xFFFD;
+  bool lastWasLeadingSurrogate = false;
+  for (size_t i = 0; i < str.size();) {
+    // Decode from WTF-8 into a unicode code point.
+    uint8_t leading = str[i];
+    size_t trailingBytes;
+    uint32_t u;
+    if ((leading & 0b10000000) == 0b00000000) {
+      // 0xxxxxxx
+      trailingBytes = 0;
+      u = leading;
+    } else if ((leading & 0b11100000) == 0b11000000) {
+      // 110xxxxx 10xxxxxx
+      trailingBytes = 1;
+      u = (leading & 0b00011111) << 6;
+    } else if ((leading & 0b11110000) == 0b11100000) {
+      // 1110xxxx 10xxxxxx 10xxxxxx
+      trailingBytes = 2;
+      u = (leading & 0b00001111) << 12;
+    } else if ((leading & 0b11111000) == 0b11110000) {
+      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      trailingBytes = 3;
+      u = (leading & 0b00000111) << 18;
+    } else {
+      std::cerr << "warning: Bad WTF-8 leading byte (" << std::hex
+                << int(leading) << std::dec << "). Replacing.\n";
+      trailingBytes = 0;
+      u = replacementCharacter;
+    }
+
+    ++i;
+
+    if (i + trailingBytes > str.size()) {
+      std::cerr << "warning: Unexpected end of string. Replacing.\n";
+      u = replacementCharacter;
+    } else {
+      for (size_t j = 0; j < trailingBytes; ++j) {
+        uint8_t trailing = str[i + j];
+        if ((trailing & 0b11000000) != 0b10000000) {
+          std::cerr << "warning: Bad WTF-8 trailing byte (" << std::hex
+                    << int(trailing) << std::dec << "). Replacing.\n";
+          u = replacementCharacter;
+          break;
+        }
+        // Shift 6 bits for every remaining trailing byte after this one.
+        u |= (trailing & 0b00111111) << (6 * (trailingBytes - j - 1));
+      }
+    }
+
+    i += trailingBytes;
+
+    bool isLeadingSurrogate = 0xD800 <= u && u <= 0xDBFF;
+    bool isTrailingSurrogate = 0xDC00 <= u && u <= 0xDFFF;
+    if (lastWasLeadingSurrogate && isTrailingSurrogate) {
+      std::cerr << "warning: Invalid surrogate sequence in WTF-8.\n";
+    }
+    lastWasLeadingSurrogate = isLeadingSurrogate;
+
+    // Encode unicode code point into JSON.
+    switch (u) {
+      case '"':
+        os << "\\\"";
+        continue;
+      case '\\':
+        os << "\\\\";
+        continue;
+      case '\b':
+        os << "\\b";
+        continue;
+      case '\f':
+        os << "\\f";
         continue;
       case '\n':
         os << "\\n";
@@ -154,73 +220,41 @@ std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) {
       case '\r':
         os << "\\r";
         continue;
-      case '"':
-        os << "\\\"";
-        continue;
-      case '\'':
-        os << "'";
-        continue;
-      case '\\':
-        os << "\\\\";
+      case '\t':
+        os << "\\t";
         continue;
-      default: {
-        // Emit something like \u006e, the JSON escaping of a 16-bit number.
-        auto uEscape = [&](uint32_t v) {
-          if (v > 0xffff) {
-            std::cerr << "warning: Bad 16-bit escapee " << int(u0) << '\n';
-          }
-          os << std::hex;
-          os << "\\u";
-          os << ((v >> 12) & 0xf);
-          os << ((v >> 8) & 0xf);
-          os << ((v >> 4) & 0xf);
-          os << (v & 0xf);
-          os << std::dec;
-        };
-
-        // Based off of
-        // https://github.com/emscripten-core/emscripten/blob/59e6b8f1262d75585d8416b728e8cbb3db176fe2/src/library_strings.js#L72-L91
-        if (!(u0 & 0x80)) {
-          if (u0 >= 32 && u0 < 127) {
-            // This requires no escaping at all.
-            os << char(u0);
-          } else {
-            uEscape(u0);
-          }
-          continue;
-        }
-
-        // This uses 2 bytes.
-        i++;
-        int u1 = str[i] & 63;
-        if ((u0 & 0xE0) == 0xC0) {
-          uEscape((((u0 & 31) << 6) | u1));
-          continue;
-        }
+      default:
+        break;
+    }
 
-        // This uses 3 bytes.
-        i++;
-        int u2 = str[i] & 63;
-        if ((u0 & 0xF0) == 0xE0) {
-          u0 = ((u0 & 15) << 12) | (u1 << 6) | u2;
-        } else {
-          // This uses 4 bytes.
-          if ((u0 & 0xF8) != 0xF0) {
-            std::cerr << "warning: Bad UTF-8 leading byte " << int(u0) << '\n';
-          }
-          i++;
-          u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | (str[i] & 63);
-        }
+    // TODO: To minimize size, consider additionally escaping only other control
+    // characters (u <= 0x1F) and surrogates, emitting everything else directly
+    // assuming a UTF-8 encoding of the JSON text. We don't do this now because
+    // Print.cpp would consider the contents unprintable, messing up our test.
+    bool isNaivelyPrintable = 32 <= u && u < 127;
+    if (isNaivelyPrintable) {
+      assert(u < 0x80 && "need additional logic to emit valid UTF-8");
+      os << uint8_t(u);
+      continue;
+    }
 
-        if (u0 < 0x10000) {
-          uEscape(u0);
-        } else {
-          // There are two separate code points here.
-          auto ch = u0 - 0x10000;
-          uEscape(0xD800 | (ch >> 10));
-          uEscape(0xDC00 | (ch & 0x3FF));
-        }
-      }
+    // Escape as '\uXXXX' for code points less than 0x10000 or as a
+    // '\uXXXX\uYYYY' surrogate pair otherwise.
+    auto printEscape = [&os](uint32_t codePoint) {
+      assert(codePoint < 0x10000);
+      os << std::hex << "\\u";
+      os << ((codePoint & 0xF000) >> 12);
+      os << ((codePoint & 0x0F00) >> 8);
+      os << ((codePoint & 0x00F0) >> 4);
+      os << (codePoint & 0x000F);
+      os << std::dec;
+    };
+    if (u < 0x10000) {
+      printEscape(u);
+    } else {
+      assert(u <= 0x10FFFF && "unexpectedly high code point");
+      printEscape(0xD800 + ((u - 0x10000) >> 10));
+      printEscape(0xDC00 + ((u - 0x10000) & 0x3FF));
     }
   }
   return os << '"';
author	Thomas Lively <tlively@google.com>	2024-02-21 12:10:01 -0800
committer	GitHub <noreply@github.com>	2024-02-21 20:10:01 +0000
commit	39ae6cf367fced5aad3224c6bffa086b0cd0d393 (patch)
tree	3a46e5f0ebc3c9d0773d4409f67f5b8e4fed3246 /src
parent	a2fa5598104f048bce744db57d4d9407473bcd14 (diff)
download	binaryen-39ae6cf367fced5aad3224c6bffa086b0cd0d393.tar.gz binaryen-39ae6cf367fced5aad3224c6bffa086b0cd0d393.tar.bz2 binaryen-39ae6cf367fced5aad3224c6bffa086b0cd0d393.zip