StringLowering: Escape the JSON in the custom section (#6316)

Also add an end-to-end test using node to verify we can parse the escaped content properly using TextDecoder+JSON.parse.
author: Alon Zakai <azakai@google.com> 2024-02-20 13:23:10 -0800
committer: GitHub <noreply@github.com> 2024-02-20 13:23:10 -0800
commit: 07b91a81c169091b2464e6d587666ad3f0124a1e (patch)
tree: cc0df6add42ef4d9f30ea7db6a4171ca8461d18d
parent: d0fa7102b5be83be6d0670a5662013bf5f8d1b64 (diff)
download: binaryen-07b91a81c169091b2464e6d587666ad3f0124a1e.tar.gz
binaryen-07b91a81c169091b2464e6d587666ad3f0124a1e.tar.bz2
binaryen-07b91a81c169091b2464e6d587666ad3f0124a1e.zip
6 files changed, 140 insertions, 16 deletions
diff --git a/src/passes/StringLowering.cpp b/src/passes/StringLowering.cpp
index d92a151c7..c70691ea1 100644
--- a/src/passes/StringLowering.cpp
+++ b/src/passes/StringLowering.cpp
@@ -37,7 +37,7 @@
 #include "ir/type-updating.h"
 #include "ir/utils.h"
 #include "pass.h"
-#include "support/json.h"
+#include "support/string.h"
 #include "wasm-builder.h"
 #include "wasm.h"
 
@@ -205,8 +205,9 @@ struct StringLowering : public StringGathering {
 
   void makeImports(Module* module) {
     Index importIndex = 0;
-    json::Value stringArray;
-    stringArray.setArray();
+    std::stringstream json;
+    json << '[';
+    bool first = true;
     std::vector<Name> importedStrings;
     for (auto& global : module->globals) {
       if (global->init) {
@@ -216,16 +217,19 @@ struct StringLowering : public StringGathering {
           importIndex++;
           global->init = nullptr;
 
-          auto str = json::Value::make(std::string(c->string.str).c_str());
-          stringArray.push_back(str);
+          if (first) {
+            first = false;
+          } else {
+            json << ',';
+          }
+          String::printEscapedJSON(json, c->string.str);
         }
       }
     }
 
     // Add a custom section with the JSON.
-    std::stringstream stream;
-    stringArray.stringify(stream);
-    auto str = stream.str();
+    json << ']';
+    auto str = json.str();
     auto vec = std::vector<char>(str.begin(), str.end());
     module->customSections.emplace_back(
       CustomSection{"string.consts", std::move(vec)});
diff --git a/src/support/json.cpp b/src/support/json.cpp
index d43ac0323..ab55cc75f 100644
--- a/src/support/json.cpp
+++ b/src/support/json.cpp
@@ -15,13 +15,13 @@
  */
 
 #include "support/json.h"
+#include "support/string.h"
 
 namespace json {
 
 void Value::stringify(std::ostream& os, bool pretty) {
   if (isString()) {
-    // TODO: escaping
-    os << '"' << getCString() << '"';
+    wasm::String::printEscapedJSON(os, getCString());
   } else if (isArray()) {
     os << '[';
     auto first = true;
diff --git a/src/support/string.cpp b/src/support/string.cpp
index 09924849b..75dd1716c 100644
--- a/src/support/string.cpp
+++ b/src/support/string.cpp
@@ -106,7 +106,7 @@ std::string trim(const std::string& input) {
   return input.substr(0, size);
 }
 
-std::ostream& printEscaped(std::ostream& os, std::string_view str) {
+std::ostream& printEscaped(std::ostream& os, const std::string_view str) {
   os << '"';
   for (unsigned char c : str) {
     switch (c) {
@@ -140,4 +140,90 @@ std::ostream& printEscaped(std::ostream& os, std::string_view str) {
   return os << '"';
 }
 
+std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) {
+  // Prints |str| (read as UTF-8) as a quoted, ASCII-only JSON string into |os|.
+  // uEscape emits something like \u006e, the JSON escaping of a 16-bit number.
+  // It writes the digits itself so the formatting flags of |os| are not changed.
+  auto uEscape = [&](uint32_t v) {
+    if (v > 0xffff) {
+      std::cerr << "warning: Bad 16-bit escapee " << v << '\n';
+    }
+    static const char digits[] = "0123456789abcdef";
+    os << "\\u" << digits[(v >> 12) & 0xf] << digits[(v >> 8) & 0xf]
+       << digits[(v >> 4) & 0xf] << digits[v & 0xf];
+  };
+  os << '"';
+  for (size_t i = 0; i < str.size(); i++) {
+    // Go through unsigned char so that bytes >= 0x80 are not sign-extended.
+    uint32_t u0 = static_cast<unsigned char>(str[i]);
+    switch (u0) {
+      case '\t':
+        os << "\\t";
+        continue;
+      case '\n':
+        os << "\\n";
+        continue;
+      case '\r':
+        os << "\\r";
+        continue;
+      case '"':
+        os << "\\\"";
+        continue;
+      case '\'':
+        os << "'";
+        continue;
+      case '\\':
+        os << "\\\\";
+        continue;
+      default: {
+        // Based off of
+        // https://github.com/emscripten-core/emscripten/blob/59e6b8f1262d75585d8416b728e8cbb3db176fe2/src/library_strings.js#L72-L91
+        if (!(u0 & 0x80)) {
+          if (u0 >= 32 && u0 < 127) {
+            // This requires no escaping at all.
+            os << char(u0);
+          } else {
+            uEscape(u0);
+          }
+          continue;
+        }
+
+        // The leading byte of a multi-byte sequence gives its total length.
+        size_t length = 4;
+        if ((u0 & 0xE0) == 0xC0) {
+          length = 2;
+        } else if ((u0 & 0xF0) == 0xE0) {
+          length = 3;
+        } else if ((u0 & 0xF8) != 0xF0) {
+          std::cerr << "warning: Bad UTF-8 leading byte " << u0 << '\n';
+        }
+
+        // Never read past the end: a truncated sequence becomes U+FFFD.
+        if (str.size() - i < length) {
+          std::cerr << "warning: Truncated UTF-8 sequence\n";
+          uEscape(0xFFFD);
+          return os << '"';
+        }
+
+        // The leading byte has 7 - length payload bits, the others 6 each.
+        u0 &= 0x7F >> length;
+        for (size_t j = 1; j < length; j++) {
+          i++;
+          u0 = (u0 << 6) | (static_cast<unsigned char>(str[i]) & 63);
+        }
+
+        if (u0 < 0x10000) {
+          uEscape(u0);
+        } else {
+          // A code point above U+FFFF is written as a UTF-16 surrogate pair.
+          auto ch = u0 - 0x10000;
+          uEscape(0xD800 | (ch >> 10));
+          uEscape(0xDC00 | (ch & 0x3FF));
+        }
+      }
+    }
+  }
+  return os << '"';
+}
+
 } // namespace wasm::String
diff --git a/src/support/string.h b/src/support/string.h
index 8ab0ae1a6..6fb3f693b 100644
--- a/src/support/string.h
+++ b/src/support/string.h
@@ -75,7 +75,9 @@ inline bool isNumber(const std::string& str) {
   return !str.empty() && std::all_of(str.begin(), str.end(), ::isdigit);
 }
 
-std::ostream& printEscaped(std::ostream& os, std::string_view str);
+std::ostream& printEscaped(std::ostream& os, const std::string_view str);
+
+std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str);
 
 } // namespace wasm::String
 
diff --git a/test/lit/passes/string-lowering.js b/test/lit/passes/string-lowering.js
new file mode 100644
index 000000000..105970bcc
--- /dev/null
+++ b/test/lit/passes/string-lowering.js
@@ -0,0 +1,17 @@
+var filename = process.argv[2]; // Path of the wasm binary to inspect.
+var module = new WebAssembly.Module(require('fs').readFileSync(filename));
+var sections = WebAssembly.Module.customSections(module, 'string.consts');
+var array = new Uint8Array(sections[0]); // Only the first match is looked at.
+var string = new TextDecoder('utf-8').decode(array);
+
+function standardizeEncoding(s) {
+  // Different node.js versions print differently, so we must standardize to
+  // pass tests in all places. In particular at some point node.js started to
+  // abbreviate \u0000 as \x00. The /g flag rewrites every occurrence in |s|.
+  return s.replace(/\\u0000/g, '\\x00');
+}
+
+console.log("string:", standardizeEncoding(string)); // The decoded section text.
+
+var json = JSON.stringify(JSON.parse(string)); // Parse, then serialize again.
+console.log("JSON:", standardizeEncoding(json));
diff --git a/test/lit/passes/string-lowering.wast b/test/lit/passes/string-lowering.wast
index 628092d1c..43d30fc64 100644
--- a/test/lit/passes/string-lowering.wast
+++ b/test/lit/passes/string-lowering.wast
@@ -2,8 +2,6 @@
 ;; operations are tested in string-gathering.wast (which is auto-updated, unlike
 ;; this which is manual).
 
-;; RUN: foreach %s %t wasm-opt --string-lowering -all -S -o - | filecheck %s
-
 (module
   (func $consts
     (drop
@@ -15,9 +13,26 @@
     (drop
       (string.const "foo")
     )
+    (drop
+      (string.const "needs\tescaping\00.'#%\"- .\r\n\\.ꙮ")
+    )
   )
 )
 
-;; The custom section should contain foo and bar, and foo only once.
-;; CHECK: custom section "string.consts", size 13, contents: "[\"bar\",\"foo\"]"
+;; The custom section should contain foo and bar, and foo only once, and the
+;; string with \t should be escaped.
+;;
+;; RUN: wasm-opt %s --string-lowering -all -S -o - | filecheck %s
+;;
+;; CHECK: custom section "string.consts", size 59, contents: "[\"bar\",\"foo\",\"needs\\tescaping\\u0000.'#%\\\"- .\\r\\n\\\\.\\ua66e\"]"
+
+;; The custom section should parse OK using JSON.parse from node.
+;; (Note we run --remove-unused-module-elements to remove externref-using
+;; imports, which require a newer version of node.)
+;;
+;; RUN: wasm-opt %s --string-lowering --remove-unused-module-elements -all -o %t.wasm
+;; RUN: node %S/string-lowering.js %t.wasm | filecheck %s --check-prefix=CHECK-JS
+;;
+;; CHECK-JS: string: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\.\ua66e"]
+;; CHECK-JS: JSON: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\.ꙮ"]
author	Alon Zakai <azakai@google.com>	2024-02-20 13:23:10 -0800
committer	GitHub <noreply@github.com>	2024-02-20 13:23:10 -0800
commit	07b91a81c169091b2464e6d587666ad3f0124a1e (patch)
tree	cc0df6add42ef4d9f30ea7db6a4171ca8461d18d
parent	d0fa7102b5be83be6d0670a5662013bf5f8d1b64 (diff)
download	binaryen-07b91a81c169091b2464e6d587666ad3f0124a1e.tar.gz binaryen-07b91a81c169091b2464e6d587666ad3f0124a1e.tar.bz2 binaryen-07b91a81c169091b2464e6d587666ad3f0124a1e.zip