summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Alon Zakai <azakai@google.com> 2024-02-20 13:23:10 -0800
committer: GitHub <noreply@github.com> 2024-02-20 13:23:10 -0800
commit: 07b91a81c169091b2464e6d587666ad3f0124a1e (patch)
tree: cc0df6add42ef4d9f30ea7db6a4171ca8461d18d
parent: d0fa7102b5be83be6d0670a5662013bf5f8d1b64 (diff)
download: binaryen-07b91a81c169091b2464e6d587666ad3f0124a1e.tar.gz
binaryen-07b91a81c169091b2464e6d587666ad3f0124a1e.tar.bz2
binaryen-07b91a81c169091b2464e6d587666ad3f0124a1e.zip
StringLowering: Escape the JSON in the custom section (#6316)
Also add an end-to-end test using node to verify we can parse the escaped content properly using TextDecoder+JSON.parse.
-rw-r--r-- src/passes/StringLowering.cpp 20
-rw-r--r-- src/support/json.cpp 4
-rw-r--r-- src/support/string.cpp 88
-rw-r--r-- src/support/string.h 4
-rw-r--r-- test/lit/passes/string-lowering.js 17
-rw-r--r-- test/lit/passes/string-lowering.wast 23
6 files changed, 140 insertions, 16 deletions
diff --git a/src/passes/StringLowering.cpp b/src/passes/StringLowering.cpp
index d92a151c7..c70691ea1 100644
--- a/src/passes/StringLowering.cpp
+++ b/src/passes/StringLowering.cpp
@@ -37,7 +37,7 @@
#include "ir/type-updating.h"
#include "ir/utils.h"
#include "pass.h"
-#include "support/json.h"
+#include "support/string.h"
#include "wasm-builder.h"
#include "wasm.h"
@@ -205,8 +205,9 @@ struct StringLowering : public StringGathering {
void makeImports(Module* module) {
Index importIndex = 0;
- json::Value stringArray;
- stringArray.setArray();
+ std::stringstream json;
+ json << '[';
+ bool first = true;
std::vector<Name> importedStrings;
for (auto& global : module->globals) {
if (global->init) {
@@ -216,16 +217,19 @@ struct StringLowering : public StringGathering {
importIndex++;
global->init = nullptr;
- auto str = json::Value::make(std::string(c->string.str).c_str());
- stringArray.push_back(str);
+ if (first) {
+ first = false;
+ } else {
+ json << ',';
+ }
+ String::printEscapedJSON(json, c->string.str);
}
}
}
// Add a custom section with the JSON.
- std::stringstream stream;
- stringArray.stringify(stream);
- auto str = stream.str();
+ json << ']';
+ auto str = json.str();
auto vec = std::vector<char>(str.begin(), str.end());
module->customSections.emplace_back(
CustomSection{"string.consts", std::move(vec)});
diff --git a/src/support/json.cpp b/src/support/json.cpp
index d43ac0323..ab55cc75f 100644
--- a/src/support/json.cpp
+++ b/src/support/json.cpp
@@ -15,13 +15,13 @@
*/
#include "support/json.h"
+#include "support/string.h"
namespace json {
void Value::stringify(std::ostream& os, bool pretty) {
if (isString()) {
- // TODO: escaping
- os << '"' << getCString() << '"';
+ wasm::String::printEscapedJSON(os, getCString());
} else if (isArray()) {
os << '[';
auto first = true;
diff --git a/src/support/string.cpp b/src/support/string.cpp
index 09924849b..75dd1716c 100644
--- a/src/support/string.cpp
+++ b/src/support/string.cpp
@@ -106,7 +106,7 @@ std::string trim(const std::string& input) {
return input.substr(0, size);
}
-std::ostream& printEscaped(std::ostream& os, std::string_view str) {
+std::ostream& printEscaped(std::ostream& os, const std::string_view str) {
os << '"';
for (unsigned char c : str) {
switch (c) {
@@ -140,4 +140,90 @@ std::ostream& printEscaped(std::ostream& os, std::string_view str) {
return os << '"';
}
+std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) { // Writes str, decoded as UTF-8, to os as a double-quoted JSON string literal and returns os.
+ os << '"';
+ for (size_t i = 0; i < str.size(); i++) { // i is also advanced inside the body for multi-byte sequences
+ int u0 = str[i]; // lead byte; may be negative for bytes >= 0x80 where plain char is signed
+ switch (u0) {
+ case '\t':
+ os << "\\t";
+ continue;
+ case '\n':
+ os << "\\n";
+ continue;
+ case '\r':
+ os << "\\r";
+ continue;
+ case '"':
+ os << "\\\"";
+ continue;
+ case '\'':
+ os << "'";
+ continue;
+ case '\\':
+ os << "\\\\";
+ continue;
+ default: {
+ // uEscape emits something like \u006e, the JSON escaping of a 16-bit number (only the low 16 bits of v are printed).
+ auto uEscape = [&](uint32_t v) {
+ if (v > 0xffff) { // warn only (the message reports u0, not v); the escape is still emitted below
+ std::cerr << "warning: Bad 16-bit escapee " << int(u0) << '\n';
+ }
+ os << std::hex; // print the four nibbles below as hex digits
+ os << "\\u";
+ os << ((v >> 12) & 0xf);
+ os << ((v >> 8) & 0xf);
+ os << ((v >> 4) & 0xf);
+ os << (v & 0xf);
+ os << std::dec; // NOTE: always switches os to decimal, whatever base it had on entry
+ };
+
+ // Based off of
+ // https://github.com/emscripten-core/emscripten/blob/59e6b8f1262d75585d8416b728e8cbb3db176fe2/src/library_strings.js#L72-L91
+ if (!(u0 & 0x80)) { // high bit clear: a single-byte (ASCII) character
+ if (u0 >= 32 && u0 < 127) {
+ // Printable ASCII requires no escaping at all.
+ os << char(u0);
+ } else {
+ uEscape(u0); // remaining control characters and 127 (DEL)
+ }
+ continue;
+ }
+
+ // This uses at least 2 bytes; take the payload bits of the first continuation byte.
+ i++; // NOTE(review): i is not checked against str.size(); a truncated sequence indexes past the end
+ int u1 = str[i] & 63; // low 6 bits; the 10xxxxxx marker bits are not verified
+ if ((u0 & 0xE0) == 0xC0) { // 110xxxxx: 2-byte sequence
+ uEscape((((u0 & 31) << 6) | u1));
+ continue;
+ }
+
+ // This uses at least 3 bytes; take the second continuation byte.
+ i++; // NOTE(review): unchecked advance, same concern as for the first continuation byte
+ int u2 = str[i] & 63;
+ if ((u0 & 0xF0) == 0xE0) { // 1110xxxx: 3-byte sequence
+ u0 = ((u0 & 15) << 12) | (u1 << 6) | u2; // u0 now holds the decoded code point
+ } else {
+ // Otherwise this is assumed to use 4 bytes; an unexpected lead byte only warns.
+ if ((u0 & 0xF8) != 0xF0) {
+ std::cerr << "warning: Bad UTF-8 leading byte " << int(u0) << '\n';
+ }
+ i++; // NOTE(review): unchecked advance to the third continuation byte
+ u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | (str[i] & 63); // u0 now holds the decoded code point
+ }
+
+ if (u0 < 0x10000) { // fits in a single 16-bit escape
+ uEscape(u0);
+ } else {
+ // This is one code point above 0xFFFF, so emit it as a UTF-16 surrogate pair (two escapes).
+ auto ch = u0 - 0x10000;
+ uEscape(0xD800 | (ch >> 10)); // high surrogate: top 10 bits of ch
+ uEscape(0xDC00 | (ch & 0x3FF)); // low surrogate: bottom 10 bits of ch
+ }
+ }
+ }
+ }
+ return os << '"';
+}
+
} // namespace wasm::String
diff --git a/src/support/string.h b/src/support/string.h
index 8ab0ae1a6..6fb3f693b 100644
--- a/src/support/string.h
+++ b/src/support/string.h
@@ -75,7 +75,9 @@ inline bool isNumber(const std::string& str) {
return !str.empty() && std::all_of(str.begin(), str.end(), ::isdigit);
}
-std::ostream& printEscaped(std::ostream& os, std::string_view str);
+std::ostream& printEscaped(std::ostream& os, const std::string_view str);
+
+std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str);
} // namespace wasm::String
diff --git a/test/lit/passes/string-lowering.js b/test/lit/passes/string-lowering.js
new file mode 100644
index 000000000..105970bcc
--- /dev/null
+++ b/test/lit/passes/string-lowering.js
@@ -0,0 +1,17 @@
+var filename = process.argv[2]; // path of the wasm binary to inspect, taken from the command line
+var module = new WebAssembly.Module(require('fs').readFileSync(filename));
+var sections = WebAssembly.Module.customSections(module, 'string.consts'); // contents of every custom section with this name
+var array = new Uint8Array(sections[0]); // only the first matching section is examined
+var string = new TextDecoder('utf-8').decode(array); // raw section text, expected to be a JSON array of strings
+
+function standardizeEncoding(s) {
+ // Different node.js versions print differently, so we must standardize to
+ // pass tests in all places. In particular at some point node.js started to
+ // abbreviate \u0000 as \x00 (both of which are valid).
+ return s.replace('\\u0000', '\\x00'); // NOTE: with a string pattern, replace() rewrites only the first occurrence
+}
+
+console.log("string:", standardizeEncoding(string)); // the escaped text exactly as stored in the section
+
+var json = JSON.stringify(JSON.parse(string)); // round-trip to show the section parses as JSON
+console.log("JSON:", standardizeEncoding(json));
diff --git a/test/lit/passes/string-lowering.wast b/test/lit/passes/string-lowering.wast
index 628092d1c..43d30fc64 100644
--- a/test/lit/passes/string-lowering.wast
+++ b/test/lit/passes/string-lowering.wast
@@ -2,8 +2,6 @@
;; operations are tested in string-gathering.wast (which is auto-updated, unlike
;; this which is manual).
-;; RUN: foreach %s %t wasm-opt --string-lowering -all -S -o - | filecheck %s
-
(module
(func $consts
(drop
@@ -15,9 +13,26 @@
(drop
(string.const "foo")
)
+ (drop
+ (string.const "needs\tescaping\00.'#%\"- .\r\n\\.ꙮ")
+ )
)
)
-;; The custom section should contain foo and bar, and foo only once.
-;; CHECK: custom section "string.consts", size 13, contents: "[\"bar\",\"foo\"]"
+;; The custom section should contain foo and bar, and foo only once, and the
+;; string with \t should be escaped.
+;;
+;; RUN: wasm-opt %s --string-lowering -all -S -o - | filecheck %s
+;;
+;; CHECK: custom section "string.consts", size 59, contents: "[\"bar\",\"foo\",\"needs\\tescaping\\u0000.'#%\\\"- .\\r\\n\\\\.\\ua66e\"]"
+
+;; The custom section should parse OK using JSON.parse from node.
+;; (Note we run --remove-unused-module-elements to remove externref-using
+;; imports, which require a newer version of node.)
+;;
+;; RUN: wasm-opt %s --string-lowering --remove-unused-module-elements -all -o %t.wasm
+;; RUN: node %S/string-lowering.js %t.wasm | filecheck %s --check-prefix=CHECK-JS
+;;
+;; CHECK-JS: string: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\.\ua66e"]
+;; CHECK-JS: JSON: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\.ꙮ"]