diff options
author | Alon Zakai <azakai@google.com> | 2024-02-20 13:23:10 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-20 13:23:10 -0800 |
commit | 07b91a81c169091b2464e6d587666ad3f0124a1e (patch) | |
tree | cc0df6add42ef4d9f30ea7db6a4171ca8461d18d | |
parent | d0fa7102b5be83be6d0670a5662013bf5f8d1b64 (diff) | |
download | binaryen-07b91a81c169091b2464e6d587666ad3f0124a1e.tar.gz binaryen-07b91a81c169091b2464e6d587666ad3f0124a1e.tar.bz2 binaryen-07b91a81c169091b2464e6d587666ad3f0124a1e.zip |
StringLowering: Escape the JSON in the custom section (#6316)
Also add an end-to-end test using node to verify we can parse the escaped
content properly using TextDecoder+JSON.parse.
-rw-r--r-- | src/passes/StringLowering.cpp | 20 | ||||
-rw-r--r-- | src/support/json.cpp | 4 | ||||
-rw-r--r-- | src/support/string.cpp | 88 | ||||
-rw-r--r-- | src/support/string.h | 4 | ||||
-rw-r--r-- | test/lit/passes/string-lowering.js | 17 | ||||
-rw-r--r-- | test/lit/passes/string-lowering.wast | 23 |
6 files changed, 140 insertions, 16 deletions
diff --git a/src/passes/StringLowering.cpp b/src/passes/StringLowering.cpp index d92a151c7..c70691ea1 100644 --- a/src/passes/StringLowering.cpp +++ b/src/passes/StringLowering.cpp @@ -37,7 +37,7 @@ #include "ir/type-updating.h" #include "ir/utils.h" #include "pass.h" -#include "support/json.h" +#include "support/string.h" #include "wasm-builder.h" #include "wasm.h" @@ -205,8 +205,9 @@ struct StringLowering : public StringGathering { void makeImports(Module* module) { Index importIndex = 0; - json::Value stringArray; - stringArray.setArray(); + std::stringstream json; + json << '['; + bool first = true; std::vector<Name> importedStrings; for (auto& global : module->globals) { if (global->init) { @@ -216,16 +217,19 @@ struct StringLowering : public StringGathering { importIndex++; global->init = nullptr; - auto str = json::Value::make(std::string(c->string.str).c_str()); - stringArray.push_back(str); + if (first) { + first = false; + } else { + json << ','; + } + String::printEscapedJSON(json, c->string.str); } } } // Add a custom section with the JSON. - std::stringstream stream; - stringArray.stringify(stream); - auto str = stream.str(); + json << ']'; + auto str = json.str(); auto vec = std::vector<char>(str.begin(), str.end()); module->customSections.emplace_back( CustomSection{"string.consts", std::move(vec)}); diff --git a/src/support/json.cpp b/src/support/json.cpp index d43ac0323..ab55cc75f 100644 --- a/src/support/json.cpp +++ b/src/support/json.cpp @@ -15,13 +15,13 @@ */ #include "support/json.h" +#include "support/string.h" namespace json { void Value::stringify(std::ostream& os, bool pretty) { if (isString()) { - // TODO: escaping - os << '"' << getCString() << '"'; + wasm::String::printEscapedJSON(os, getCString()); } else if (isArray()) { os << '['; auto first = true; diff --git a/src/support/string.cpp b/src/support/string.cpp index 09924849b..75dd1716c 100644 --- a/src/support/string.cpp +++ b/src/support/string.cpp @@ -106,7 +106,7 @@ std::string trim(const std::string& input) { return input.substr(0, size); } -std::ostream& printEscaped(std::ostream& os, std::string_view str) { +std::ostream& printEscaped(std::ostream& os, const std::string_view str) { os << '"'; for (unsigned char c : str) { switch (c) { @@ -140,4 +140,90 @@ std::ostream& printEscaped(std::ostream& os, std::string_view str) { return os << '"'; } +std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) { + os << '"'; + for (size_t i = 0; i < str.size(); i++) { + int u0 = str[i]; + switch (u0) { + case '\t': + os << "\\t"; + continue; + case '\n': + os << "\\n"; + continue; + case '\r': + os << "\\r"; + continue; + case '"': + os << "\\\""; + continue; + case '\'': + os << "'"; + continue; + case '\\': + os << "\\\\"; + continue; + default: { + // Emit something like \u006e, the JSON escaping of a 16-bit number. + auto uEscape = [&](uint32_t v) { + if (v > 0xffff) { + std::cerr << "warning: Bad 16-bit escapee " << int(u0) << '\n'; + } + os << std::hex; + os << "\\u"; + os << ((v >> 12) & 0xf); + os << ((v >> 8) & 0xf); + os << ((v >> 4) & 0xf); + os << (v & 0xf); + os << std::dec; + }; + + // Based off of + // https://github.com/emscripten-core/emscripten/blob/59e6b8f1262d75585d8416b728e8cbb3db176fe2/src/library_strings.js#L72-L91 + if (!(u0 & 0x80)) { + if (u0 >= 32 && u0 < 127) { + // This requires no escaping at all. + os << char(u0); + } else { + uEscape(u0); + } + continue; + } + + // This uses 2 bytes. + i++; + int u1 = str[i] & 63; + if ((u0 & 0xE0) == 0xC0) { + uEscape((((u0 & 31) << 6) | u1)); + continue; + } + + // This uses 3 bytes. + i++; + int u2 = str[i] & 63; + if ((u0 & 0xF0) == 0xE0) { + u0 = ((u0 & 15) << 12) | (u1 << 6) | u2; + } else { + // This uses 4 bytes. + if ((u0 & 0xF8) != 0xF0) { + std::cerr << "warning: Bad UTF-8 leading byte " << int(u0) << '\n'; + } + i++; + u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | (str[i] & 63); + } + + if (u0 < 0x10000) { + uEscape(u0); + } else { + // There are two separate code points here. + auto ch = u0 - 0x10000; + uEscape(0xD800 | (ch >> 10)); + uEscape(0xDC00 | (ch & 0x3FF)); + } + } + } + } + return os << '"'; +} + } // namespace wasm::String diff --git a/src/support/string.h b/src/support/string.h index 8ab0ae1a6..6fb3f693b 100644 --- a/src/support/string.h +++ b/src/support/string.h @@ -75,7 +75,9 @@ inline bool isNumber(const std::string& str) { return !str.empty() && std::all_of(str.begin(), str.end(), ::isdigit); } -std::ostream& printEscaped(std::ostream& os, std::string_view str); +std::ostream& printEscaped(std::ostream& os, const std::string_view str); + +std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str); } // namespace wasm::String diff --git a/test/lit/passes/string-lowering.js b/test/lit/passes/string-lowering.js new file mode 100644 index 000000000..105970bcc --- /dev/null +++ b/test/lit/passes/string-lowering.js @@ -0,0 +1,17 @@ +var filename = process.argv[2]; +var module = new WebAssembly.Module(require('fs').readFileSync(filename)); +var sections = WebAssembly.Module.customSections(module, 'string.consts'); +var array = new Uint8Array(sections[0]); +var string = new TextDecoder('utf-8').decode(array); + +function standardizeEncoding(s) { + // Different node.js versions print differently, so we must standardize to + // pass tests in all places. In particular at some point node.js started to + // abbreviate \u0000 as \x00 (both of which are valid). + return s.replace('\\u0000', '\\x00'); +} + +console.log("string:", standardizeEncoding(string)); + +var json = JSON.stringify(JSON.parse(string)); +console.log("JSON:", standardizeEncoding(json)); diff --git a/test/lit/passes/string-lowering.wast b/test/lit/passes/string-lowering.wast index 628092d1c..43d30fc64 100644 --- a/test/lit/passes/string-lowering.wast +++ b/test/lit/passes/string-lowering.wast @@ -2,8 +2,6 @@ ;; operations are tested in string-gathering.wast (which is auto-updated, unlike ;; this which is manual). -;; RUN: foreach %s %t wasm-opt --string-lowering -all -S -o - | filecheck %s - (module (func $consts (drop @@ -15,9 +13,26 @@ (drop (string.const "foo") ) + (drop + (string.const "needs\tescaping\00.'#%\"- .\r\n\\.ꙮ") + ) ) ) -;; The custom section should contain foo and bar, and foo only once. -;; CHECK: custom section "string.consts", size 13, contents: "[\"bar\",\"foo\"]" +;; The custom section should contain foo and bar, and foo only once, and the +;; string with \t should be escaped. +;; +;; RUN: wasm-opt %s --string-lowering -all -S -o - | filecheck %s +;; +;; CHECK: custom section "string.consts", size 59, contents: "[\"bar\",\"foo\",\"needs\\tescaping\\u0000.'#%\\\"- .\\r\\n\\\\.\\ua66e\"]" + +;; The custom section should parse OK using JSON.parse from node. +;; (Note we run --remove-unused-module-elements to remove externref-using +;; imports, which require a newer version of node.) +;; +;; RUN: wasm-opt %s --string-lowering --remove-unused-module-elements -all -o %t.wasm +;; RUN: node %S/string-lowering.js %t.wasm | filecheck %s --check-prefix=CHECK-JS +;; +;; CHECK-JS: string: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\.\ua66e"] +;; CHECK-JS: JSON: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\.ꙮ"] |