From b3fea30f84fef3ff7aa77775e00b83ba62d997cc Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Fri, 22 Mar 2024 16:56:33 -0700 Subject: [Strings] Represent string values as WTF-16 internally (#6418) WTF-16, i.e. arbitrary sequences of 16-bit values, is the encoding of Java and JavaScript strings, and using the same encoding makes the interpretation of string operations trivial, even when accounting for non-ASCII characters. Specifically, use little-endian WTF-16. Re-encode string constants from WTF-8 to WTF-16 in the parsers, then back to WTF-8 in the writers. Update the constructor for string `Literal`s to interpret the string as WTF-16 and store a sequence of WTF-16 code units, i.e. 16-bit integers. Update `Builder::makeConstantExpression` accordingly to convert from the new `Literal` string representation back to a WTF-16 string. Update the interpreter to remove the logic for detecting non-ASCII characters and bailing out. The naive implementations of all the string operations are correct now that our string encoding matches the JS string encoding. --- src/wasm-builder.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'src/wasm-builder.h') diff --git a/src/wasm-builder.h b/src/wasm-builder.h index 7d4991d9a..cc90a8abe 100644 --- a/src/wasm-builder.h +++ b/src/wasm-builder.h @@ -1265,12 +1265,17 @@ public: return makeRefI31(makeConst(value.geti31())); } if (type.isString()) { - // TODO: more than ascii support - std::string string; + // The string is already WTF-16, but we need to convert from `Literals` to + // actual string. + std::stringstream wtf16; for (auto c : value.getGCData()->values) { - string.push_back(c.getInteger()); + auto u = c.getInteger(); + assert(u < 0x10000); + wtf16 << uint8_t(u & 0xFF); + wtf16 << uint8_t(u >> 8); } - return makeStringConst(string); + // TODO: Use wtf16.view() once we have C++20. 
+ return makeStringConst(wtf16.str()); } if (type.isRef() && type.getHeapType() == HeapType::ext) { return makeRefAs(ExternExternalize, -- cgit v1.2.3