From b3fea30f84fef3ff7aa77775e00b83ba62d997cc Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Fri, 22 Mar 2024 16:56:33 -0700 Subject: [Strings] Represent string values as WTF-16 internally (#6418) WTF-16, i.e. arbitrary sequences of 16-bit values, is the encoding of Java and JavaScript strings, and using the same encoding makes the interpretation of string operations trivial, even when accounting for non-ascii characters. Specifically, use little-endian WTF-16. Re-encode string constants from WTF-8 to WTF-16 in the parsers, then back to WTF-8 in the writers. Update the constructor for string `Literal`s to interpret the string as WTF-16 and store a sequence of WTF-16 code units, i.e. 16-bit integers. Update `Builder::makeConstantExpression` accordingly to convert from the new `Literal` string representation back to a WTF-16 string. Update the interpreter to remove the logic for detecting non-ascii characters and bailing out. The naive implementations of all the string operations are correct now that our string encoding matches the JS string encoding. --- src/wasm/literal.cpp | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'src/wasm/literal.cpp') diff --git a/src/wasm/literal.cpp b/src/wasm/literal.cpp index 7c674ffc5..afdc14c72 100644 --- a/src/wasm/literal.cpp +++ b/src/wasm/literal.cpp @@ -23,6 +23,7 @@ #include "ir/bits.h" #include "pretty_printing.h" #include "support/bits.h" +#include "support/string.h" #include "support/utilities.h" namespace wasm { @@ -77,12 +78,15 @@ Literal::Literal(std::shared_ptr<GCData> gcData, HeapType type) (type.isBottom() && !gcData)); } -Literal::Literal(std::string string) +Literal::Literal(std::string_view string) : gcData(nullptr), type(Type(HeapType::string, NonNullable)) { // TODO: we could in theory internalize strings + // Extract individual WTF-16LE code units. 
Literals contents; - for (auto c : string) { - contents.push_back(Literal(int32_t(c))); + assert(string.size() % 2 == 0); + for (size_t i = 0; i < string.size(); i += 2) { + int32_t u = uint8_t(string[i]) | (uint8_t(string[i + 1]) << 8); + contents.push_back(Literal(u)); } gcData = std::make_shared<GCData>(HeapType::string, contents); } @@ -636,10 +640,19 @@ std::ostream& operator<<(std::ostream& o, Literal literal) { o << "nullstring"; } else { o << "string(\""; + // Convert WTF-16 literals to WTF-16 string. + std::stringstream wtf16; for (auto c : data->values) { - // TODO: more than ascii - o << char(c.getInteger()); + auto u = c.getInteger(); + assert(u < 0x10000); + wtf16 << uint8_t(u & 0xFF); + wtf16 << uint8_t(u >> 8); } + // Convert to WTF-8 for printing. + // TODO: Use wtf16.view() once we have C++20. + [[maybe_unused]] bool valid = + String::convertWTF16ToWTF8(o, wtf16.str()); + assert(valid); o << "\")"; } break; -- cgit v1.2.3