From b3fea30f84fef3ff7aa77775e00b83ba62d997cc Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Fri, 22 Mar 2024 16:56:33 -0700 Subject: [Strings] Represent string values as WTF-16 internally (#6418) WTF-16, i.e. arbitrary sequences of 16-bit values, is the encoding of Java and JavaScript strings, and using the same encoding makes the interpretation of string operations trivial, even when accounting for non-ascii characters. Specifically, use little-endian WTF-16. Re-encode string constants from WTF-8 to WTF-16 in the parsers, then back to WTF-8 in the writers. Update the constructor for string `Literal`s to interpret the string as WTF-16 and store a sequence of WTF-16 code units, i.e. 16-bit integers. Update `Builder::makeConstantExpression` accordingly to convert from the new `Literal` string representation back to a WTF-16 string. Update the interpreter to remove the logic for detecting non-ascii characters and bailing out. The naive implementations of all the string operations are correct now that our string encoding matches the JS string encoding. --- src/support/string.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'src/support/string.h') diff --git a/src/support/string.h b/src/support/string.h index 6fb3f693b..be2c3c6a3 100644 --- a/src/support/string.h +++ b/src/support/string.h @@ -75,9 +75,24 @@ inline bool isNumber(const std::string& str) { return !str.empty() && std::all_of(str.begin(), str.end(), ::isdigit); } -std::ostream& printEscaped(std::ostream& os, const std::string_view str); +std::ostream& printEscaped(std::ostream& os, std::string_view str); -std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str); +// `str` must be a valid WTF-16 string. 
+std::ostream& printEscapedJSON(std::ostream& os, std::string_view str); + +std::ostream& writeWTF8CodePoint(std::ostream& os, uint32_t u); + +std::ostream& writeWTF16CodePoint(std::ostream& os, uint32_t u); + +// Writes the WTF-16LE encoding of the given WTF-8 string to `os`, inserting +// replacement characters as necessary when encountering invalid WTF-8. Returns +// `true` iff the input was valid WTF-8. +bool convertWTF8ToWTF16(std::ostream& os, std::string_view str); + +// Writes the WTF-8 encoding of the given WTF-16LE string to `os`, inserting a +// replacement character at the end if the string has an odd number of bytes. +// Returns `true` iff the input was valid WTF-16. +bool convertWTF16ToWTF8(std::ostream& os, std::string_view str); } // namespace wasm::String -- cgit v1.2.3