diff options
author | Thomas Lively <tlively@google.com> | 2024-03-22 16:56:33 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-22 23:56:33 +0000 |
commit | b3fea30f84fef3ff7aa77775e00b83ba62d997cc (patch) | |
tree | 53494a466d8e56d34d849d14927817a22f843748 /src/parser | |
parent | d3414c3deaebe7ba35731a8c20d7fa5f5a833ca3 (diff) | |
download | binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.tar.gz binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.tar.bz2 binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.zip |
[Strings] Represent string values as WTF-16 internally (#6418)
WTF-16, i.e. arbitrary sequences of 16-bit values, is the encoding of Java and
JavaScript strings, and using the same encoding makes the interpretation of
string operations trivial, even when accounting for non-ASCII characters.
Specifically, use little-endian WTF-16.
Re-encode string constants from WTF-8 to WTF-16 in the parsers, then back to
WTF-8 in the writers. Update the constructor for string `Literal`s to interpret
the string as WTF-16 and store a sequence of WTF-16 code units, i.e. 16-bit
integers. Update `Builder::makeConstantExpression` accordingly to convert from
the new `Literal` string representation back to a WTF-16 string.
Update the interpreter to remove the logic for detecting non-ASCII characters
and bailing out. The naive implementations of all the string operations are
correct now that our string encoding matches the JS string encoding.
Diffstat (limited to 'src/parser')
-rw-r--r-- | src/parser/contexts.h | 9 | ||||
-rw-r--r-- | src/parser/lexer.cpp | 21 |
2 files changed, 10 insertions, 20 deletions
diff --git a/src/parser/contexts.h b/src/parser/contexts.h index 8b59ab40b..0979461a0 100644 --- a/src/parser/contexts.h +++ b/src/parser/contexts.h @@ -22,6 +22,7 @@ #include "lexer.h" #include "support/name.h" #include "support/result.h" +#include "support/string.h" #include "wasm-builder.h" #include "wasm-ir-builder.h" #include "wasm.h" @@ -2491,7 +2492,13 @@ struct ParseDefsCtx : TypeParserCtx<ParseDefsCtx> { Result<> makeStringConst(Index pos, const std::vector<Annotation>& annotations, std::string_view str) { - return withLoc(pos, irBuilder.makeStringConst(Name(str))); + // Re-encode from WTF-8 to WTF-16. + std::stringstream wtf16; + if (!String::convertWTF8ToWTF16(wtf16, str)) { + return in.err(pos, "invalid string constant"); + } + // TODO: Use wtf16.view() once we have C++20. + return withLoc(pos, irBuilder.makeStringConst(wtf16.str())); } Result<> makeStringMeasure(Index pos, diff --git a/src/parser/lexer.cpp b/src/parser/lexer.cpp index 8c7542dd7..48da163e1 100644 --- a/src/parser/lexer.cpp +++ b/src/parser/lexer.cpp @@ -23,6 +23,7 @@ #include <variant> #include "lexer.h" +#include "support/string.h" using namespace std::string_view_literals; @@ -308,25 +309,7 @@ public: if ((0xd800 <= u && u < 0xe000) || 0x110000 <= u) { return false; } - if (u < 0x80) { - // 0xxxxxxx - *escapeBuilder << uint8_t(u); - } else if (u < 0x800) { - // 110xxxxx 10xxxxxx - *escapeBuilder << uint8_t(0b11000000 | ((u >> 6) & 0b00011111)); - *escapeBuilder << uint8_t(0b10000000 | ((u >> 0) & 0b00111111)); - } else if (u < 0x10000) { - // 1110xxxx 10xxxxxx 10xxxxxx - *escapeBuilder << uint8_t(0b11100000 | ((u >> 12) & 0b00001111)); - *escapeBuilder << uint8_t(0b10000000 | ((u >> 6) & 0b00111111)); - *escapeBuilder << uint8_t(0b10000000 | ((u >> 0) & 0b00111111)); - } else { - // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - *escapeBuilder << uint8_t(0b11110000 | ((u >> 18) & 0b00000111)); - *escapeBuilder << uint8_t(0b10000000 | ((u >> 12) & 0b00111111)); - *escapeBuilder << 
uint8_t(0b10000000 | ((u >> 6) & 0b00111111)); - *escapeBuilder << uint8_t(0b10000000 | ((u >> 0) & 0b00111111)); - } + String::writeWTF8CodePoint(*escapeBuilder, u); return true; } }; |