diff options
author | Thomas Lively <tlively@google.com> | 2024-03-22 16:56:33 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-22 23:56:33 +0000 |
commit | b3fea30f84fef3ff7aa77775e00b83ba62d997cc (patch) | |
tree | 53494a466d8e56d34d849d14927817a22f843748 /src/wasm-interpreter.h | |
parent | d3414c3deaebe7ba35731a8c20d7fa5f5a833ca3 (diff) | |
download | binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.tar.gz binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.tar.bz2 binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.zip |
[Strings] Represent string values as WTF-16 internally (#6418)
WTF-16, i.e. arbitrary sequences of 16-bit values, is the encoding of Java and
JavaScript strings, and using the same encoding makes the interpretation of
string operations trivial, even when accounting for non-ASCII characters.
Specifically, use little-endian WTF-16.
Re-encode string constants from WTF-8 to WTF-16 in the parsers, then back to
WTF-8 in the writers. Update the constructor for string `Literal`s to interpret
the string as WTF-16 and store a sequence of WTF-16 code units, i.e. 16-bit
integers. Update `Builder::makeConstantExpression` accordingly to convert from
the new `Literal` string representation back to a WTF-16 string.
Update the interpreter to remove the logic for detecting non-ASCII characters
and bailing out. The naive implementations of all the string operations are
correct now that our string encoding matches the JS string encoding.
Diffstat (limited to 'src/wasm-interpreter.h')
-rw-r--r-- | src/wasm-interpreter.h | 53 |
1 file changed, 4 insertions, 49 deletions
```diff
diff --git a/src/wasm-interpreter.h b/src/wasm-interpreter.h
index c8031f617..f34cb83be 100644
--- a/src/wasm-interpreter.h
+++ b/src/wasm-interpreter.h
@@ -1900,23 +1900,7 @@ public:
         return Flow(NONCONSTANT_FLOW);
     }
   }
-  Flow visitStringConst(StringConst* curr) {
-    return Literal(curr->string.toString());
-  }
-
-  // Returns if there is a non-ascii character in a list of values, looking only
-  // up to an index that is provided (not inclusive). If the index is not
-  // provided we look in the entire list.
-  bool hasNonAsciiUpTo(const Literals& values,
-                       std::optional<Index> maybeEnd = std::nullopt) {
-    Index end = maybeEnd ? *maybeEnd : values.size();
-    for (Index i = 0; i < end; ++i) {
-      if (uint32_t(values[i].geti32()) > 127) {
-        return true;
-      }
-    }
-    return false;
-  }
+  Flow visitStringConst(StringConst* curr) { return Literal(curr->string.str); }
 
   Flow visitStringMeasure(StringMeasure* curr) {
     // For now we only support JS-style strings.
@@ -1934,12 +1918,6 @@ public:
       trap("null ref");
     }
 
-    // This is only correct if all the bytes stored in `values` correspond to
-    // single unicode code points. See `visitStringWTF16Get` for details.
-    if (hasNonAsciiUpTo(data->values)) {
-      return Flow(NONCONSTANT_FLOW);
-    }
-
     return Literal(int32_t(data->values.size()));
   }
   Flow visitStringConcat(StringConcat* curr) {
@@ -1960,18 +1938,13 @@ public:
     if (!leftData || !rightData) {
       trap("null ref");
     }
-    // This is only correct if all the bytes in the left operand correspond
-    // to single unicode code points.
-    if (hasNonAsciiUpTo(leftData->values)) {
-      return Flow(NONCONSTANT_FLOW);
-    }
 
     Literals contents;
     contents.reserve(leftData->values.size() + rightData->values.size());
-    for (Literal l : leftData->values) {
+    for (Literal& l : leftData->values) {
       contents.push_back(l);
     }
-    for (Literal l : rightData->values) {
+    for (Literal& l : rightData->values) {
       contents.push_back(l);
     }
 
@@ -2011,11 +1984,6 @@ public:
       trap("oob");
     }
 
-    // We don't handle non-ascii code points correctly yet.
-    if (hasNonAsciiUpTo(refValues)) {
-      return Flow(NONCONSTANT_FLOW);
-    }
-
     for (Index i = 0; i < refValues.size(); i++) {
       ptrValues[startVal + i] = refValues[i];
     }
@@ -2132,17 +2100,6 @@ public:
       trap("string oob");
     }
 
-    // This naive indexing approach is only correct if the first `i` bytes
-    // stored in `values` each corresponds to a single unicode code point. To
-    // implement this correctly in general, we would have to reinterpret the
-    // bytes as WTF-8, then count up to the `i`th code point, accounting
-    // properly for code points that would be represented by surrogate pairs in
-    // WTF-16. Alternatively, we could represent string contents as WTF-16 to
-    // begin with.
-    if (hasNonAsciiUpTo(values, i + 1)) {
-      return Flow(NONCONSTANT_FLOW);
-    }
-
     return Literal(values[i].geti32());
   }
   Flow visitStringIterNext(StringIterNext* curr) {
@@ -2178,9 +2135,7 @@ public:
     auto startVal = start.getSingleValue().getUnsigned();
     auto endVal = end.getSingleValue().getUnsigned();
     endVal = std::min<size_t>(endVal, refValues.size());
-    if (hasNonAsciiUpTo(refValues, endVal)) {
-      return Flow(NONCONSTANT_FLOW);
-    }
+
     Literals contents;
     if (endVal > startVal) {
       contents.reserve(endVal - startVal);
```