summaryrefslogtreecommitdiff
path: root/src/wasm-interpreter.h
diff options
context:
space:
mode:
author: Thomas Lively <tlively@google.com> 2024-03-22 16:56:33 -0700
committer: GitHub <noreply@github.com> 2024-03-22 23:56:33 +0000
commit: b3fea30f84fef3ff7aa77775e00b83ba62d997cc (patch)
tree: 53494a466d8e56d34d849d14927817a22f843748 /src/wasm-interpreter.h
parent: d3414c3deaebe7ba35731a8c20d7fa5f5a833ca3 (diff)
download: binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.tar.gz
binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.tar.bz2
binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.zip
[Strings] Represent string values as WTF-16 internally (#6418)
WTF-16, i.e. arbitrary sequences of 16-bit values, is the encoding of Java and JavaScript strings, and using the same encoding makes the interpretation of string operations trivial, even when accounting for non-ASCII characters. Specifically, use little-endian WTF-16. Re-encode string constants from WTF-8 to WTF-16 in the parsers, then back to WTF-8 in the writers. Update the constructor for string `Literal`s to interpret the string as WTF-16 and store a sequence of WTF-16 code units, i.e. 16-bit integers. Update `Builder::makeConstantExpression` accordingly to convert from the new `Literal` string representation back to a WTF-16 string. Update the interpreter to remove the logic for detecting non-ASCII characters and bailing out. The naive implementations of all the string operations are correct now that our string encoding matches the JS string encoding.
Diffstat (limited to 'src/wasm-interpreter.h')
-rw-r--r-- src/wasm-interpreter.h | 53
1 file changed, 4 insertions, 49 deletions
diff --git a/src/wasm-interpreter.h b/src/wasm-interpreter.h
index c8031f617..f34cb83be 100644
--- a/src/wasm-interpreter.h
+++ b/src/wasm-interpreter.h
@@ -1900,23 +1900,7 @@ public:
return Flow(NONCONSTANT_FLOW);
}
}
- Flow visitStringConst(StringConst* curr) {
- return Literal(curr->string.toString());
- }
-
- // Returns if there is a non-ascii character in a list of values, looking only
- // up to an index that is provided (not inclusive). If the index is not
- // provided we look in the entire list.
- bool hasNonAsciiUpTo(const Literals& values,
- std::optional<Index> maybeEnd = std::nullopt) {
- Index end = maybeEnd ? *maybeEnd : values.size();
- for (Index i = 0; i < end; ++i) {
- if (uint32_t(values[i].geti32()) > 127) {
- return true;
- }
- }
- return false;
- }
+ Flow visitStringConst(StringConst* curr) { return Literal(curr->string.str); }
Flow visitStringMeasure(StringMeasure* curr) {
// For now we only support JS-style strings.
@@ -1934,12 +1918,6 @@ public:
trap("null ref");
}
- // This is only correct if all the bytes stored in `values` correspond to
- // single unicode code points. See `visitStringWTF16Get` for details.
- if (hasNonAsciiUpTo(data->values)) {
- return Flow(NONCONSTANT_FLOW);
- }
-
return Literal(int32_t(data->values.size()));
}
Flow visitStringConcat(StringConcat* curr) {
@@ -1960,18 +1938,13 @@ public:
if (!leftData || !rightData) {
trap("null ref");
}
- // This is only correct if all the bytes in the left operand correspond
- // to single unicode code points.
- if (hasNonAsciiUpTo(leftData->values)) {
- return Flow(NONCONSTANT_FLOW);
- }
Literals contents;
contents.reserve(leftData->values.size() + rightData->values.size());
- for (Literal l : leftData->values) {
+ for (Literal& l : leftData->values) {
contents.push_back(l);
}
- for (Literal l : rightData->values) {
+ for (Literal& l : rightData->values) {
contents.push_back(l);
}
@@ -2011,11 +1984,6 @@ public:
trap("oob");
}
- // We don't handle non-ascii code points correctly yet.
- if (hasNonAsciiUpTo(refValues)) {
- return Flow(NONCONSTANT_FLOW);
- }
-
for (Index i = 0; i < refValues.size(); i++) {
ptrValues[startVal + i] = refValues[i];
}
@@ -2132,17 +2100,6 @@ public:
trap("string oob");
}
- // This naive indexing approach is only correct if the first `i` bytes
- // stored in `values` each corresponds to a single unicode code point. To
- // implement this correctly in general, we would have to reinterpret the
- // bytes as WTF-8, then count up to the `i`th code point, accounting
- // properly for code points that would be represented by surrogate pairs in
- // WTF-16. Alternatively, we could represent string contents as WTF-16 to
- // begin with.
- if (hasNonAsciiUpTo(values, i + 1)) {
- return Flow(NONCONSTANT_FLOW);
- }
-
return Literal(values[i].geti32());
}
Flow visitStringIterNext(StringIterNext* curr) {
@@ -2178,9 +2135,7 @@ public:
auto startVal = start.getSingleValue().getUnsigned();
auto endVal = end.getSingleValue().getUnsigned();
endVal = std::min<size_t>(endVal, refValues.size());
- if (hasNonAsciiUpTo(refValues, endVal)) {
- return Flow(NONCONSTANT_FLOW);
- }
+
Literals contents;
if (endVal > startVal) {
contents.reserve(endVal - startVal);