summaryrefslogtreecommitdiff
path: root/src/wasm-interpreter.h
diff options
context:
space:
mode:
author: Thomas Lively <tlively@google.com> 2024-03-18 21:17:55 -0700
committer: GitHub <noreply@github.com> 2024-03-18 21:17:55 -0700
commit: 63db13bf0f0f5dcc76c45a22ff43c424fa54a011 (patch)
tree: 35d3255d02908372a01b954f4dce4d8e66efb8de /src/wasm-interpreter.h
parent: bfb5ec04ddba295c9e1390314f6610a8bf7fefbe (diff)
downloadbinaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.tar.gz
binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.tar.bz2
binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.zip
[Strings] Avoid mishandling unicode in interpreter (#6405)
Our interpreter implementations of `stringview_wtf16.length`, `stringview_wtf16.get_codeunit`, and `string.encode_wtf16_array` are not Unicode-aware, so they were previously incorrect for strings containing code points that are encoded as multiple bytes. As a fix, bail out of the interpretation if there is a non-ASCII code point that would make our naive implementation incorrect.
Diffstat (limited to 'src/wasm-interpreter.h')
-rw-r--r--src/wasm-interpreter.h34
1 files changed, 34 insertions, 0 deletions
diff --git a/src/wasm-interpreter.h b/src/wasm-interpreter.h
index c35920802..5179d6ad0 100644
--- a/src/wasm-interpreter.h
+++ b/src/wasm-interpreter.h
@@ -1902,6 +1902,16 @@ public:
Flow visitStringConst(StringConst* curr) {
return Literal(curr->string.toString());
}
+
+ bool hasNonAsciiUpTo(const Literals& values, Index end) {
+ for (Index i = 0; i < end; ++i) {
+ if (uint32_t(values[i].geti32()) > 127) {
+ return true;
+ }
+ }
+ return false;
+ }
+
Flow visitStringMeasure(StringMeasure* curr) {
// For now we only support JS-style strings.
if (curr->op != StringMeasureWTF16View) {
@@ -1917,6 +1927,13 @@ public:
if (!data) {
trap("null ref");
}
+
+ // This is only correct if every byte stored in `values` corresponds to a
+ // single Unicode code point. See `visitStringWTF16Get` for details.
+ if (hasNonAsciiUpTo(data->values, data->values.size())) {
+ return Flow(NONCONSTANT_FLOW);
+ }
+
return Literal(int32_t(data->values.size()));
}
Flow visitStringConcat(StringConcat* curr) {
@@ -1980,6 +1997,11 @@ public:
trap("oob");
}
+ // We don't handle non-ASCII code points correctly yet.
+ if (hasNonAsciiUpTo(refValues, refValues.size())) {
+ return Flow(NONCONSTANT_FLOW);
+ }
+
for (Index i = 0; i < refValues.size(); i++) {
ptrValues[startVal + i] = refValues[i];
}
@@ -2095,6 +2117,18 @@ public:
if (i >= values.size()) {
trap("string oob");
}
+
+ // This naive indexing approach is only correct if each of the bytes stored
+ // in `values` up to and including index `i` corresponds to a single
+ // Unicode code point. To implement this correctly in general, we would
+ // have to reinterpret the bytes as WTF-8, then count up to the `i`th code
+ // point, accounting properly for code points that would be represented by
+ // surrogate pairs in WTF-16. Alternatively, we could represent string
+ // contents as WTF-16 to begin with.
+ if (hasNonAsciiUpTo(values, i + 1)) {
+ return Flow(NONCONSTANT_FLOW);
+ }
+
return Literal(values[i].geti32());
}
Flow visitStringIterNext(StringIterNext* curr) {