diff options
author | Thomas Lively <tlively@google.com> | 2024-03-18 21:17:55 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-18 21:17:55 -0700 |
commit | 63db13bf0f0f5dcc76c45a22ff43c424fa54a011 (patch) | |
tree | 35d3255d02908372a01b954f4dce4d8e66efb8de /src/wasm-interpreter.h | |
parent | bfb5ec04ddba295c9e1390314f6610a8bf7fefbe (diff) | |
download | binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.tar.gz binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.tar.bz2 binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.zip |
[Strings] Avoid mishandling unicode in interpreter (#6405)
Our interpreter implementations of `stringview_wtf16.length`,
`stringview_wtf16.get_codeunit`, and `string.encode_wtf16_array` are not
Unicode-aware, so they were previously incorrect in the face of multi-byte code
points. As a fix, bail out of the interpretation if there is a non-ASCII code
point that would make our naive implementation incorrect.
Diffstat (limited to 'src/wasm-interpreter.h')
-rw-r--r-- | src/wasm-interpreter.h | 34 |
1 files changed, 34 insertions, 0 deletions
diff --git a/src/wasm-interpreter.h b/src/wasm-interpreter.h
index c35920802..5179d6ad0 100644
--- a/src/wasm-interpreter.h
+++ b/src/wasm-interpreter.h
@@ -1902,6 +1902,16 @@ public:
   Flow visitStringConst(StringConst* curr) {
     return Literal(curr->string.toString());
   }
+
+  bool hasNonAsciiUpTo(const Literals& values, Index end) {
+    for (Index i = 0; i < end; ++i) {
+      if (uint32_t(values[i].geti32()) > 127) {
+        return true;
+      }
+    }
+    return false;
+  }
+
   Flow visitStringMeasure(StringMeasure* curr) {
     // For now we only support JS-style strings.
     if (curr->op != StringMeasureWTF16View) {
@@ -1917,6 +1927,13 @@ public:
     if (!data) {
       trap("null ref");
     }
+
+    // This is only correct if all the bytes stored in `values` correspond to
+    // single unicode code points. See `visitStringWTF16Get` for details.
+    if (hasNonAsciiUpTo(data->values, data->values.size())) {
+      return Flow(NONCONSTANT_FLOW);
+    }
+
     return Literal(int32_t(data->values.size()));
   }
   Flow visitStringConcat(StringConcat* curr) {
@@ -1980,6 +1997,11 @@ public:
       trap("oob");
     }
 
+    // We don't handle non-ascii code points correctly yet.
+    if (hasNonAsciiUpTo(refValues, refValues.size())) {
+      return Flow(NONCONSTANT_FLOW);
+    }
+
     for (Index i = 0; i < refValues.size(); i++) {
       ptrValues[startVal + i] = refValues[i];
     }
@@ -2095,6 +2117,18 @@ public:
     if (i >= values.size()) {
       trap("string oob");
     }
+
+    // This naive indexing approach is only correct if the first `i` bytes
+    // stored in `values` each corresponds to a single unicode code point. To
+    // implement this correctly in general, we would have to reinterpret the
+    // bytes as WTF-8, then count up to the `i`th code point, accounting
+    // properly for code points that would be represented by surrogate pairs in
+    // WTF-16. Alternatively, we could represent string contents as WTF-16 to
+    // begin with.
+    if (hasNonAsciiUpTo(values, i + 1)) {
+      return Flow(NONCONSTANT_FLOW);
+    }
+
     return Literal(values[i].geti32());
   }
   Flow visitStringIterNext(StringIterNext* curr) {