[Strings] Avoid mishandling unicode in interpreter (#6405)

Our interpreter implementations of `stringview_wtf16.length`, `stringview_wtf16.get_codeunit`, and `string.encode_wtf16_array` are not Unicode-aware, so they were previously incorrect in the face of multi-byte code points. As a fix, bail out of the interpretation if there is a non-ASCII code point that would make our naive implementation incorrect.
author: Thomas Lively <tlively@google.com> 2024-03-18 21:17:55 -0700
committer: GitHub <noreply@github.com> 2024-03-18 21:17:55 -0700
commit: 63db13bf0f0f5dcc76c45a22ff43c424fa54a011 (patch)
tree: 35d3255d02908372a01b954f4dce4d8e66efb8de /src/wasm-interpreter.h
parent: bfb5ec04ddba295c9e1390314f6610a8bf7fefbe (diff)
download: binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.tar.gz
binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.tar.bz2
binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.zip
1 files changed, 34 insertions, 0 deletions
diff --git a/src/wasm-interpreter.h b/src/wasm-interpreter.h
index c35920802..5179d6ad0 100644
--- a/src/wasm-interpreter.h
+++ b/src/wasm-interpreter.h
@@ -1902,6 +1902,16 @@ public:
   Flow visitStringConst(StringConst* curr) {
     return Literal(curr->string.toString());
   }
+
+  // Whether any of the first `end` entries of `values` exceeds 127.
+  bool hasNonAsciiUpTo(const Literals& values, Index end) {
+    Index pos = 0;
+    while (pos < end && uint32_t(values[pos].geti32()) <= 127) {
+      ++pos;
+    }
+    return pos < end;
+  }
+
   Flow visitStringMeasure(StringMeasure* curr) {
     // For now we only support JS-style strings.
     if (curr->op != StringMeasureWTF16View) {
@@ -1917,6 +1927,13 @@ public:
     if (!data) {
       trap("null ref");
     }
+
+    // This is only correct if all the bytes stored in `values` correspond to
+    // single unicode code points. See `visitStringWTF16Get` for details.
+    if (hasNonAsciiUpTo(data->values, data->values.size())) {
+      return Flow(NONCONSTANT_FLOW);
+    }
+
     return Literal(int32_t(data->values.size()));
   }
   Flow visitStringConcat(StringConcat* curr) {
@@ -1980,6 +1997,11 @@ public:
       trap("oob");
     }
 
+    // We don't handle non-ascii code points correctly yet.
+    if (hasNonAsciiUpTo(refValues, refValues.size())) {
+      return Flow(NONCONSTANT_FLOW);
+    }
+
     for (Index i = 0; i < refValues.size(); i++) {
       ptrValues[startVal + i] = refValues[i];
     }
@@ -2095,6 +2117,18 @@ public:
     if (i >= values.size()) {
       trap("string oob");
     }
+
+    // This naive indexing approach is only correct if the first `i + 1` bytes
+    // stored in `values` each correspond to a single unicode code point. To
+    // implement this correctly in general, we would have to reinterpret the
+    // bytes as WTF-8, then count up to the `i`th code point, accounting
+    // properly for code points that would be represented by surrogate pairs in
+    // WTF-16. Alternatively, we could represent string contents as WTF-16 to
+    // begin with.
+    if (hasNonAsciiUpTo(values, i + 1)) {
+      return Flow(NONCONSTANT_FLOW);
+    }
+
     return Literal(values[i].geti32());
   }
   Flow visitStringIterNext(StringIterNext* curr) {
author	Thomas Lively <tlively@google.com>	2024-03-18 21:17:55 -0700
committer	GitHub <noreply@github.com>	2024-03-18 21:17:55 -0700
commit	63db13bf0f0f5dcc76c45a22ff43c424fa54a011 (patch)
tree	35d3255d02908372a01b954f4dce4d8e66efb8de /src/wasm-interpreter.h
parent	bfb5ec04ddba295c9e1390314f6610a8bf7fefbe (diff)
download	binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.tar.gz binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.tar.bz2 binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.zip