[Strings] Avoid mishandling unicode in interpreter (#6405)

Our interpreter implementations of `stringview_wtf16.length`, `stringview_wtf16.get_codeunit`, and `string.encode_wtf16_array` are not Unicode-aware, so they were previously incorrect for strings containing non-ASCII code points, which are stored as multiple bytes. As a fix, bail out of the interpretation if there is a non-ASCII code point that would make our naive implementation incorrect.
author: Thomas Lively <tlively@google.com> 2024-03-18 21:17:55 -0700
committer: GitHub <noreply@github.com> 2024-03-18 21:17:55 -0700
commit: 63db13bf0f0f5dcc76c45a22ff43c424fa54a011 (patch)
tree: 35d3255d02908372a01b954f4dce4d8e66efb8de
parent: bfb5ec04ddba295c9e1390314f6610a8bf7fefbe (diff)
download: binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.tar.gz
binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.tar.bz2
binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.zip
2 files changed, 135 insertions, 5 deletions
diff --git a/src/wasm-interpreter.h b/src/wasm-interpreter.h
index c35920802..5179d6ad0 100644
--- a/src/wasm-interpreter.h
+++ b/src/wasm-interpreter.h
@@ -1902,6 +1902,16 @@ public:
   Flow visitStringConst(StringConst* curr) {
     return Literal(curr->string.toString());
   }
+
+  bool hasNonAsciiUpTo(const Literals& values, Index end) {
+    // Skip leading ASCII (<= 127) values, examining at most `end` of them.
+    Index pos = 0;
+    while (pos < end && uint32_t(values[pos].geti32()) <= 127) {
+      ++pos;
+    }
+    return pos != end;
+  }
+
   Flow visitStringMeasure(StringMeasure* curr) {
     // For now we only support JS-style strings.
     if (curr->op != StringMeasureWTF16View) {
@@ -1917,6 +1927,13 @@ public:
     if (!data) {
       trap("null ref");
     }
+
+    // This is only correct if all the bytes stored in `values` correspond to
+    // single unicode code points. See `visitStringWTF16Get` for details.
+    if (hasNonAsciiUpTo(data->values, data->values.size())) {
+      return Flow(NONCONSTANT_FLOW);
+    }
+
     return Literal(int32_t(data->values.size()));
   }
   Flow visitStringConcat(StringConcat* curr) {
@@ -1980,6 +1997,11 @@ public:
       trap("oob");
     }
 
+    // We don't handle non-ascii code points correctly yet.
+    if (hasNonAsciiUpTo(refValues, refValues.size())) {
+      return Flow(NONCONSTANT_FLOW);
+    }
+
     for (Index i = 0; i < refValues.size(); i++) {
       ptrValues[startVal + i] = refValues[i];
     }
@@ -2095,6 +2117,18 @@ public:
     if (i >= values.size()) {
       trap("string oob");
     }
+
+    // This naive indexing approach is only correct if the first `i + 1` bytes
+    // stored in `values` each correspond to a single Unicode code point. To
+    // implement this correctly in general, we would have to reinterpret the
+    // bytes as WTF-8, then count up to the `i`th code point, accounting
+    // properly for code points that would be represented by surrogate pairs in
+    // WTF-16. Alternatively, we could represent string contents as WTF-16 to
+    // begin with.
+    if (hasNonAsciiUpTo(values, i + 1)) {
+      return Flow(NONCONSTANT_FLOW);
+    }
+
     return Literal(values[i].geti32());
   }
   Flow visitStringIterNext(StringIterNext* curr) {
diff --git a/test/lit/passes/precompute-strings.wast b/test/lit/passes/precompute-strings.wast
index f5b1660bc..aa138b289 100644
--- a/test/lit/passes/precompute-strings.wast
+++ b/test/lit/passes/precompute-strings.wast
@@ -1,12 +1,15 @@
 ;; NOTE: Assertions have been generated by update_lit_checks.py and should not be edited.
 
-;; RUN: wasm-opt %s --precompute --fuzz-exec -all -S -o - | filecheck %s
+;; RUN: wasm-opt %s --precompute -all -S -o - | filecheck %s
 
 (module
+ ;; CHECK:      (type $array16 (array (mut i16)))
+ (type $array16 (array (mut i16)))
+
  ;; CHECK:      (func $eq-no (type $0) (result i32)
  ;; CHECK-NEXT:  (i32.const 0)
  ;; CHECK-NEXT: )
- (func $eq-no (export "eq-no") (result i32)
+ (func $eq-no (result i32)
   (string.eq
    (string.const "ab")
    (string.const "cdefg")
@@ -16,7 +19,7 @@
  ;; CHECK:      (func $eq-yes (type $0) (result i32)
  ;; CHECK-NEXT:  (i32.const 1)
  ;; CHECK-NEXT: )
- (func $eq-yes (export "eq-yes") (result i32)
+ (func $eq-yes (result i32)
   (string.eq
    (string.const "ab")
    (string.const "ab")
@@ -26,11 +29,104 @@
  ;; CHECK:      (func $concat (type $0) (result i32)
  ;; CHECK-NEXT:  (i32.const 1)
  ;; CHECK-NEXT: )
- (func $concat (export "concat") (result i32)
+ (func $concat (result i32)
   (string.eq
    (string.concat (string.const "a") (string.const "b"))
    (string.const "ab")
   )
  )
-)
 
+ ;; CHECK:      (func $length (type $0) (result i32)
+ ;; CHECK-NEXT:  (i32.const 7)
+ ;; CHECK-NEXT: )
+ (func $length (result i32)
+  (stringview_wtf16.length
+   (string.as_wtf16
+    (string.const "1234567")
+   )
+  )
+ )
+
+ ;; CHECK:      (func $length-bad (type $0) (result i32)
+ ;; CHECK-NEXT:  (stringview_wtf16.length
+ ;; CHECK-NEXT:   (string.as_wtf16
+ ;; CHECK-NEXT:    (string.const "$_\c2\a3_\e2\82\ac_\f0\90\8d\88")
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT: )
+ (func $length-bad (result i32)
+  ;; Not precomputable because we don't handle unicode yet.
+  (stringview_wtf16.length
+   (string.as_wtf16
+    ;; $_£_€_𐍈
+    (string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88")
+   )
+  )
+ )
+
+ ;; CHECK:      (func $get_codepoint (type $0) (result i32)
+ ;; CHECK-NEXT:  (i32.const 95)
+ ;; CHECK-NEXT: )
+ (func $get_codepoint (result i32)
+  ;; This is computable because everything up to the requested index is ascii. Returns 95 ('_').
+  (stringview_wtf16.get_codeunit
+   (string.as_wtf16
+    ;; $_£_€_𐍈
+    (string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88")
+   )
+   (i32.const 1)
+  )
+ )
+
+ ;; CHECK:      (func $get_codepoint-bad (type $0) (result i32)
+ ;; CHECK-NEXT:  (stringview_wtf16.get_codeunit
+ ;; CHECK-NEXT:   (string.as_wtf16
+ ;; CHECK-NEXT:    (string.const "$_\c2\a3_\e2\82\ac_\f0\90\8d\88")
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:   (i32.const 2)
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT: )
+ (func $get_codepoint-bad (export "get_codepoint-bad") (result i32)
+  ;; This is not computable because the requested code unit is not ascii.
+  (stringview_wtf16.get_codeunit
+   (string.as_wtf16
+    ;; $_£_€_𐍈
+    (string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88")
+   )
+   (i32.const 2)
+  )
+ )
+
+ ;; CHECK:      (func $encode (type $0) (result i32)
+ ;; CHECK-NEXT:  (i32.const 2)
+ ;; CHECK-NEXT: )
+ (func $encode (result i32)
+  (string.encode_wtf16_array
+   (string.const "$_")
+   (array.new_default $array16
+    (i32.const 20)
+   )
+   (i32.const 0)
+  )
+ )
+
+ ;; CHECK:      (func $encode-bad (type $0) (result i32)
+ ;; CHECK-NEXT:  (string.encode_wtf16_array
+ ;; CHECK-NEXT:   (string.const "$_\c2\a3_\e2\82\ac_\f0\90\8d\88")
+ ;; CHECK-NEXT:   (array.new_default $array16
+ ;; CHECK-NEXT:    (i32.const 20)
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:   (i32.const 0)
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT: )
+ (func $encode-bad (result i32)
+  (string.encode_wtf16_array
+   ;; $_£_€_𐍈
+   (string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88")
+   (array.new_default $array16
+    (i32.const 20)
+   )
+   (i32.const 0)
+  )
+ )
+)
author	Thomas Lively <tlively@google.com>	2024-03-18 21:17:55 -0700
committer	GitHub <noreply@github.com>	2024-03-18 21:17:55 -0700
commit	63db13bf0f0f5dcc76c45a22ff43c424fa54a011 (patch)
tree	35d3255d02908372a01b954f4dce4d8e66efb8de
parent	bfb5ec04ddba295c9e1390314f6610a8bf7fefbe (diff)
download	binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.tar.gz binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.tar.bz2 binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.zip