summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Thomas Lively <tlively@google.com> 2024-03-18 21:17:55 -0700
committer GitHub <noreply@github.com> 2024-03-18 21:17:55 -0700
commit 63db13bf0f0f5dcc76c45a22ff43c424fa54a011 (patch)
tree 35d3255d02908372a01b954f4dce4d8e66efb8de
parent bfb5ec04ddba295c9e1390314f6610a8bf7fefbe (diff)
download binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.tar.gz
binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.tar.bz2
binaryen-63db13bf0f0f5dcc76c45a22ff43c424fa54a011.zip
[Strings] Avoid mishandling unicode in interpreter (#6405)
Our interpreter implementations of `stringview_wtf16.length`, `stringview_wtf16.get_codeunit`, and `string.encode_wtf16_array` are not Unicode-aware: they treat each stored byte as one WTF-16 code unit, so they were previously incorrect for strings containing non-ASCII code points, which occupy more than one byte in the interpreter's string representation. As a fix, bail out of the interpretation whenever a non-ASCII byte would make our naive implementation incorrect.
-rw-r--r-- src/wasm-interpreter.h 34
-rw-r--r-- test/lit/passes/precompute-strings.wast 106
2 files changed, 135 insertions, 5 deletions
diff --git a/src/wasm-interpreter.h b/src/wasm-interpreter.h
index c35920802..5179d6ad0 100644
--- a/src/wasm-interpreter.h
+++ b/src/wasm-interpreter.h
@@ -1902,6 +1902,16 @@ public:
Flow visitStringConst(StringConst* curr) {
return Literal(curr->string.toString());
}
+
+ bool hasNonAsciiUpTo(const Literals& values, Index end) {
+ for (Index i = 0; i < end; ++i) {
+ if (uint32_t(values[i].geti32()) > 127) {
+ return true;
+ }
+ }
+ return false;
+ }
+
Flow visitStringMeasure(StringMeasure* curr) {
// For now we only support JS-style strings.
if (curr->op != StringMeasureWTF16View) {
@@ -1917,6 +1927,13 @@ public:
if (!data) {
trap("null ref");
}
+
+ // This is only correct if all the bytes stored in `values` correspond to
+ // single unicode code points. See `visitStringWTF16Get` for details.
+ if (hasNonAsciiUpTo(data->values, data->values.size())) {
+ return Flow(NONCONSTANT_FLOW);
+ }
+
return Literal(int32_t(data->values.size()));
}
Flow visitStringConcat(StringConcat* curr) {
@@ -1980,6 +1997,11 @@ public:
trap("oob");
}
+ // We don't handle non-ascii code points correctly yet.
+ if (hasNonAsciiUpTo(refValues, refValues.size())) {
+ return Flow(NONCONSTANT_FLOW);
+ }
+
for (Index i = 0; i < refValues.size(); i++) {
ptrValues[startVal + i] = refValues[i];
}
@@ -2095,6 +2117,18 @@ public:
if (i >= values.size()) {
trap("string oob");
}
+
+ // This naive indexing approach is only correct if the first `i + 1` bytes
+ // stored in `values` each correspond to a single unicode code point. To
+ // implement this correctly in general, we would have to reinterpret the
+ // bytes as WTF-8, then count up to the `i`th code point, accounting
+ // properly for code points that would be represented by surrogate pairs in
+ // WTF-16. Alternatively, we could represent string contents as WTF-16 to
+ // begin with.
+ if (hasNonAsciiUpTo(values, i + 1)) {
+ return Flow(NONCONSTANT_FLOW);
+ }
+
return Literal(values[i].geti32());
}
Flow visitStringIterNext(StringIterNext* curr) {
diff --git a/test/lit/passes/precompute-strings.wast b/test/lit/passes/precompute-strings.wast
index f5b1660bc..aa138b289 100644
--- a/test/lit/passes/precompute-strings.wast
+++ b/test/lit/passes/precompute-strings.wast
@@ -1,12 +1,15 @@
;; NOTE: Assertions have been generated by update_lit_checks.py and should not be edited.
-;; RUN: wasm-opt %s --precompute --fuzz-exec -all -S -o - | filecheck %s
+;; RUN: wasm-opt %s --precompute -all -S -o - | filecheck %s
(module
+ ;; CHECK: (type $array16 (array (mut i16)))
+ (type $array16 (array (mut i16)))
+
;; CHECK: (func $eq-no (type $0) (result i32)
;; CHECK-NEXT: (i32.const 0)
;; CHECK-NEXT: )
- (func $eq-no (export "eq-no") (result i32)
+ (func $eq-no (result i32)
(string.eq
(string.const "ab")
(string.const "cdefg")
@@ -16,7 +19,7 @@
;; CHECK: (func $eq-yes (type $0) (result i32)
;; CHECK-NEXT: (i32.const 1)
;; CHECK-NEXT: )
- (func $eq-yes (export "eq-yes") (result i32)
+ (func $eq-yes (result i32)
(string.eq
(string.const "ab")
(string.const "ab")
@@ -26,11 +29,104 @@
;; CHECK: (func $concat (type $0) (result i32)
;; CHECK-NEXT: (i32.const 1)
;; CHECK-NEXT: )
- (func $concat (export "concat") (result i32)
+ (func $concat (result i32)
(string.eq
(string.concat (string.const "a") (string.const "b"))
(string.const "ab")
)
)
-)
+ ;; CHECK: (func $length (type $0) (result i32)
+ ;; CHECK-NEXT: (i32.const 7)
+ ;; CHECK-NEXT: )
+ (func $length (result i32)
+ (stringview_wtf16.length
+ (string.as_wtf16
+ (string.const "1234567")
+ )
+ )
+ )
+
+ ;; CHECK: (func $length-bad (type $0) (result i32)
+ ;; CHECK-NEXT: (stringview_wtf16.length
+ ;; CHECK-NEXT: (string.as_wtf16
+ ;; CHECK-NEXT: (string.const "$_\c2\a3_\e2\82\ac_\f0\90\8d\88")
+ ;; CHECK-NEXT: )
+ ;; CHECK-NEXT: )
+ ;; CHECK-NEXT: )
+ (func $length-bad (result i32)
+ ;; Not precomputable because we don't handle unicode yet.
+ (stringview_wtf16.length
+ (string.as_wtf16
+ ;; $_£_€_𐍈
+ (string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88")
+ )
+ )
+ )
+
+ ;; CHECK: (func $get_codepoint (type $0) (result i32)
+ ;; CHECK-NEXT: (i32.const 95)
+ ;; CHECK-NEXT: )
+ (func $get_codepoint (result i32)
+ ;; This is computable because everything up to the requested index is ascii. Returns 95 ('_').
+ (stringview_wtf16.get_codeunit
+ (string.as_wtf16
+ ;; $_£_€_𐍈
+ (string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88")
+ )
+ (i32.const 1)
+ )
+ )
+
+ ;; CHECK: (func $get_codepoint-bad (type $0) (result i32)
+ ;; CHECK-NEXT: (stringview_wtf16.get_codeunit
+ ;; CHECK-NEXT: (string.as_wtf16
+ ;; CHECK-NEXT: (string.const "$_\c2\a3_\e2\82\ac_\f0\90\8d\88")
+ ;; CHECK-NEXT: )
+ ;; CHECK-NEXT: (i32.const 2)
+ ;; CHECK-NEXT: )
+ ;; CHECK-NEXT: )
+ (func $get_codepoint-bad (export "get_codepoint-bad") (result i32)
+ ;; This is not computable because the requested code unit is not ascii.
+ (stringview_wtf16.get_codeunit
+ (string.as_wtf16
+ ;; $_£_€_𐍈
+ (string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88")
+ )
+ (i32.const 2)
+ )
+ )
+
+ ;; CHECK: (func $encode (type $0) (result i32)
+ ;; CHECK-NEXT: (i32.const 2)
+ ;; CHECK-NEXT: )
+ (func $encode (result i32)
+ (string.encode_wtf16_array
+ (string.const "$_")
+ (array.new_default $array16
+ (i32.const 20)
+ )
+ (i32.const 0)
+ )
+ )
+
+ ;; CHECK: (func $encode-bad (type $0) (result i32)
+ ;; CHECK-NEXT: (string.encode_wtf16_array
+ ;; CHECK-NEXT: (string.const "$_\c2\a3_\e2\82\ac_\f0\90\8d\88")
+ ;; CHECK-NEXT: (array.new_default $array16
+ ;; CHECK-NEXT: (i32.const 20)
+ ;; CHECK-NEXT: )
+ ;; CHECK-NEXT: (i32.const 0)
+ ;; CHECK-NEXT: )
+ ;; CHECK-NEXT: )
+ (func $encode-bad (result i32)
+ (string.encode_wtf16_array
+ ;; $_£_€_𐍈
+ (string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88")
+ (array.new_default $array16
+ (i32.const 20)
+ )
+ (i32.const 0)
+ )
+ )
+)