[Strings] Represent string values as WTF-16 internally (#6418)

WTF-16, i.e. arbitrary sequences of 16-bit values, is the encoding of Java and JavaScript strings, and using the same encoding makes the interpretation of string operations trivial, even when accounting for non-ASCII characters. Specifically, use little-endian WTF-16. Re-encode string constants from WTF-8 to WTF-16 in the parsers, then back to WTF-8 in the writers. Update the constructor for string `Literal`s to interpret the string as WTF-16 and store a sequence of WTF-16 code units, i.e. 16-bit integers. Update `Builder::makeConstantExpression` accordingly to convert from the new `Literal` string representation back to a WTF-16 string. Update the interpreter to remove the logic for detecting non-ASCII characters and bailing out. The naive implementations of all the string operations are correct now that our string encoding matches the JS string encoding.
author: Thomas Lively <tlively@google.com> 2024-03-22 16:56:33 -0700
committer: GitHub <noreply@github.com> 2024-03-22 23:56:33 +0000
commit: b3fea30f84fef3ff7aa77775e00b83ba62d997cc (patch)
tree: 53494a466d8e56d34d849d14927817a22f843748 /test/lit/passes
parent: d3414c3deaebe7ba35731a8c20d7fa5f5a833ca3 (diff)
download: binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.tar.gz
binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.tar.bz2
binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.zip
2 files changed, 42 insertions, 60 deletions
diff --git a/test/lit/passes/precompute-strings.wast b/test/lit/passes/precompute-strings.wast
index 58c7e52d0..6046ee6b1 100644
--- a/test/lit/passes/precompute-strings.wast
+++ b/test/lit/passes/precompute-strings.wast
@@ -12,13 +12,15 @@
 
  ;; CHECK:      (type $3 (func (result (ref any))))
 
- ;; CHECK:      (export "get_codepoint-bad" (func $get_codepoint-bad))
+ ;; CHECK:      (export "get_codepoint-unicode" (func $get_codepoint-unicode))
+
+ ;; CHECK:      (export "get_codepoint-surrogate" (func $get_codepoint-surrogate))
 
  ;; CHECK:      (export "test" (func $encode-stashed))
 
  ;; CHECK:      (export "slice" (func $slice))
 
- ;; CHECK:      (export "slice-bad" (func $slice-bad))
+ ;; CHECK:      (export "slice-unicode" (func $slice-unicode))
 
  ;; CHECK:      (func $eq-no (type $0) (result i32)
  ;; CHECK-NEXT:  (i32.const 0)
@@ -50,19 +52,14 @@
   )
  )
 
- ;; CHECK:      (func $concat-bad (type $0) (result i32)
- ;; CHECK-NEXT:  (string.eq
- ;; CHECK-NEXT:   (string.concat
- ;; CHECK-NEXT:    (string.const "a\f0")
- ;; CHECK-NEXT:    (string.const "b")
- ;; CHECK-NEXT:   )
- ;; CHECK-NEXT:   (string.const "a\f0b")
- ;; CHECK-NEXT:  )
+ ;; CHECK:      (func $concat-surrogates (type $0) (result i32)
+ ;; CHECK-NEXT:  (i32.const 1)
  ;; CHECK-NEXT: )
- (func $concat-bad (result i32)
+ (func $concat-surrogates (result i32)
   (string.eq
-   (string.concat (string.const "a\F0") (string.const "b"))
-   (string.const "a\F0b")
+   ;; Concatenating these surrogates creates '𐍈', which has a different UTF-8 encoding.
+   (string.concat (string.const "\ED\A0\80") (string.const "\ED\BD\88"))
+   (string.const "\F0\90\8D\88")
   )
  )
 
@@ -77,18 +74,13 @@
   )
  )
 
- ;; CHECK:      (func $length-bad (type $0) (result i32)
- ;; CHECK-NEXT:  (stringview_wtf16.length
- ;; CHECK-NEXT:   (string.as_wtf16
- ;; CHECK-NEXT:    (string.const "$_\c2\a3_\e2\82\ac_\f0\90\8d\88")
- ;; CHECK-NEXT:   )
- ;; CHECK-NEXT:  )
+ ;; CHECK:      (func $length-unicode (type $0) (result i32)
+ ;; CHECK-NEXT:  (i32.const 8)
  ;; CHECK-NEXT: )
- (func $length-bad (result i32)
-  ;; Not precomputable because we don't handle unicode yet.
+ (func $length-unicode (result i32)
   (stringview_wtf16.length
    (string.as_wtf16
-    ;; $_£_€_𐍈
+    ;; $_£_€_𐍈 (the last character is encoded as a surrogate pair)
     (string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88")
    )
   )
@@ -98,7 +90,7 @@
  ;; CHECK-NEXT:  (i32.const 95)
  ;; CHECK-NEXT: )
  (func $get_codepoint (result i32)
-  ;; This is computable because everything up to the requested index is ascii. Returns 95 ('_').
+  ;; Returns 95 ('_').
   (stringview_wtf16.get_codeunit
    (string.as_wtf16
     ;; $_£_€_𐍈
@@ -108,22 +100,31 @@
   )
  )
 
- ;; CHECK:      (func $get_codepoint-bad (type $0) (result i32)
- ;; CHECK-NEXT:  (stringview_wtf16.get_codeunit
- ;; CHECK-NEXT:   (string.as_wtf16
- ;; CHECK-NEXT:    (string.const "$_\c2\a3_\e2\82\ac_\f0\90\8d\88")
- ;; CHECK-NEXT:   )
- ;; CHECK-NEXT:   (i32.const 2)
- ;; CHECK-NEXT:  )
+ ;; CHECK:      (func $get_codepoint-unicode (type $0) (result i32)
+ ;; CHECK-NEXT:  (i32.const 8364)
  ;; CHECK-NEXT: )
- (func $get_codepoint-bad (export "get_codepoint-bad") (result i32)
-  ;; This is not computable because the requested code unit is not ascii.
+ (func $get_codepoint-unicode (export "get_codepoint-unicode") (result i32)
+  ;; Returns 8364 ('€')
   (stringview_wtf16.get_codeunit
    (string.as_wtf16
     ;; $_£_€_𐍈
     (string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88")
    )
-   (i32.const 2)
+   (i32.const 4)
+  )
+ )
+
+ ;; CHECK:      (func $get_codepoint-surrogate (type $0) (result i32)
+ ;; CHECK-NEXT:  (i32.const 55296)
+ ;; CHECK-NEXT: )
+ (func $get_codepoint-surrogate (export "get_codepoint-surrogate") (result i32)
+  ;; Returns 0xd800 (the high surrogate in '𐍈')
+  (stringview_wtf16.get_codeunit
+   (string.as_wtf16
+    ;; $_£_€_𐍈
+    (string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88")
+   )
+   (i32.const 6)
   )
  )
 
@@ -148,7 +149,7 @@
   )
  )
 
- ;; CHECK:      (func $encode-bad (type $0) (result i32)
+ ;; CHECK:      (func $encode-unicode (type $0) (result i32)
  ;; CHECK-NEXT:  (string.encode_wtf16_array
  ;; CHECK-NEXT:   (string.const "$_\c2\a3_\e2\82\ac_\f0\90\8d\88")
  ;; CHECK-NEXT:   (array.new_default $array16
@@ -157,7 +158,7 @@
  ;; CHECK-NEXT:   (i32.const 0)
  ;; CHECK-NEXT:  )
  ;; CHECK-NEXT: )
- (func $encode-bad (result i32)
+ (func $encode-unicode (result i32)
   (string.encode_wtf16_array
    ;; $_£_€_𐍈
    (string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88")
@@ -220,17 +221,10 @@
   )
  )
 
- ;; CHECK:      (func $slice-bad (type $2) (result (ref string))
- ;; CHECK-NEXT:  (stringview_wtf16.slice
- ;; CHECK-NEXT:   (string.as_wtf16
- ;; CHECK-NEXT:    (string.const "abcd\c2\a3fgh")
- ;; CHECK-NEXT:   )
- ;; CHECK-NEXT:   (i32.const 3)
- ;; CHECK-NEXT:   (i32.const 6)
- ;; CHECK-NEXT:  )
+ ;; CHECK:      (func $slice-unicode (type $2) (result (ref string))
+ ;; CHECK-NEXT:  (string.const "d\c2\a3f")
  ;; CHECK-NEXT: )
- (func $slice-bad (export "slice-bad") (result (ref string))
-  ;; This slice contains non-ascii, so we do not optimize.
+ (func $slice-unicode (export "slice-unicode") (result (ref string))
   (stringview_wtf16.slice
    ;; abcd£fgh
    (string.as_wtf16
diff --git a/test/lit/passes/string-lowering.wast b/test/lit/passes/string-lowering.wast
index f7f47871b..c060bc8bd 100644
--- a/test/lit/passes/string-lowering.wast
+++ b/test/lit/passes/string-lowering.wast
@@ -16,18 +16,6 @@
     (drop
       (string.const "needs\tescaping\00.'#%\"- .\r\n\\08\0C\0A\0D\09.ꙮ")
     )
-    (drop
-      (string.const "invalid WTF-8 leading byte \FF")
-    )
-    (drop
-      (string.const "invalid trailing byte \C0\00")
-    )
-    (drop
-      (string.const "unexpected end \C0")
-    )
-    (drop
-      (string.const "invalid surrogate sequence \ED\A0\81\ED\B0\B7")
-    )
   )
 )
 
@@ -36,7 +24,7 @@
 ;;
 ;; RUN: wasm-opt %s --string-lowering -all -S -o - | filecheck %s
 ;;
-;; CHECK: custom section "string.consts", size 202, contents: "[\"bar\",\"foo\",\"invalid WTF-8 leading byte \\ufffd\",\"invalid surrogate sequence \\ud801\\udc37\",\"invalid trailing byte \\ufffd\",\"needs\\tescaping\\u0000.'#%\\\"- .\\r\\n\\\\08\\f\\n\\r\\t.\\ua66e\",\"unexpected end \\ufffd\"]"
+;; CHECK: custom section "string.consts", size 69, contents: "[\"bar\",\"foo\",\"needs\\tescaping\\u0000.'#%\\\"- .\\r\\n\\\\08\\f\\n\\r\\t.\\ua66e\"]"
 
 ;; The custom section should parse OK using JSON.parse from node.
 ;; (Note we run --remove-unused-module-elements to remove externref-using
@@ -45,5 +33,5 @@
 ;; RUN: wasm-opt %s --string-lowering --remove-unused-module-elements -all -o %t.wasm
 ;; RUN: node %S/string-lowering.js %t.wasm | filecheck %s --check-prefix=CHECK-JS
 ;;
-;; CHECK-JS: string: ["bar","foo","invalid WTF-8 leading byte \ufffd","invalid surrogate sequence \ud801\udc37","invalid trailing byte \ufffd","needs\tescaping\x00.'#%\"- .\r\n\\08\f\n\r\t.\ua66e","unexpected end \ufffd"]
-;; CHECK-JS: JSON: ["bar","foo","invalid WTF-8 leading byte �","invalid surrogate sequence 𐐷","invalid trailing byte �","needs\tescaping\x00.'#%\"- .\r\n\\08\f\n\r\t.ꙮ","unexpected end �"]
+;; CHECK-JS: string: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\08\f\n\r\t.\ua66e"]
+;; CHECK-JS: JSON: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\08\f\n\r\t.ꙮ"]
author	Thomas Lively <tlively@google.com>	2024-03-22 16:56:33 -0700
committer	GitHub <noreply@github.com>	2024-03-22 23:56:33 +0000
commit	b3fea30f84fef3ff7aa77775e00b83ba62d997cc (patch)
tree	53494a466d8e56d34d849d14927817a22f843748 /test/lit/passes
parent	d3414c3deaebe7ba35731a8c20d7fa5f5a833ca3 (diff)
download	binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.tar.gz binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.tar.bz2 binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.zip