diff options
author | Thomas Lively <tlively@google.com> | 2024-03-22 16:56:33 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-22 23:56:33 +0000 |
commit | b3fea30f84fef3ff7aa77775e00b83ba62d997cc (patch) | |
tree | 53494a466d8e56d34d849d14927817a22f843748 /test/lit/passes | |
parent | d3414c3deaebe7ba35731a8c20d7fa5f5a833ca3 (diff) | |
download | binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.tar.gz binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.tar.bz2 binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.zip |
[Strings] Represent string values as WTF-16 internally (#6418)
WTF-16, i.e. arbitrary sequences of 16-bit values, is the encoding of Java and
JavaScript strings, and using the same encoding makes the interpretation of
string operations trivial, even when accounting for non-ascii characters.
Specifically, use little-endian WTF-16.
Re-encode string constants from WTF-8 to WTF-16 in the parsers, then back to
WTF-8 in the writers. Update the constructor for string `Literal`s to interpret
the string as WTF-16 and store a sequence of WTF-16 code units, i.e. 16-bit
integers. Update `Builder::makeConstantExpression` accordingly to convert from
the new `Literal` string representation back to a WTF-16 string.
Update the interpreter to remove the logic for detecting non-ascii characters
and bailing out. The naive implementations of all the string operations are
correct now that our string encoding matches the JS string encoding.
Diffstat (limited to 'test/lit/passes')
-rw-r--r-- | test/lit/passes/precompute-strings.wast | 84 | ||||
-rw-r--r-- | test/lit/passes/string-lowering.wast | 18 |
2 files changed, 42 insertions, 60 deletions
diff --git a/test/lit/passes/precompute-strings.wast b/test/lit/passes/precompute-strings.wast index 58c7e52d0..6046ee6b1 100644 --- a/test/lit/passes/precompute-strings.wast +++ b/test/lit/passes/precompute-strings.wast @@ -12,13 +12,15 @@ ;; CHECK: (type $3 (func (result (ref any)))) - ;; CHECK: (export "get_codepoint-bad" (func $get_codepoint-bad)) + ;; CHECK: (export "get_codepoint-unicode" (func $get_codepoint-unicode)) + + ;; CHECK: (export "get_codepoint-surrogate" (func $get_codepoint-surrogate)) ;; CHECK: (export "test" (func $encode-stashed)) ;; CHECK: (export "slice" (func $slice)) - ;; CHECK: (export "slice-bad" (func $slice-bad)) + ;; CHECK: (export "slice-unicode" (func $slice-unicode)) ;; CHECK: (func $eq-no (type $0) (result i32) ;; CHECK-NEXT: (i32.const 0) @@ -50,19 +52,14 @@ ) ) - ;; CHECK: (func $concat-bad (type $0) (result i32) - ;; CHECK-NEXT: (string.eq - ;; CHECK-NEXT: (string.concat - ;; CHECK-NEXT: (string.const "a\f0") - ;; CHECK-NEXT: (string.const "b") - ;; CHECK-NEXT: ) - ;; CHECK-NEXT: (string.const "a\f0b") - ;; CHECK-NEXT: ) + ;; CHECK: (func $concat-surrogates (type $0) (result i32) + ;; CHECK-NEXT: (i32.const 1) ;; CHECK-NEXT: ) - (func $concat-bad (result i32) + (func $concat-surrogates (result i32) (string.eq - (string.concat (string.const "a\F0") (string.const "b")) - (string.const "a\F0b") + ;; Concatenating these surrogates creates '𐍈', which has a different UTF-8 encoding. + (string.concat (string.const "\ED\A0\80") (string.const "\ED\BD\88")) + (string.const "\F0\90\8D\88") ) ) @@ -77,18 +74,13 @@ ) ) - ;; CHECK: (func $length-bad (type $0) (result i32) - ;; CHECK-NEXT: (stringview_wtf16.length - ;; CHECK-NEXT: (string.as_wtf16 - ;; CHECK-NEXT: (string.const "$_\c2\a3_\e2\82\ac_\f0\90\8d\88") - ;; CHECK-NEXT: ) - ;; CHECK-NEXT: ) + ;; CHECK: (func $length-unicode (type $0) (result i32) + ;; CHECK-NEXT: (i32.const 8) ;; CHECK-NEXT: ) - (func $length-bad (result i32) - ;; Not precomputable because we don't handle unicode yet. + (func $length-unicode (result i32) (stringview_wtf16.length (string.as_wtf16 - ;; $_£_€_𐍈 + ;; $_£_€_𐍈 (the last character is encoded as a surrogate pair) (string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88") ) ) @@ -98,7 +90,7 @@ ;; CHECK-NEXT: (i32.const 95) ;; CHECK-NEXT: ) (func $get_codepoint (result i32) - ;; This is computable because everything up to the requested index is ascii. Returns 95 ('_'). + ;; Returns 95 ('_'). (stringview_wtf16.get_codeunit (string.as_wtf16 ;; $_£_€_𐍈 @@ -108,22 +100,31 @@ ) ) - ;; CHECK: (func $get_codepoint-bad (type $0) (result i32) - ;; CHECK-NEXT: (stringview_wtf16.get_codeunit - ;; CHECK-NEXT: (string.as_wtf16 - ;; CHECK-NEXT: (string.const "$_\c2\a3_\e2\82\ac_\f0\90\8d\88") - ;; CHECK-NEXT: ) - ;; CHECK-NEXT: (i32.const 2) - ;; CHECK-NEXT: ) + ;; CHECK: (func $get_codepoint-unicode (type $0) (result i32) + ;; CHECK-NEXT: (i32.const 8364) ;; CHECK-NEXT: ) - (func $get_codepoint-bad (export "get_codepoint-bad") (result i32) - ;; This is not computable because the requested code unit is not ascii. + (func $get_codepoint-unicode (export "get_codepoint-unicode") (result i32) + ;; Returns 8364 ('€') (stringview_wtf16.get_codeunit (string.as_wtf16 ;; $_£_€_𐍈 (string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88") ) - (i32.const 2) + (i32.const 4) + ) + ) + + ;; CHECK: (func $get_codepoint-surrogate (type $0) (result i32) + ;; CHECK-NEXT: (i32.const 55296) + ;; CHECK-NEXT: ) + (func $get_codepoint-surrogate (export "get_codepoint-surrogate") (result i32) + ;; Returns 0xd800 (the high surrogate in '𐍈') + (stringview_wtf16.get_codeunit + (string.as_wtf16 + ;; $_£_€_𐍈 + (string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88") + ) + (i32.const 6) ) ) @@ -148,7 +149,7 @@ ) ) - ;; CHECK: (func $encode-bad (type $0) (result i32) + ;; CHECK: (func $encode-unicode (type $0) (result i32) ;; CHECK-NEXT: (string.encode_wtf16_array ;; CHECK-NEXT: (string.const "$_\c2\a3_\e2\82\ac_\f0\90\8d\88") ;; CHECK-NEXT: (array.new_default $array16 @@ -157,7 +158,7 @@ ;; CHECK-NEXT: (i32.const 0) ;; CHECK-NEXT: ) ;; CHECK-NEXT: ) - (func $encode-bad (result i32) + (func $encode-unicode (result i32) (string.encode_wtf16_array ;; $_£_€_𐍈 (string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88") @@ -220,17 +221,10 @@ ) ) - ;; CHECK: (func $slice-bad (type $2) (result (ref string)) - ;; CHECK-NEXT: (stringview_wtf16.slice - ;; CHECK-NEXT: (string.as_wtf16 - ;; CHECK-NEXT: (string.const "abcd\c2\a3fgh") - ;; CHECK-NEXT: ) - ;; CHECK-NEXT: (i32.const 3) - ;; CHECK-NEXT: (i32.const 6) - ;; CHECK-NEXT: ) + ;; CHECK: (func $slice-unicode (type $2) (result (ref string)) + ;; CHECK-NEXT: (string.const "d\c2\a3f") ;; CHECK-NEXT: ) - (func $slice-bad (export "slice-bad") (result (ref string)) - ;; This slice contains non-ascii, so we do not optimize. + (func $slice-unicode (export "slice-unicode") (result (ref string)) (stringview_wtf16.slice ;; abcd£fgh (string.as_wtf16 diff --git a/test/lit/passes/string-lowering.wast b/test/lit/passes/string-lowering.wast index f7f47871b..c060bc8bd 100644 --- a/test/lit/passes/string-lowering.wast +++ b/test/lit/passes/string-lowering.wast @@ -16,18 +16,6 @@ (drop (string.const "needs\tescaping\00.'#%\"- .\r\n\\08\0C\0A\0D\09.ꙮ") ) - (drop - (string.const "invalid WTF-8 leading byte \FF") - ) - (drop - (string.const "invalid trailing byte \C0\00") - ) - (drop - (string.const "unexpected end \C0") - ) - (drop - (string.const "invalid surrogate sequence \ED\A0\81\ED\B0\B7") - ) ) ) @@ -36,7 +24,7 @@ ;; ;; RUN: wasm-opt %s --string-lowering -all -S -o - | filecheck %s ;; -;; CHECK: custom section "string.consts", size 202, contents: "[\"bar\",\"foo\",\"invalid WTF-8 leading byte \\ufffd\",\"invalid surrogate sequence \\ud801\\udc37\",\"invalid trailing byte \\ufffd\",\"needs\\tescaping\\u0000.'#%\\\"- .\\r\\n\\\\08\\f\\n\\r\\t.\\ua66e\",\"unexpected end \\ufffd\"]" +;; CHECK: custom section "string.consts", size 69, contents: "[\"bar\",\"foo\",\"needs\\tescaping\\u0000.'#%\\\"- .\\r\\n\\\\08\\f\\n\\r\\t.\\ua66e\"]" ;; The custom section should parse OK using JSON.parse from node. ;; (Note we run --remove-unused-module-elements to remove externref-using @@ -45,5 +33,5 @@ ;; RUN: wasm-opt %s --string-lowering --remove-unused-module-elements -all -o %t.wasm ;; RUN: node %S/string-lowering.js %t.wasm | filecheck %s --check-prefix=CHECK-JS ;; -;; CHECK-JS: string: ["bar","foo","invalid WTF-8 leading byte \ufffd","invalid surrogate sequence \ud801\udc37","invalid trailing byte \ufffd","needs\tescaping\x00.'#%\"- .\r\n\\08\f\n\r\t.\ua66e","unexpected end \ufffd"] -;; CHECK-JS: JSON: ["bar","foo","invalid WTF-8 leading byte �","invalid surrogate sequence 𐐷","invalid trailing byte �","needs\tescaping\x00.'#%\"- .\r\n\\08\f\n\r\t.ꙮ","unexpected end �"] +;; CHECK-JS: string: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\08\f\n\r\t.\ua66e"] +;; CHECK-JS: JSON: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\08\f\n\r\t.ꙮ"] |