[Strings] Escape strings printed by fuzz-exec (#6441)

Previously we printed strings as WTF-8 in the output of fuzz-exec, but this could produce invalid Unicode output and did not make unprintable characters visible. Fix both of these problems by escaping the output, using the JSON string escape procedure since the strings to be escaped are WTF-16. Reimplement the same escaping procedure in fuzz_shell.js so that the way we print strings when running on a real JS engine matches the way we print them in our own fuzz-exec interpreter. Fixes #6435.
author: Thomas Lively <tlively@google.com> 2024-03-26 10:44:37 -0700
committer: GitHub <noreply@github.com> 2024-03-26 10:44:37 -0700
commit: 431e858c4f4ac0343914eb42196f8bb64ac99023 (patch)
tree: 7071e42b72b2cb49c9a15845c5fe1675d3ebd4bf
parent: c9a5da466df084da5c0bbcb03b56aa1bd9585dcd (diff)
download: binaryen-431e858c4f4ac0343914eb42196f8bb64ac99023.tar.gz
binaryen-431e858c4f4ac0343914eb42196f8bb64ac99023.tar.bz2
binaryen-431e858c4f4ac0343914eb42196f8bb64ac99023.zip
3 files changed, 92 insertions, 12 deletions
diff --git a/scripts/fuzz_shell.js b/scripts/fuzz_shell.js
index be65ce31c..1e4068dc8 100644
--- a/scripts/fuzz_shell.js
+++ b/scripts/fuzz_shell.js
@@ -48,8 +48,58 @@ function printed(x, y) {
     // 'object', below.
     return 'null';
   } else if (typeof x === 'string') {
-    // Emit a string in the same format as the binaryen interpreter.
-    return 'string("' + x + '")';
+    // Emit a string in the same format as the binaryen interpreter. This
+    // escaping routine must be kept in sync with String::printEscapedJSON.
+    var escaped = '';
+    for (u of x) {
+      switch (u) {
+        case '"':
+          escaped += '\\"';
+          continue;
+        case '\\':
+          escaped += '\\\\';
+          continue;
+        case '\b':
+          escaped += '\\b';
+          continue;
+        case '\f':
+          escaped += '\\f';
+          continue;
+        case '\n':
+          escaped += '\\n';
+          continue;
+        case '\r':
+          escaped += '\\r';
+          continue;
+        case '\t':
+          escaped += '\\t';
+          continue;
+        default:
+          break;
+      }
+
+      var codePoint = u.codePointAt(0);
+      if (32 <= codePoint && codePoint < 127) {
+        escaped += u;
+        continue
+      }
+
+      var printEscape = (codePoint) => {
+        escaped += '\\u'
+        escaped += ((codePoint & 0xF000) >> 12).toString(16);
+        escaped += ((codePoint & 0x0F00) >> 8).toString(16);
+        escaped += ((codePoint & 0x00F0) >> 4).toString(16);
+        escaped += (codePoint & 0x000F).toString(16);
+      };
+
+      if (codePoint < 0x10000) {
+        printEscape(codePoint);
+      } else {
+        printEscape(0xD800 + ((codePoint - 0x10000) >> 10));
+        printEscape(0xDC00 + ((codePoint - 0x10000) & 0x3FF));
+      }
+    }
+    return 'string("' + escaped + '")';
   } else if (typeof x === 'bigint') {
     // Print bigints in legalized form, which is two 32-bit numbers of the low
     // and high bits.
@@ -146,4 +196,3 @@ for (var e in exports) {
     console.log('exception thrown: ' + e);
   }
 }
-
diff --git a/src/wasm/literal.cpp b/src/wasm/literal.cpp
index afdc14c72..887c777ec 100644
--- a/src/wasm/literal.cpp
+++ b/src/wasm/literal.cpp
@@ -639,7 +639,7 @@ std::ostream& operator<<(std::ostream& o, Literal literal) {
           if (!data) {
             o << "nullstring";
           } else {
-            o << "string(\"";
+            o << "string(";
             // Convert WTF-16 literals to WTF-16 string.
             std::stringstream wtf16;
             for (auto c : data->values) {
@@ -648,12 +648,11 @@ std::ostream& operator<<(std::ostream& o, Literal literal) {
               wtf16 << uint8_t(u & 0xFF);
               wtf16 << uint8_t(u >> 8);
             }
-            // Convert to WTF-8 for printing.
+            // Escape to ensure we have valid unicode output and to make
+            // unprintable characters visible.
             // TODO: Use wtf16.view() once we have C++20.
-            [[maybe_unused]] bool valid =
-              String::convertWTF16ToWTF8(o, wtf16.str());
-            assert(valid);
-            o << "\")";
+            String::printEscapedJSON(o, wtf16.str());
+            o << ")";
           }
           break;
         }
diff --git a/test/lit/exec/strings.wast b/test/lit/exec/strings.wast
index 106e1e214..4fb17a9e3 100644
--- a/test/lit/exec/strings.wast
+++ b/test/lit/exec/strings.wast
@@ -7,7 +7,7 @@
 
   (memory 1 1)
 
-  (import "fuzzing-support" "log" (func $log (param i32)))
+  (import "fuzzing-support" "log-i32" (func $log (param i32)))
 
   ;; CHECK:      [fuzz-exec] calling new_wtf16_array
   ;; CHECK-NEXT: [fuzz-exec] note result: new_wtf16_array => string("ello")
@@ -280,7 +280,9 @@
   (func $slice (export "slice") (result (ref string))
     ;; Slicing [3:6] here should definitely output "def".
     (stringview_wtf16.slice
-      (string.const "abcdefgh")
+      (string.as_wtf16
+        (string.const "abcdefgh")
+      )
       (i32.const 3)
       (i32.const 6)
     )
@@ -291,7 +293,9 @@
   (func $slice-big (export "slice-big") (result (ref string))
     ;; Slicing [3:huge unsigned value] leads to slicing til the end: "defgh".
     (stringview_wtf16.slice
-      (string.const "abcdefgh")
+      (string.as_wtf16
+        (string.const "abcdefgh")
+      )
       (i32.const 3)
       (i32.const -1)
     )
@@ -337,6 +341,26 @@
       (i32.const 1)
     )
   )
+
+  ;; CHECK:      [fuzz-exec] calling slice-unicode
+  ;; CHECK-NEXT: [fuzz-exec] note result: slice-unicode => string("d\u00a3f")
+  (func $slice-unicode (export "slice-unicode") (result (ref string))
+    (stringview_wtf16.slice
+      ;; abcd£fgh
+      (string.as_wtf16
+        (string.const "abcd\C2\A3fgh")
+      )
+      (i32.const 3)
+      (i32.const 6)
+    )
+  )
+
+  ;; CHECK:      [fuzz-exec] calling concat-surrogates
+  ;; CHECK-NEXT: [fuzz-exec] note result: concat-surrogates => string("\ud800\udf48")
+  (func $concat-surrogates (export "concat-surrogates") (result (ref string))
+    ;; Concatenating these surrogates creates '𐍈'.
+    (string.concat (string.const "\ED\A0\80") (string.const "\ED\BD\88"))
+  )
 )
 ;; CHECK:      [fuzz-exec] calling new_wtf16_array
 ;; CHECK-NEXT: [fuzz-exec] note result: new_wtf16_array => string("ello")
@@ -423,6 +447,12 @@
 
 ;; CHECK:      [fuzz-exec] calling new_empty_oob_2
 ;; CHECK-NEXT: [trap array oob]
+
+;; CHECK:      [fuzz-exec] calling slice-unicode
+;; CHECK-NEXT: [fuzz-exec] note result: slice-unicode => string("d\u00a3f")
+
+;; CHECK:      [fuzz-exec] calling concat-surrogates
+;; CHECK-NEXT: [fuzz-exec] note result: concat-surrogates => string("\ud800\udf48")
 ;; CHECK-NEXT: [fuzz-exec] comparing compare.1
 ;; CHECK-NEXT: [fuzz-exec] comparing compare.10
 ;; CHECK-NEXT: [fuzz-exec] comparing compare.2
@@ -433,6 +463,7 @@
 ;; CHECK-NEXT: [fuzz-exec] comparing compare.7
 ;; CHECK-NEXT: [fuzz-exec] comparing compare.8
 ;; CHECK-NEXT: [fuzz-exec] comparing compare.9
+;; CHECK-NEXT: [fuzz-exec] comparing concat-surrogates
 ;; CHECK-NEXT: [fuzz-exec] comparing const
 ;; CHECK-NEXT: [fuzz-exec] comparing encode
 ;; CHECK-NEXT: [fuzz-exec] comparing encode-overflow
@@ -450,3 +481,4 @@
 ;; CHECK-NEXT: [fuzz-exec] comparing new_wtf16_array
 ;; CHECK-NEXT: [fuzz-exec] comparing slice
 ;; CHECK-NEXT: [fuzz-exec] comparing slice-big
+;; CHECK-NEXT: [fuzz-exec] comparing slice-unicode
author	Thomas Lively <tlively@google.com>	2024-03-26 10:44:37 -0700
committer	GitHub <noreply@github.com>	2024-03-26 10:44:37 -0700
commit	431e858c4f4ac0343914eb42196f8bb64ac99023 (patch)
tree	7071e42b72b2cb49c9a15845c5fe1675d3ebd4bf
parent	c9a5da466df084da5c0bbcb03b56aa1bd9585dcd (diff)
download	binaryen-431e858c4f4ac0343914eb42196f8bb64ac99023.tar.gz binaryen-431e858c4f4ac0343914eb42196f8bb64ac99023.tar.bz2 binaryen-431e858c4f4ac0343914eb42196f8bb64ac99023.zip