[Fuzzing] Emit secondary wasm files in ClusterFuzz testcases (#7122)

The primary and secondary wasm files are then linked and run by fuzz_shell.js (we already had this functionality in order to fuzz wasm-split). By adding multiple build and run commands for both the primary and secondary wasm files, we can end up with multiple instances of two different wasm files that call each other. To help testing, add a script that extracts the wasm files from the testcase. This may also be useful in the future for testcase reduction.
author: Alon Zakai <azakai@google.com> 2024-11-26 15:12:36 -0800
committer: GitHub <noreply@github.com> 2024-11-26 15:12:36 -0800
commit: 73971d78e5355e8f08b4026b741992d78bd77476 (patch)
tree: e1f3b8761cb2c5a226e9b87daac954eeb5e91ed7
parent: 4ffe27255ce99d452d05d4b352e3f6e1e9ca7d83 (diff)
download: binaryen-73971d78e5355e8f08b4026b741992d78bd77476.tar.gz
binaryen-73971d78e5355e8f08b4026b741992d78bd77476.tar.bz2
binaryen-73971d78e5355e8f08b4026b741992d78bd77476.zip
3 files changed, 158 insertions, 27 deletions
diff --git a/scripts/clusterfuzz/extract_wasms.py b/scripts/clusterfuzz/extract_wasms.py
new file mode 100644
index 000000000..bb727810d
--- /dev/null
+++ b/scripts/clusterfuzz/extract_wasms.py
@@ -0,0 +1,74 @@
+#
+# Copyright 2024 WebAssembly Community Group participants
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Wasm extractor for testcases generated by the ClusterFuzz run.py script. Usage:
+
+extract_wasms.py INFILE.js OUTFILE
+
+That will find embedded wasm files in INFILE.js, of the form
+
+  var .. = new Uint8Array([..wasm_contents..]);
+
+and extract them into OUTFILE.0.wasm, OUTFILE.1.wasm, etc. It also emits
+OUTFILE.js which will no longer contain the embedded contents, after which the
+script can be run as
+
+  d8 OUTFILE.js -- OUTFILE.0.wasm
+
+That is, the embedded file can now be provided as a filename argument.
+'''
+
+import re
+import sys
+
+file_counter = 0
+
+
+def get_wasm_filename():
+    global file_counter
+    file_counter += 1
+    return f'{out}.{file_counter - 1}.wasm'
+
+
+in_js = sys.argv[1]
+out = sys.argv[2]
+
+with open(in_js) as f:
+    js = f.read()
+
+
+def repl(text):
+    # We found something of the form
+    #
+    #   var NAME = new Uint8Array([..binary data as numbers..]);
+    #
+    # Parse out the numbers into a binary wasm file.
+    numbers = text.groups()[0]
+    numbers = numbers.split(',')
+    numbers = [int(n) for n in numbers]
+    with open(get_wasm_filename(), 'wb') as f:
+        f.write(bytes(numbers))
+
+    # Replace it with nothing.
+    return ''
+
+
+# Replace the wasm files and write them out.
+js = re.sub(r'var \w+ = new Uint8Array\(\[([\d,]+)\]\);', repl, js)
+
+# Write out the new JS.
+with open(f'{out}.js', 'w') as f:
+    f.write(js)
diff --git a/scripts/clusterfuzz/run.py b/scripts/clusterfuzz/run.py
index 6bbb74ef8..8ac880e0d 100755
--- a/scripts/clusterfuzz/run.py
+++ b/scripts/clusterfuzz/run.py
@@ -150,7 +150,18 @@ def get_js_file_contents(i, output_dir):
     # Prepend the wasm contents, so they are used (rather than the normal
     # mechanism where the wasm file's name is provided in argv).
     wasm_contents = get_wasm_contents(i, output_dir)
-    js = f'var binary = {wasm_contents};\n\n' + js
+    pre = f'var binary = {wasm_contents};\n'
+    bytes = wasm_contents.count(',')
+
+    # Sometimes add a second wasm file as well.
+    has_second = False
+    if system_random.random() < 0.333:
+        has_second = True
+        wasm_contents = get_wasm_contents(i, output_dir)
+        pre += f'var secondBinary = {wasm_contents};\n'
+        bytes += wasm_contents.count(',')
+
+    js = pre + '\n' + js
 
     # The default JS builds and runs the wasm. Append some random additional
     # operations as well, as more compiles and executions can find things. To
@@ -171,16 +182,23 @@ def get_js_file_contents(i, output_dir):
     x = math.pow(x, power)
     num = math.floor(x * MAX_EXTRA_JS_OPERATIONS)
     assert num >= 0 and num <= MAX_EXTRA_JS_OPERATIONS
+
+    extra_js_operations = [
+        # Compile and link the wasm again. Each link adds more to the total
+        # exports that we can call.
+        'build(binary);\n',
+        # Run all the exports we've accumulated.
+        'callExports();\n',
+    ]
+    if has_second:
+        extra_js_operations += [
+            'build(secondBinary);\n',
+        ]
+
     for i in range(num):
-        js += system_random.choice([
-            # Compile and link the wasm again. Each link adds more to the total
-            # exports that we can call.
-            'build(binary);\n',
-            # Run all the exports we've accumulated.
-            'callExports();\n',
-        ])
-
-    print(f'Created {wasm_contents.count(",")} wasm bytes')
+        js += system_random.choice(extra_js_operations)
+
+    print(f'Created {bytes} wasm bytes')
 
     return js
 
diff --git a/test/unit/test_cluster_fuzz.py b/test/unit/test_cluster_fuzz.py
index 387f65fd1..56250d46a 100644
--- a/test/unit/test_cluster_fuzz.py
+++ b/test/unit/test_cluster_fuzz.py
@@ -1,3 +1,4 @@
+import glob
 import os
 import platform
 import re
@@ -159,6 +160,9 @@ class ClusterFuzz(utils.BinaryenTestCase):
         seen_sizes = []
         seen_exports = []
 
+        # Second wasm files are also emitted sometimes.
+        seen_second_sizes = []
+
         # The number of struct.news appears in the metrics report like this:
         #
         # StructNew      : 18
@@ -179,23 +183,16 @@ class ClusterFuzz(utils.BinaryenTestCase):
             with open(flags_file) as f:
                 self.assertEqual(f.read(), '--wasm-staging')
 
-            # The fuzz files begin with
-            #
-            #   var binary = new Uint8Array([..binary data as numbers..]);
-            #
-            with open(fuzz_file) as f:
-                first_line = f.readline().strip()
-                start = 'var binary = new Uint8Array(['
-                end = ']);'
-                self.assertTrue(first_line.startswith(start))
-                self.assertTrue(first_line.endswith(end))
-                numbers = first_line[len(start):-len(end)]
-
-            # Convert to binary, and see that it is a valid file.
-            numbers_array = [int(x) for x in numbers.split(',')]
-            binary_file = os.path.join(temp_dir.name, 'file.wasm')
-            with open(binary_file, 'wb') as f:
-                f.write(bytes(numbers_array))
+            # Extract the wasm file(s) from the JS. First remove any stale
+            # files left over from earlier extractions.
+            for f in glob.glob('extracted*'):
+                os.unlink(f)
+            extractor = shared.in_binaryen('scripts', 'clusterfuzz', 'extract_wasms.py')
+            subprocess.check_call([sys.executable, extractor, fuzz_file, 'extracted'])
+
+            # One wasm file must always exist, and must be valid.
+            binary_file = 'extracted.0.wasm'
+            assert os.path.exists(binary_file)
             metrics = subprocess.check_output(
                 shared.WASM_OPT + ['-all', '--metrics', binary_file, '-q'], text=True)
 
@@ -215,6 +212,19 @@ class ClusterFuzz(utils.BinaryenTestCase):
             self.assertEqual(len(exports), 1)
             seen_exports.append(int(exports[0]))
 
+            # Sometimes a second wasm file should exist, and it must be valid
+            # too.
+            second_binary_file = 'extracted.1.wasm'
+            if os.path.exists(second_binary_file):
+                subprocess.check_call(
+                    shared.WASM_OPT + ['-all', second_binary_file, '-q'])
+
+                # Note its size (we leave detailed metrics for the first one;
+                # they are generated by the same logic in run.py, so just
+                # verifying some valid second wasms are emitted, of random
+                # sizes, is enough).
+                seen_second_sizes.append(os.path.getsize(second_binary_file))
+
         print()
 
         print('struct.news are distributed as ~ mean 15, stddev 24, median 10')
@@ -247,10 +257,27 @@ class ClusterFuzz(utils.BinaryenTestCase):
 
         print()
 
+        # Second files appear in ~ 1/3 of testcases.
+        print('number of second wasms should be around 33 +- 8')
+        print(f'number of second wasms: {len(seen_second_sizes)}')
+        assert seen_second_sizes, 'must see at least one second wasm'
+        print('second sizes are distributed as ~ mean 2933, stddev 2011, median 2510')
+        print(f'mean sizes:   {statistics.mean(seen_second_sizes)}')
+        print(f'stdev sizes:  {statistics.stdev(seen_second_sizes)}')
+        print(f'median sizes: {statistics.median(seen_second_sizes)}')
+        # Relax the assert on the max seen second size compared to the max seen
+        # primary size, as we see fewer of these. 500 is still proof of an
+        # interesting wasm file.
+        self.assertGreaterEqual(max(seen_second_sizes), 500)
+        self.assertGreater(statistics.stdev(seen_second_sizes), 0)
+
+        print()
+
         # To check for interesting JS file contents, we'll note how many times
         # we build and run the wasm.
         seen_builds = []
         seen_calls = []
+        seen_second_builds = []
 
         for i in range(1, N + 1):
             fuzz_file = os.path.join(temp_dir.name, f'fuzz-binaryen-{i}.js')
@@ -258,6 +285,7 @@ class ClusterFuzz(utils.BinaryenTestCase):
                 js = f.read()
             seen_builds.append(js.count('build(binary);'))
             seen_calls.append(js.count('callExports();'))
+            seen_second_builds.append(js.count('build(secondBinary);'))
 
         # There is always one build and one call (those are in the default
         # fuzz_shell.js), and we add a couple of operations, each with equal
@@ -284,6 +312,17 @@ class ClusterFuzz(utils.BinaryenTestCase):
 
         print()
 
+        # Second wasm files are more rarely added, only 1/3 of the time or so,
+        # but over 100 samples we are still overwhelmingly likely to see one.
+        print('JS second builds are distributed as ~ mean 1.8, stddev 2.2, median 1')
+        print(f'mean JS second builds:   {statistics.mean(seen_second_builds)}')
+        print(f'stdev JS second builds:  {statistics.stdev(seen_second_builds)}')
+        print(f'median JS second builds: {statistics.median(seen_second_builds)}')
+        self.assertGreaterEqual(max(seen_second_builds), 2)
+        self.assertGreater(statistics.stdev(seen_second_builds), 0)
+
+        print()
+
     # "zzz" in test name so that this runs last. If it runs first, it can be
     # confusing as it appears next to the logging of which bundle we use (see
     # setUpClass).
author	Alon Zakai <azakai@google.com>	2024-11-26 15:12:36 -0800
committer	GitHub <noreply@github.com>	2024-11-26 15:12:36 -0800
commit	73971d78e5355e8f08b4026b741992d78bd77476 (patch)
tree	e1f3b8761cb2c5a226e9b87daac954eeb5e91ed7
parent	4ffe27255ce99d452d05d4b352e3f6e1e9ca7d83 (diff)
download	binaryen-73971d78e5355e8f08b4026b741992d78bd77476.tar.gz binaryen-73971d78e5355e8f08b4026b741992d78bd77476.tar.bz2 binaryen-73971d78e5355e8f08b4026b741992d78bd77476.zip