3 files changed, 158 insertions, 27 deletions
diff --git a/scripts/clusterfuzz/extract_wasms.py b/scripts/clusterfuzz/extract_wasms.py
new file mode 100644
index 000000000..bb727810d
--- /dev/null
+++ b/scripts/clusterfuzz/extract_wasms.py
@@ -0,0 +1,74 @@
+#
+# Copyright 2024 WebAssembly Community Group participants
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Wasm extractor for testcases generated by the ClusterFuzz run.py script. Usage:
+
+extract_wasms.py INFILE.js OUTFILE
+
+That will find embedded wasm files in INFILE.js, of the form
+
+  var .. = new Uint8Array([..wasm_contents..]);
+
+and extract them into OUTFILE.0.wasm, OUTFILE.1.wasm, etc. It also emits
+OUTFILE.js which will no longer contain the embedded contents, after which the
+script can be run as
+
+  d8 OUTFILE.js -- OUTFILE.0.wasm
+
+That is, the embedded file can now be provided as a filename argument.
+'''
+
+import re
+import sys
+
+file_counter = 0
+
+
+def get_wasm_filename():
+    global file_counter  # Number of wasm filenames handed out so far.
+    index, file_counter = file_counter, file_counter + 1
+    return f'{out}.{index}.wasm'
+
+
+# Usage: extract_wasms.py INFILE.js OUTFILE (both arguments are required).
+in_js, out = sys.argv[1], sys.argv[2]
+
+with open(in_js) as f:
+    js = f.read()
+
+
+def repl(text):
+    # Callback for re.sub: |text| is the match object for one embedded wasm,
+    #
+    #   var binary = new Uint8Array([..binary data as numbers..]);
+    #
+    # and its only capture group is the comma-separated list of byte values.
+    # Convert them to integers (int() raises ValueError on an empty item, such
+    # as one left by a trailing comma) and save them as the next wasm file.
+    byte_values = [int(item) for item in text.group(1).split(',')]
+    with open(get_wasm_filename(), 'wb') as wasm_file:
+        wasm_file.write(bytes(byte_values))
+
+    # Returning the empty string deletes the declaration from the JS.
+    return ''
+
+
+# Save each embedded wasm to its own file (see repl) and drop it from the JS.
+js = re.sub(r'var \w+ = new Uint8Array\(\[([\d,]+)\]\);', repl, js)
+
+# Write the remaining JS, without the embedded wasms, to OUTFILE.js.
+with open(f'{out}.js', 'w') as f:
+    f.write(js)
diff --git a/scripts/clusterfuzz/run.py b/scripts/clusterfuzz/run.py
index 6bbb74ef8..8ac880e0d 100755
--- a/scripts/clusterfuzz/run.py
+++ b/scripts/clusterfuzz/run.py
@@ -150,7 +150,18 @@ def get_js_file_contents(i, output_dir):
     # Prepend the wasm contents, so they are used (rather than the normal
     # mechanism where the wasm file's name is provided in argv).
     wasm_contents = get_wasm_contents(i, output_dir)
-    js = f'var binary = {wasm_contents};\n\n' + js
+    pre = f'var binary = {wasm_contents};\n'
+    bytes = wasm_contents.count(',')
+
+    # Sometimes add a second wasm file as well.
+    has_second = False
+    if system_random.random() < 0.333:
+        has_second = True
+        wasm_contents = get_wasm_contents(i, output_dir)
+        pre += f'var secondBinary = {wasm_contents};\n'
+        bytes += wasm_contents.count(',')
+
+    js = pre + '\n' + js
 
     # The default JS builds and runs the wasm. Append some random additional
     # operations as well, as more compiles and executions can find things. To
@@ -171,16 +182,23 @@ def get_js_file_contents(i, output_dir):
     x = math.pow(x, power)
     num = math.floor(x * MAX_EXTRA_JS_OPERATIONS)
     assert num >= 0 and num <= MAX_EXTRA_JS_OPERATIONS
+
+    extra_js_operations = [
+        # Compile and link the wasm again. Each link adds more to the total
+        # exports that we can call.
+        'build(binary);\n',
+        # Run all the exports we've accumulated.
+        'callExports();\n',
+    ]
+    if has_second:
+        extra_js_operations += [
+            'build(secondBinary);\n',
+        ]
+
     for i in range(num):
-        js += system_random.choice([
-            # Compile and link the wasm again. Each link adds more to the total
-            # exports that we can call.
-            'build(binary);\n',
-            # Run all the exports we've accumulated.
-            'callExports();\n',
-        ])
-
-    print(f'Created {wasm_contents.count(",")} wasm bytes')
+        js += system_random.choice(extra_js_operations)
+
+    print(f'Created {bytes} wasm bytes')
 
     return js
 
diff --git a/test/unit/test_cluster_fuzz.py b/test/unit/test_cluster_fuzz.py
index 387f65fd1..56250d46a 100644
--- a/test/unit/test_cluster_fuzz.py
+++ b/test/unit/test_cluster_fuzz.py
@@ -1,3 +1,4 @@
+import glob
 import os
 import platform
 import re
@@ -159,6 +160,9 @@ class ClusterFuzz(utils.BinaryenTestCase):
         seen_sizes = []
         seen_exports = []
 
+        # Second wasm files are also emitted sometimes.
+        seen_second_sizes = []
+
         # The number of struct.news appears in the metrics report like this:
         #
         # StructNew      : 18
@@ -179,23 +183,16 @@ class ClusterFuzz(utils.BinaryenTestCase):
             with open(flags_file) as f:
                 self.assertEqual(f.read(), '--wasm-staging')
 
-            # The fuzz files begin with
-            #
-            #   var binary = new Uint8Array([..binary data as numbers..]);
-            #
-            with open(fuzz_file) as f:
-                first_line = f.readline().strip()
-                start = 'var binary = new Uint8Array(['
-                end = ']);'
-                self.assertTrue(first_line.startswith(start))
-                self.assertTrue(first_line.endswith(end))
-                numbers = first_line[len(start):-len(end)]
-
-            # Convert to binary, and see that it is a valid file.
-            numbers_array = [int(x) for x in numbers.split(',')]
-            binary_file = os.path.join(temp_dir.name, 'file.wasm')
-            with open(binary_file, 'wb') as f:
-                f.write(bytes(numbers_array))
+            # Extract the wasm file(s) from the JS. Make sure to not notice
+            # stale files.
+            for f in glob.glob('extracted*'):
+                os.unlink(f)
+            extractor = shared.in_binaryen('scripts', 'clusterfuzz', 'extract_wasms.py')
+            subprocess.check_call([sys.executable, extractor, fuzz_file, 'extracted'])
+
+            # One wasm file must always exist, and must be valid.
+            binary_file = 'extracted.0.wasm'
+            assert os.path.exists(binary_file)
             metrics = subprocess.check_output(
                 shared.WASM_OPT + ['-all', '--metrics', binary_file, '-q'], text=True)
 
@@ -215,6 +212,19 @@ class ClusterFuzz(utils.BinaryenTestCase):
             self.assertEqual(len(exports), 1)
             seen_exports.append(int(exports[0]))
 
+            # Sometimes a second wasm file should exist, and it must be valid
+            # too.
+            second_binary_file = 'extracted.1.wasm'
+            if os.path.exists(second_binary_file):
+                subprocess.check_call(
+                    shared.WASM_OPT + ['-all', second_binary_file, '-q'])
+
+                # Note its size (we leave detailed metrics for the first one;
+                # they are generated by the same logic in run.py, so just
+                # verifying some valid second wasms are emitted, of random
+                # sizes, is enough).
+                seen_second_sizes.append(os.path.getsize(second_binary_file))
+
         print()
 
         print('struct.news are distributed as ~ mean 15, stddev 24, median 10')
@@ -247,10 +257,27 @@ class ClusterFuzz(utils.BinaryenTestCase):
 
         print()
 
+        # Second files appear in ~ 1/3 of testcases.
+        print('number of second wasms should be around 33 +- 8')
+        print(f'number of second wasms: {len(seen_second_sizes)}')
+        assert seen_second_sizes, 'must see at least one second wasm'
+        print('second sizes are distributed as ~ mean 2933, stddev 2011, median 2510')
+        print(f'mean sizes:   {statistics.mean(seen_second_sizes)}')
+        print(f'stdev sizes:  {statistics.stdev(seen_second_sizes)}')
+        print(f'median sizes: {statistics.median(seen_second_sizes)}')
+        # Relax the assert on the max seen second size compared to the max seen
+        # primary size, as we see fewer of these. 500 is still proof of an
+        # interesting wasm file.
+        self.assertGreaterEqual(max(seen_second_sizes), 500)
+        self.assertGreater(statistics.stdev(seen_second_sizes), 0)
+
+        print()
+
         # To check for interesting JS file contents, we'll note how many times
         # we build and run the wasm.
         seen_builds = []
         seen_calls = []
+        seen_second_builds = []
 
         for i in range(1, N + 1):
             fuzz_file = os.path.join(temp_dir.name, f'fuzz-binaryen-{i}.js')
@@ -258,6 +285,7 @@ class ClusterFuzz(utils.BinaryenTestCase):
                 js = f.read()
             seen_builds.append(js.count('build(binary);'))
             seen_calls.append(js.count('callExports();'))
+            seen_second_builds.append(js.count('build(secondBinary);'))
 
         # There is always one build and one call (those are in the default
         # fuzz_shell.js), and we add a couple of operations, each with equal
@@ -284,6 +312,17 @@ class ClusterFuzz(utils.BinaryenTestCase):
 
         print()
 
+        # Second wasm files are more rarely added, only 1/3 of the time or so,
+        # but over 100 samples we are still overwhelmingly likely to see one.
+        print('JS second builds are distributed as ~ mean 1.8, stddev 2.2, median 1')
+        print(f'mean JS second builds:   {statistics.mean(seen_second_builds)}')
+        print(f'stdev JS second builds:  {statistics.stdev(seen_second_builds)}')
+        print(f'median JS second builds: {statistics.median(seen_second_builds)}')
+        self.assertGreaterEqual(max(seen_second_builds), 2)
+        self.assertGreater(statistics.stdev(seen_second_builds), 0)
+
+        print()
+
     # "zzz" in test name so that this runs last. If it runs first, it can be
     # confusing as it appears next to the logging of which bundle we use (see
     # setUpClass).