summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- scripts/clusterfuzz/extract_wasms.py 74
-rwxr-xr-x scripts/clusterfuzz/run.py 38
-rw-r--r-- test/unit/test_cluster_fuzz.py 73
3 files changed, 158 insertions, 27 deletions
diff --git a/scripts/clusterfuzz/extract_wasms.py b/scripts/clusterfuzz/extract_wasms.py
new file mode 100644
index 000000000..bb727810d
--- /dev/null
+++ b/scripts/clusterfuzz/extract_wasms.py
@@ -0,0 +1,74 @@
+#
+# Copyright 2024 WebAssembly Community Group participants
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Wasm extractor for testcases generated by the ClusterFuzz run.py script. Usage:
+
+extract_wasms.py INFILE.js OUTFILE
+
+That will find embedded wasm files in INFILE.js, of the form
+
+ var .. = new Uint8Array([..wasm_contents..]);
+
+and extract them into OUTFILE.0.wasm, OUTFILE.1.wasm, etc. It also emits
+OUTFILE.js, which no longer contains the embedded contents. That script can
+then be run as
+
+ d8 OUTFILE.js -- OUTFILE.0.wasm
+
+That is, the embedded file can now be provided as a filename argument.
+'''
+
+import re
+import sys
+
+file_counter = 0
+
+
+def get_wasm_filename():
+ global file_counter
+ file_counter += 1
+ return f'{out}.{file_counter - 1}.wasm'
+
+
+in_js = sys.argv[1]
+out = sys.argv[2]
+
+with open(in_js) as f:
+ js = f.read()
+
+
+def repl(text):
+ # We found something of the form
+ #
+ # var binary = new Uint8Array([..binary data as numbers..]);
+ #
+ # Parse out the numbers into a binary wasm file.
+ numbers = text.groups()[0]
+ numbers = numbers.split(',')
+ numbers = [int(n) for n in numbers]
+ with open(get_wasm_filename(), 'wb') as f:
+ f.write(bytes(numbers))
+
+ # Replace it with nothing.
+ return ''
+
+
+# Strip the embedded wasm arrays from the JS, writing each one out as a file.
+js = re.sub(r'var \w+ = new Uint8Array\(\[([\d,]+)\]\);', repl, js)
+
+# Write out the new JS.
+with open(f'{out}.js', 'w') as f:
+ f.write(js)
diff --git a/scripts/clusterfuzz/run.py b/scripts/clusterfuzz/run.py
index 6bbb74ef8..8ac880e0d 100755
--- a/scripts/clusterfuzz/run.py
+++ b/scripts/clusterfuzz/run.py
@@ -150,7 +150,18 @@ def get_js_file_contents(i, output_dir):
# Prepend the wasm contents, so they are used (rather than the normal
# mechanism where the wasm file's name is provided in argv).
wasm_contents = get_wasm_contents(i, output_dir)
- js = f'var binary = {wasm_contents};\n\n' + js
+ pre = f'var binary = {wasm_contents};\n'
+ bytes = wasm_contents.count(',')
+
+ # Sometimes add a second wasm file as well.
+ has_second = False
+ if system_random.random() < 0.333:
+ has_second = True
+ wasm_contents = get_wasm_contents(i, output_dir)
+ pre += f'var secondBinary = {wasm_contents};\n'
+ bytes += wasm_contents.count(',')
+
+ js = pre + '\n' + js
# The default JS builds and runs the wasm. Append some random additional
# operations as well, as more compiles and executions can find things. To
@@ -171,16 +182,23 @@ def get_js_file_contents(i, output_dir):
x = math.pow(x, power)
num = math.floor(x * MAX_EXTRA_JS_OPERATIONS)
assert num >= 0 and num <= MAX_EXTRA_JS_OPERATIONS
+
+ extra_js_operations = [
+ # Compile and link the wasm again. Each link adds more to the total
+ # exports that we can call.
+ 'build(binary);\n',
+ # Run all the exports we've accumulated.
+ 'callExports();\n',
+ ]
+ if has_second:
+ extra_js_operations += [
+ 'build(secondBinary);\n',
+ ]
+
for i in range(num):
- js += system_random.choice([
- # Compile and link the wasm again. Each link adds more to the total
- # exports that we can call.
- 'build(binary);\n',
- # Run all the exports we've accumulated.
- 'callExports();\n',
- ])
-
- print(f'Created {wasm_contents.count(",")} wasm bytes')
+ js += system_random.choice(extra_js_operations)
+
+ print(f'Created {bytes} wasm bytes')
return js
diff --git a/test/unit/test_cluster_fuzz.py b/test/unit/test_cluster_fuzz.py
index 387f65fd1..56250d46a 100644
--- a/test/unit/test_cluster_fuzz.py
+++ b/test/unit/test_cluster_fuzz.py
@@ -1,3 +1,4 @@
+import glob
import os
import platform
import re
@@ -159,6 +160,9 @@ class ClusterFuzz(utils.BinaryenTestCase):
seen_sizes = []
seen_exports = []
+ # Second wasm files are also emitted sometimes.
+ seen_second_sizes = []
+
# The number of struct.news appears in the metrics report like this:
#
# StructNew : 18
@@ -179,23 +183,16 @@ class ClusterFuzz(utils.BinaryenTestCase):
with open(flags_file) as f:
self.assertEqual(f.read(), '--wasm-staging')
- # The fuzz files begin with
- #
- # var binary = new Uint8Array([..binary data as numbers..]);
- #
- with open(fuzz_file) as f:
- first_line = f.readline().strip()
- start = 'var binary = new Uint8Array(['
- end = ']);'
- self.assertTrue(first_line.startswith(start))
- self.assertTrue(first_line.endswith(end))
- numbers = first_line[len(start):-len(end)]
-
- # Convert to binary, and see that it is a valid file.
- numbers_array = [int(x) for x in numbers.split(',')]
- binary_file = os.path.join(temp_dir.name, 'file.wasm')
- with open(binary_file, 'wb') as f:
- f.write(bytes(numbers_array))
+ # Extract the wasm file(s) from the JS. First remove stale files from
+ # earlier iterations, so we do not mistake them for fresh output.
+ for f in glob.glob('extracted*'):
+ os.unlink(f)
+ extractor = shared.in_binaryen('scripts', 'clusterfuzz', 'extract_wasms.py')
+ subprocess.check_call([sys.executable, extractor, fuzz_file, 'extracted'])
+
+ # One wasm file must always exist, and must be valid.
+ binary_file = 'extracted.0.wasm'
+ assert os.path.exists(binary_file)
metrics = subprocess.check_output(
shared.WASM_OPT + ['-all', '--metrics', binary_file, '-q'], text=True)
@@ -215,6 +212,19 @@ class ClusterFuzz(utils.BinaryenTestCase):
self.assertEqual(len(exports), 1)
seen_exports.append(int(exports[0]))
+ # Sometimes a second wasm file should exist, and it must be valid
+ # too.
+ second_binary_file = 'extracted.1.wasm'
+ if os.path.exists(second_binary_file):
+ subprocess.check_call(
+ shared.WASM_OPT + ['-all', second_binary_file, '-q'])
+
+ # Note its size. We gather detailed metrics only for the first
+ # wasm: both are generated by the same logic in run.py, so it is
+ # enough to verify that valid second wasms, of varying sizes,
+ # are emitted.
+ seen_second_sizes.append(os.path.getsize(second_binary_file))
+
print()
print('struct.news are distributed as ~ mean 15, stddev 24, median 10')
@@ -247,10 +257,27 @@ class ClusterFuzz(utils.BinaryenTestCase):
print()
+ # Second files appear in ~ 1/3 of testcases.
+ print('number of second wasms should be around 33 +- 8')
+ print(f'number of second wasms: {len(seen_second_sizes)}')
+ assert seen_second_sizes, 'must see at least one second wasm'
+ print('second sizes are distributed as ~ mean 2933, stddev 2011, median 2510')
+ print(f'mean sizes: {statistics.mean(seen_second_sizes)}')
+ print(f'stdev sizes: {statistics.stdev(seen_second_sizes)}')
+ print(f'median sizes: {statistics.median(seen_second_sizes)}')
+ # Relax the assert on the max seen second size compared to the max seen
+ # primary size, as we see fewer of these. 500 is still proof of an
+ # interesting wasm file.
+ self.assertGreaterEqual(max(seen_second_sizes), 500)
+ self.assertGreater(statistics.stdev(seen_second_sizes), 0)
+
+ print()
+
# To check for interesting JS file contents, we'll note how many times
# we build and run the wasm.
seen_builds = []
seen_calls = []
+ seen_second_builds = []
for i in range(1, N + 1):
fuzz_file = os.path.join(temp_dir.name, f'fuzz-binaryen-{i}.js')
@@ -258,6 +285,7 @@ class ClusterFuzz(utils.BinaryenTestCase):
js = f.read()
seen_builds.append(js.count('build(binary);'))
seen_calls.append(js.count('callExports();'))
+ seen_second_builds.append(js.count('build(secondBinary);'))
# There is always one build and one call (those are in the default
# fuzz_shell.js), and we add a couple of operations, each with equal
@@ -284,6 +312,17 @@ class ClusterFuzz(utils.BinaryenTestCase):
print()
+ # Second wasm files are more rarely added, only 1/3 of the time or so,
+ # but over 100 samples we are still overwhelmingly likely to see one.
+ print('JS second builds are distributed as ~ mean 1.8, stddev 2.2, median 1')
+ print(f'mean JS second builds: {statistics.mean(seen_second_builds)}')
+ print(f'stdev JS second builds: {statistics.stdev(seen_second_builds)}')
+ print(f'median JS second builds: {statistics.median(seen_second_builds)}')
+ self.assertGreaterEqual(max(seen_second_builds), 2)
+ self.assertGreater(statistics.stdev(seen_second_builds), 0)
+
+ print()
+
# "zzz" in test name so that this runs last. If it runs first, it can be
# confusing as it appears next to the logging of which bundle we use (see
# setUpClass).