diff options
-rw-r--r-- | scripts/clusterfuzz/extract_wasms.py | 74
-rwxr-xr-x | scripts/clusterfuzz/run.py | 38
-rw-r--r-- | test/unit/test_cluster_fuzz.py | 73
3 files changed, 158 insertions, 27 deletions
diff --git a/scripts/clusterfuzz/extract_wasms.py b/scripts/clusterfuzz/extract_wasms.py
new file mode 100644
index 000000000..bb727810d
--- /dev/null
+++ b/scripts/clusterfuzz/extract_wasms.py
@@ -0,0 +1,74 @@
+#
+# Copyright 2024 WebAssembly Community Group participants
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Wasm extractor for testcases generated by the ClusterFuzz run.py script. Usage:
+
+extract_wasms.py INFILE.js OUTFILE
+
+That will find embedded wasm files in INFILE.js, of the form
+
+  var .. = new Uint8Array([..wasm_contents..]);
+
+and extract them into OUTFILE.0.wasm, OUTFILE.1.wasm, etc. It also emits
+OUTFILE.js which will no longer contain the embedded contents, after which the
+script can be run as
+
+  d8 OUTFILE.js -- OUTFILE.0.wasm
+
+That is, the embedded file can now be provided as a filename argument.
+'''
+
+import re
+import sys
+
+file_counter = 0
+
+
+def get_wasm_filename():
+    global file_counter
+    file_counter += 1
+    return f'{out}.{file_counter - 1}.wasm'
+
+
+in_js = sys.argv[1]
+out = sys.argv[2]
+
+with open(in_js) as f:
+    js = f.read()
+
+
+def repl(text):
+    # We found something of the form
+    #
+    # var binary = new Uint8Array([..binary data as numbers..]);
+    #
+    # Parse out the numbers into a binary wasm file.
+    numbers = text.groups()[0]
+    numbers = numbers.split(',')
+    numbers = [int(n) for n in numbers]
+    with open(get_wasm_filename(), 'wb') as f:
+        f.write(bytes(numbers))
+
+    # Replace it with nothing.
+    return ''
+
+
+# Replace the wasm files and write them out.
+js = re.sub(r'var \w+ = new Uint8Array\(\[([\d,]+)\]\);', repl, js)
+
+# Write out the new JS.
+with open(f'{out}.js', 'w') as f:
+    f.write(js)
diff --git a/scripts/clusterfuzz/run.py b/scripts/clusterfuzz/run.py
index 6bbb74ef8..8ac880e0d 100755
--- a/scripts/clusterfuzz/run.py
+++ b/scripts/clusterfuzz/run.py
@@ -150,7 +150,18 @@ def get_js_file_contents(i, output_dir):
     # Prepend the wasm contents, so they are used (rather than the normal
     # mechanism where the wasm file's name is provided in argv).
     wasm_contents = get_wasm_contents(i, output_dir)
-    js = f'var binary = {wasm_contents};\n\n' + js
+    pre = f'var binary = {wasm_contents};\n'
+    bytes = wasm_contents.count(',')
+
+    # Sometimes add a second wasm file as well.
+    has_second = False
+    if system_random.random() < 0.333:
+        has_second = True
+        wasm_contents = get_wasm_contents(i, output_dir)
+        pre += f'var secondBinary = {wasm_contents};\n'
+        bytes += wasm_contents.count(',')
+
+    js = pre + '\n' + js
 
     # The default JS builds and runs the wasm. Append some random additional
     # operations as well, as more compiles and executions can find things. To
@@ -171,16 +182,23 @@ def get_js_file_contents(i, output_dir):
     x = math.pow(x, power)
     num = math.floor(x * MAX_EXTRA_JS_OPERATIONS)
     assert num >= 0 and num <= MAX_EXTRA_JS_OPERATIONS
+
+    extra_js_operations = [
+        # Compile and link the wasm again. Each link adds more to the total
+        # exports that we can call.
+        'build(binary);\n',
+        # Run all the exports we've accumulated.
+        'callExports();\n',
+    ]
+    if has_second:
+        extra_js_operations += [
+            'build(secondBinary);\n',
+        ]
+
     for i in range(num):
-        js += system_random.choice([
-            # Compile and link the wasm again. Each link adds more to the total
-            # exports that we can call.
-            'build(binary);\n',
-            # Run all the exports we've accumulated.
-            'callExports();\n',
-        ])
-
-    print(f'Created {wasm_contents.count(",")} wasm bytes')
+        js += system_random.choice(extra_js_operations)
+
+    print(f'Created {bytes} wasm bytes')
 
     return js
 
diff --git a/test/unit/test_cluster_fuzz.py b/test/unit/test_cluster_fuzz.py
index 387f65fd1..56250d46a 100644
--- a/test/unit/test_cluster_fuzz.py
+++ b/test/unit/test_cluster_fuzz.py
@@ -1,3 +1,4 @@
+import glob
 import os
 import platform
 import re
@@ -159,6 +160,9 @@ class ClusterFuzz(utils.BinaryenTestCase):
         seen_sizes = []
         seen_exports = []
 
+        # Second wasm files are also emitted sometimes.
+        seen_second_sizes = []
+
         # The number of struct.news appears in the metrics report like this:
         #
         # StructNew : 18
@@ -179,23 +183,16 @@ class ClusterFuzz(utils.BinaryenTestCase):
             with open(flags_file) as f:
                 self.assertEqual(f.read(), '--wasm-staging')
 
-            # The fuzz files begin with
-            #
-            # var binary = new Uint8Array([..binary data as numbers..]);
-            #
-            with open(fuzz_file) as f:
-                first_line = f.readline().strip()
-                start = 'var binary = new Uint8Array(['
-                end = ']);'
-                self.assertTrue(first_line.startswith(start))
-                self.assertTrue(first_line.endswith(end))
-                numbers = first_line[len(start):-len(end)]
-
-            # Convert to binary, and see that it is a valid file.
-            numbers_array = [int(x) for x in numbers.split(',')]
-            binary_file = os.path.join(temp_dir.name, 'file.wasm')
-            with open(binary_file, 'wb') as f:
-                f.write(bytes(numbers_array))
+            # Extract the wasm file(s) from the JS. Make sure to not notice
+            # stale files.
+            for f in glob.glob('extracted*'):
+                os.unlink(f)
+            extractor = shared.in_binaryen('scripts', 'clusterfuzz', 'extract_wasms.py')
+            subprocess.check_call([sys.executable, extractor, fuzz_file, 'extracted'])
+
+            # One wasm file must always exist, and must be valid.
+            binary_file = 'extracted.0.wasm'
+            assert os.path.exists(binary_file)
 
             metrics = subprocess.check_output(
                 shared.WASM_OPT + ['-all', '--metrics', binary_file, '-q'], text=True)
@@ -215,6 +212,19 @@ class ClusterFuzz(utils.BinaryenTestCase):
             self.assertEqual(len(exports), 1)
             seen_exports.append(int(exports[0]))
 
+            # Sometimes a second wasm file should exist, and it must be valid
+            # too.
+            second_binary_file = 'extracted.1.wasm'
+            if os.path.exists(second_binary_file):
+                subprocess.check_call(
+                    shared.WASM_OPT + ['-all', second_binary_file, '-q'])
+
+                # Note its size (we leave detailed metrics for the first one;
+                # they are generated by the same logic in run.py, so just
+                # verifying some valid second wasms are emitted, of random
+                # sizes, is enough).
+                seen_second_sizes.append(os.path.getsize(second_binary_file))
+
         print()
 
         print('struct.news are distributed as ~ mean 15, stddev 24, median 10')
@@ -247,10 +257,27 @@ class ClusterFuzz(utils.BinaryenTestCase):
 
         print()
 
+        # Second files appear in ~ 1/3 of testcases.
+        print('number of second wasms should be around 33 +- 8')
+        print(f'number of second wasms: {len(seen_second_sizes)}')
+        assert seen_second_sizes, 'must see at least one second wasm'
+        print('second sizes are distributed as ~ mean 2933, stddev 2011, median 2510')
+        print(f'mean sizes: {statistics.mean(seen_second_sizes)}')
+        print(f'stdev sizes: {statistics.stdev(seen_second_sizes)}')
+        print(f'median sizes: {statistics.median(seen_second_sizes)}')
+        # Relax the assert on the max seen second size compared to the max seen
+        # primary size, as we see fewer of these. 500 is still proof of an
+        # interesting wasm file.
+        self.assertGreaterEqual(max(seen_second_sizes), 500)
+        self.assertGreater(statistics.stdev(seen_second_sizes), 0)
+
+        print()
+
         # To check for interesting JS file contents, we'll note how many times
         # we build and run the wasm.
         seen_builds = []
         seen_calls = []
+        seen_second_builds = []
 
         for i in range(1, N + 1):
             fuzz_file = os.path.join(temp_dir.name, f'fuzz-binaryen-{i}.js')
@@ -258,6 +285,7 @@ class ClusterFuzz(utils.BinaryenTestCase):
                 js = f.read()
             seen_builds.append(js.count('build(binary);'))
             seen_calls.append(js.count('callExports();'))
+            seen_second_builds.append(js.count('build(secondBinary);'))
 
         # There is always one build and one call (those are in the default
         # fuzz_shell.js), and we add a couple of operations, each with equal
@@ -284,6 +312,17 @@ class ClusterFuzz(utils.BinaryenTestCase):
 
         print()
 
+        # Second wasm files are more rarely added, only 1/3 of the time or so,
+        # but over 100 samples we are still overwhelmingly likely to see one.
+        print('JS second builds are distributed as ~ mean 1.8, stddev 2.2, median 1')
+        print(f'mean JS second builds: {statistics.mean(seen_second_builds)}')
+        print(f'stdev JS second builds: {statistics.stdev(seen_second_builds)}')
+        print(f'median JS second builds: {statistics.median(seen_second_builds)}')
+        self.assertGreaterEqual(max(seen_second_builds), 2)
+        self.assertGreater(statistics.stdev(seen_second_builds), 0)
+
+        print()
+
     # "zzz" in test name so that this runs last. If it runs first, it can be
     # confusing as it appears next to the logging of which bundle we use (see
     # setUpClass).