From b0e999a2b8841d8be21cbcdc84cbc1d6469e36d7 Mon Sep 17 00:00:00 2001
From: Alon Zakai <azakai@google.com>
Date: Tue, 19 Nov 2024 09:28:01 -0800
Subject: Fuzzing: ClusterFuzz integration (#7079)

The main addition here is a bundle_clusterfuzz.py script which will package up
the exact files that should be uploaded to ClusterFuzz. It also documents the
process of bundling and testing. You can do

bundle.py OUTPUT_FILE.tgz

That bundles wasm-opt from ./bin, which is enough for local testing. For
actually uploading to ClusterFuzz, we need a portable build, and @dschuff
had the idea to reuse the emsdk build, which works nicely. Doing

bundle.py OUTPUT_FILE.tgz --build-dir=/path/to/emsdk/upstream/

will bundle wasm-opt (+libs) from the emsdk. I verified that those builds
work on ClusterFuzz.

I added several forms of testing here. First, our main fuzzer fuzz_opt.py now
has a ClusterFuzz testcase handler, which simulates a ClusterFuzz environment.
Second, there are smoke tests that run in the unit test suite, and can also be
run separately:

python -m unittest test/unit/test_cluster_fuzz.py

Those unit tests can also run on a given bundle, e.g. one created from an
emsdk build, for testing right before upload:

BINARYEN_CLUSTER_FUZZ_BUNDLE=/path/to/bundle.tgz python -m unittest test/unit/test_cluster_fuzz.py

A third piece of testing is to add a --fuzz-passes test. That is a mode for
-ttf (translate random data into a valid wasm fuzz testcase) that uses random
data to pick and run a set of passes, to further shape the wasm. (--fuzz-passes
had no previous testing, and this PR fixes it and tidies it up a little, adding some
newer passes too).

Otherwise this PR includes the key run.py script that is bundled and then
executed by ClusterFuzz, basically a python script that runs wasm-opt -ttf [..]
to generate testcases, sets up their JS, and emits them.

fuzz_shell.js, which is the JS to execute testcases, will now check if it is
provided binary data of a wasm file. If so, it does not read a wasm file from
argv[1]. (This is needed because ClusterFuzz expects a single file for the
testcase, so we make a JS file with bundled wasm inside it.)
---
 scripts/bundle_clusterfuzz.py | 135 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)
 create mode 100755 scripts/bundle_clusterfuzz.py

(limited to 'scripts/bundle_clusterfuzz.py')

diff --git a/scripts/bundle_clusterfuzz.py b/scripts/bundle_clusterfuzz.py
new file mode 100755
index 000000000..a03553837
--- /dev/null
+++ b/scripts/bundle_clusterfuzz.py
@@ -0,0 +1,135 @@
+#!/usr/bin/python3
+
+'''
+Bundle files for uploading to ClusterFuzz.
+
+Usage:
+
+bundle_clusterfuzz.py OUTPUT_FILE.tgz [--build-dir=BUILD_DIR]
+
+The output file will be a .tgz file.
+
+If a build directory is provided, we will look under there to find bin/wasm-opt
+and lib/libbinaryen.so. A useful place to get builds from is the Emscripten SDK,
+as you can do
+
+  ./emsdk install tot
+
+after which ./upstream/ (from the emsdk dir) will contain builds of wasm-opt and
+libbinaryen.so (that are designed to run on as many systems as possible, by not
+depending on newer libc symbols, etc., as opposed to a normal local build).
+Thus, the full workflow could be
+
+  cd emsdk
+  ./emsdk install tot
+  cd ../binaryen
+  python3 scripts/bundle_clusterfuzz.py binaryen_wasm_fuzzer.tgz --build-dir=../emsdk/upstream
+
+When using --build-dir in this way, you are responsible for ensuring that the
+wasm-opt in the build dir is compatible with the scripts in the current dir
+(e.g., if run.py here passes a flag that is only in a newer/older version of
+wasm-opt, a problem can happen).
+
+Before uploading to ClusterFuzz, it is worth doing the following:
+
+  1. Run the local fuzzer (scripts/fuzz_opt.py). That includes a ClusterFuzz
+     testcase handler, which simulates what ClusterFuzz does.
+
+  2. Run the unit tests, which include smoke tests for our ClusterFuzz support:
+
+       python -m unittest test/unit/test_cluster_fuzz.py
+
+     Look at the logs, which will contain statistics on the wasm files the
+     fuzzer emits, and see that they look reasonable.
+
+     You should run the unit tests on the bundle you are about to upload, by
+     setting the proper env var like this (using the same filename as above):
+
+       BINARYEN_CLUSTER_FUZZ_BUNDLE=`pwd`/binaryen_wasm_fuzzer.tgz python -m unittest test/unit/test_cluster_fuzz.py
+
+     Note that you must pass an absolute filename (e.g. using pwd as shown).
+
+     The unittest logs should reflect that that bundle is being used at the
+     very start ("Using existing bundle: ..." rather than "Making a new
+     bundle"). Note that some of the unittests also create their own bundles, to
+     test the bundling script itself, so later down you will see logging of
+     bundle creation even if you provide a bundle.
+
+After uploading to ClusterFuzz, you can wait a while for it to run, and then:
+
+  1. Inspect the log to see that we generate all the testcases properly, and
+     their sizes look reasonably random, etc.
+
+  2. Inspect the sample testcase and run it locally, to see that
+
+       d8 --wasm-staging testcase.js
+
+     properly runs the testcase, emitting logging etc.
+
+  3. Check the stats and crashes page (known crashes should at least be showing
+     up). Note that these may take longer to show up than 1 and 2.
+'''
+
+import os
+import sys
+import tarfile
+
+# Read the filenames first, as importing |shared| changes the directory.
+if len(sys.argv) < 2:
+    sys.exit('Usage: bundle_clusterfuzz.py OUTPUT_FILE.tgz [--build-dir=BUILD_DIR]')
+output_file = os.path.abspath(sys.argv[1])
+print(f'Bundling to: {output_file}')
+assert output_file.endswith('.tgz'), 'Can only generate a .tgz'
+
+build_dir = None
+if len(sys.argv) >= 3:
+    assert sys.argv[2].startswith('--build-dir=')
+    # Drop the flag (importing |shared| scans argv); the path may contain '='.
+    build_dir = os.path.abspath(sys.argv.pop(2).split('=', 1)[1])
+
+from test import shared # noqa
+
+# Decide which directories hold wasm-opt and its libraries: start from what
+# |shared| is configured with, and prefer bin/ and lib/ of --build-dir if given.
+binaryen_bin = shared.options.binaryen_bin
+binaryen_lib = shared.options.binaryen_lib
+if build_dir:
+    binaryen_bin = os.path.join(build_dir, 'bin')
+    binaryen_lib = os.path.join(build_dir, 'lib')
+
+with tarfile.open(output_file, "w:gz") as tar:
+    root = shared.options.binaryen_root
+
+    # run.py is stored at the top level of the archive.
+    run = os.path.join(root, 'scripts', 'clusterfuzz', 'run.py')
+    print(f'  .. run:         {run}')
+    tar.add(run, arcname='run.py')
+
+    # The JS shell and the wasm-opt binary keep their scripts/ and bin/ subdirs.
+    fuzz_shell = os.path.join(root, 'scripts', 'fuzz_shell.js')
+    print(f'  .. fuzz_shell:  {fuzz_shell}')
+    tar.add(fuzz_shell, arcname='scripts/fuzz_shell.js')
+    wasm_opt = os.path.join(binaryen_bin, 'wasm-opt')
+    print(f'  .. wasm-opt:    {wasm_opt}')
+    tar.add(wasm_opt, arcname='bin/wasm-opt')
+
+    # A dynamic build also needs libbinaryen (.so or .dylib, depending on the OS).
+    for suffix in ('.so', '.dylib'):
+        libbinaryen = os.path.join(binaryen_lib, f'libbinaryen{suffix}')
+        if not os.path.exists(libbinaryen):
+            continue
+        print(f'  .. libbinaryen: {libbinaryen}')
+        tar.add(libbinaryen, arcname=f'lib/libbinaryen{suffix}')
+
+        # The emsdk build ships more necessary files next to libbinaryen.
+        for extra in (f'libc++{suffix}', f'libc++{suffix}.2', f'libc++{suffix}.2.0'):
+            path = os.path.join(binaryen_lib, extra)
+            if os.path.exists(path):
+                print(f'  ......... : {path}')
+                tar.add(path, arcname=f'lib/{extra}')
+
+# Tell the user how to run the unit tests against the bundle just written.
+test_cmd = f'BINARYEN_CLUSTER_FUZZ_BUNDLE={output_file} python -m unittest test/unit/test_cluster_fuzz.py'
+print('Done.')
+print('To run the tests on this bundle, do:')
+print(f'\n{test_cmd}\n')
-- 
cgit v1.2.3