summaryrefslogtreecommitdiff
path: root/scripts/benchmarking
diff options
context:
space:
mode:
authorAlon Zakai <azakai@google.com>2024-07-25 11:16:45 -0700
committerGitHub <noreply@github.com>2024-07-25 11:16:45 -0700
commit9cc1cb1ca66e89cbe7b7b5b52897f3bee3ee422c (patch)
tree632a9c57123fc25374d981acb2cb40600e3ea4f0 /scripts/benchmarking
parentd903dd30f6426b8eb07605cae01baf4158364e2d (diff)
downloadbinaryen-9cc1cb1ca66e89cbe7b7b5b52897f3bee3ee422c.tar.gz
binaryen-9cc1cb1ca66e89cbe7b7b5b52897f3bee3ee422c.tar.bz2
binaryen-9cc1cb1ca66e89cbe7b7b5b52897f3bee3ee422c.zip
Cost analysis: Remove "Unacceptable" hack (#6782)
We marked various expressions as having cost "Unacceptable", fixed at 100, to ensure we never moved them out from an If arm, etc. Giving them such a high cost avoids that problem - the cost is higher than the limit we have for moving code from conditional to unconditional execution - but it also means the total cost is unrealistic. For example, a function with one such instruction + an add (cost 1) would end up with cost 101, and removing the add would look insignificant, which causes issues for things that want to compare costs (like Monomorphization). To fix this, adjust some costs. The main change here is to give casts a cost of 5. I measured this in depth, see the attached benchmark scripts, and it looks clear that in both V8 and SpiderMonkey the cost of a cast is high enough to make it not worth turning an if with ref.test arm into a select (which would always execute the test). Other costs adjusted here matter a lot less, because they are on operations that have side effects and so the optimizer will anyhow not move them from conditional to unconditional execution, but I tried to make them a bit more realistic while I was removing "Unacceptable": * Give most atomic operations the 10 cost we've been using for atomic loads/ stores. Perhaps wait and notify should be slower, however, but it seems like assuming fast switching might be more relevant. * Give growth operations a cost of 20, and throw operations a cost of 10. These numbers are entirely made up as I am not even sure how to measure them in a useful way (but, again, this should not matter much as they have side effects).
Diffstat (limited to 'scripts/benchmarking')
-rw-r--r--scripts/benchmarking/bench.js159
-rw-r--r--scripts/benchmarking/bench.wat283
2 files changed, 442 insertions, 0 deletions
diff --git a/scripts/benchmarking/bench.js b/scripts/benchmarking/bench.js
new file mode 100644
index 000000000..d1284a241
--- /dev/null
+++ b/scripts/benchmarking/bench.js
@@ -0,0 +1,159 @@
+
+// Benchmarking script. This runs on compiled bench.wat and prints out timings.
+//
+// Usage:
+//
+// * wasm-opt scripts/benchmarking/bench.wat -all --inline-functions-with-loops --always-inline-max-function-size=1000 --inlining --precompute-propagate --optimize-instructions --inlining --simplify-locals --coalesce-locals --vacuum --remove-unused-module-elements -o bench.wasm -g
+// * Inspect the optimized wasm to see that inlining etc. worked properly
+// (we rely on inlining to let us write bench.wat in a short/simple form, and
+// we use very specific optimizations in order to not optimize away the
+// differences we care about).
+// * d8 bench.js -- bench.wasm
+// etc.
+//
+
+// Shell integration.
+if (typeof console === 'undefined') {
+ console = { log: print };
+}
+var tempRet0;
+var binary;
+if (typeof process === 'object' && typeof require === 'function' /* node.js detection */) {
+ var args = process.argv.slice(2);
+ binary = require('fs').readFileSync(args[0]);
+ if (!binary.buffer) binary = new Uint8Array(binary);
+} else {
+ var args;
+ if (typeof scriptArgs != 'undefined') {
+ args = scriptArgs;
+ } else if (typeof arguments != 'undefined') {
+ args = arguments;
+ }
+ if (typeof readbuffer === 'function') {
+ binary = new Uint8Array(readbuffer(args[0]));
+ } else {
+ binary = read(args[0], 'binary');
+ }
+}
+
+// Create the wasm.
+const module = new WebAssembly.Module(binary);
+const instance = new WebAssembly.Instance(module, {});
+const exports = instance.exports;
+
+// Create the benchmarkers.
+function makeBenchmarker(name) {
+ return {
+ name: name,
+ func: exports[name],
+ time: 0,
+ sum: 0,
+ iters: 0,
+ };
+}
+
+const benchmarkers = [
+ makeBenchmarker('len'),
+ makeBenchmarker('and'),
+ makeBenchmarker('iff-both'),
+ makeBenchmarker('or'),
+ makeBenchmarker('iff-either'),
+ makeBenchmarker('select'),
+ makeBenchmarker('iff-nextor'),
+ makeBenchmarker('select-three'),
+ makeBenchmarker('iff-three'),
+];
+
+// We'll call the benchmark functions in random orders. Random orders avoid any
+// interaction between the benchmarks from causing bias in the results.
+//
+// An alternative to randomly ordering the benchmarks in each iteration would be
+// to fully benchmark one, then do the next, so there are large amounts of time
+// between them, but that also allows them to become more like microbenchmarks
+// where the branch predictor etc. might display very favorable behavior.
+// Interleaving them makes things slightly more realistic.
+//
+// If we have too many benchmarks then eventually computing all orders ahead of
+// time will not work, but so long as we can, it is faster this way rather than
+// to compute random orders on the fly as we go.
+function makeOrders(prefix) {
+ // Given a prefix of an order, like [] or [0, 3], return all the possible
+ // orders beginning with that prefix.
+
+ // We cannot repeat anything already seen.
+ const seen = new Set();
+ for (var x of prefix) {
+ seen.add(x);
+ }
+
+ // Starting from the prefix, extend it by one item in all valid ways.
+ const extensions = [];
+ for (var i = 0; i < benchmarkers.length; i++) {
+ if (!seen.has(i)) {
+ extensions.push(prefix.concat(i));
+ }
+ }
+
+ if (prefix.length == benchmarkers.length - 1) {
+ // The extensions are complete orders; stop the recursion.
+ return extensions;
+ }
+
+ // Recursively generate the full orders.
+ const ret = [];
+ for (var extension of extensions) {
+ for (var order of makeOrders(extension)) {
+ ret.push(order);
+ }
+ }
+ return ret;
+}
+
+const orders = makeOrders([]);
+
+// Params.
+const M = 10000000;
+const N = 100;
+
+console.log('iters :', M);
+console.log('list len :', N);
+console.log('benchmarkers:', benchmarkers.length);
+console.log('orderings :', orders.length);
+
+// Create a long linked list of objects of both type $A and $B.
+var list = null;
+for (var i = 0; i < N; i++) {
+ list = Math.random() < 0.5 ? exports.makeA(list) : exports.makeB(list);
+}
+
+console.log('benchmarking...');
+
+// Call the benchmark functions.
+
+for (var i = 0; i < M; i++) {
+ const order = orders[Math.floor(Math.random() * orders.length)];
+ for (var k = 0; k < benchmarkers.length; k++) {
+ const benchmarker = benchmarkers[order[k]];
+ const start = performance.now();
+ const result = benchmarker.func(list);
+ benchmarker.time += performance.now() - start;
+ benchmarker.sum += result;
+ benchmarker.iters++;
+ }
+}
+
+for (var benchmarker of benchmarkers) {
+ if (benchmarker.iters != M) {
+ throw 'wat';
+ }
+}
+
+console.log();
+for (var benchmarker of benchmarkers) {
+ console.log(`${benchmarker.name} time: \t${benchmarker.time}`)
+}
+console.log();
+for (var benchmarker of benchmarkers) {
+ console.log(`${benchmarker.name} mean sum: \t${benchmarker.sum / M}`)
+}
+
diff --git a/scripts/benchmarking/bench.wat b/scripts/benchmarking/bench.wat
new file mode 100644
index 000000000..87d54b66b
--- /dev/null
+++ b/scripts/benchmarking/bench.wat
@@ -0,0 +1,283 @@
+;; See bench.js.
+
+(module
+ ;; A chain of three types. Each type has a "next" field, so we can form linked
+ ;; lists.
+ (type $A (sub (struct (field $next (ref null $A)))))
+ (type $B (sub $A (struct (field $next (ref null $A)))))
+ (type $C (sub $B (struct (field $next (ref null $A)))))
+
+ (type $func (func (param (ref $A)) (result i32)))
+
+ ;; Internal helper to iterate over a linked list and call a function on each
+ ;; item, and return the sum of those calls' results.
+ (func $iter (param $list (ref null $A)) (param $func (ref $func)) (result i32)
+ (local $sum i32)
+ (loop $loop
+ (if
+ (ref.is_null
+ (local.get $list)
+ )
+ (then
+ (return
+ (local.get $sum)
+ )
+ )
+ (else
+ (local.set $sum
+ (i32.add
+ (local.get $sum)
+ (call_ref $func
+ (ref.as_non_null
+ (local.get $list)
+ )
+ (local.get $func)
+ )
+ )
+ )
+ (local.set $list
+ (struct.get $A $next
+ (local.get $list)
+ )
+ )
+ (br $loop)
+ )
+ )
+ )
+ )
+
+ ;; Using the helper, and depending on inlining to optimize this, lets us
+ ;; write the exports concisely. First, code to compute the length of the list
+ ;; (for comparison purposes).
+ (func $len (export "len") (param $list (ref $A)) (result i32)
+ (call $iter
+ (local.get $list)
+ ;; Add one each time this is called.
+ (ref.func $one)
+ )
+ )
+ (func $one (param $list (ref $A)) (result i32)
+ (i32.const 1)
+ )
+
+ ;; At each point in the linked list, check if both the current and next item
+ ;; are inputs are $B, using an if to short-circuit when possible.
+ (func $iff-both (export "iff-both") (param $list (ref $A)) (result i32)
+ (call $iter
+ (local.get $list)
+ (ref.func $do-iff-both)
+ )
+ )
+ (func $do-iff-both (param $list (ref $A)) (result i32)
+ (if (result i32)
+ (ref.test (ref $B)
+ (struct.get $A $next
+ (local.get $list)
+ )
+ )
+ (then
+ (ref.test (ref $B)
+ (local.get $list)
+ )
+ )
+ (else
+ (i32.const 0)
+ )
+ )
+ )
+
+ ;; The same computation, but using an and, so both tests always execute.
+ (func $and (export "and") (param $list (ref $A)) (result i32)
+ (call $iter
+ (local.get $list)
+ (ref.func $do-and)
+ )
+ )
+ (func $do-and (param $list (ref $A)) (result i32)
+ (i32.and
+ (ref.test (ref $B)
+ (struct.get $A $next
+ (local.get $list)
+ )
+ )
+ (ref.test (ref $B)
+ (local.get $list)
+ )
+ )
+ )
+
+ ;; Similar, but return 1 if either test succeeds (using an if).
+ (func $iff-either (export "iff-either") (param $list (ref $A)) (result i32)
+ (call $iter
+ (local.get $list)
+ (ref.func $do-iff-either)
+ )
+ )
+ (func $do-iff-either (param $list (ref $A)) (result i32)
+ (if (result i32)
+ (ref.test (ref $B)
+ (struct.get $A $next
+ (local.get $list)
+ )
+ )
+ (then
+ (i32.const 1)
+ )
+ (else
+ (ref.test (ref $B)
+ (local.get $list)
+ )
+ )
+ )
+ )
+
+
+ ;; The same computation, but using an or, so both tests always execute.
+ (func $or (export "or") (param $list (ref $A)) (result i32)
+ (call $iter
+ (local.get $list)
+ (ref.func $do-or)
+ )
+ )
+ (func $do-or (param $list (ref $A)) (result i32)
+ (i32.or
+ (ref.test (ref $B)
+ (struct.get $A $next
+ (local.get $list)
+ )
+ )
+ (ref.test (ref $B)
+ (local.get $list)
+ )
+ )
+ )
+
+ ;; Use a select to do a test of "is next null ? 0 : test curr".
+ (func $select (export "select") (param $list (ref $A)) (result i32)
+ (call $iter
+ (local.get $list)
+ (ref.func $do-select)
+ )
+ )
+ (func $do-select (param $list (ref $A)) (result i32)
+ (select
+ (i32.const 0)
+ (ref.test (ref $B)
+ (local.get $list)
+ )
+ (ref.is_null
+ (struct.get $A $next
+ (local.get $list)
+ )
+ )
+ )
+ )
+
+ ;; Use an iff to do the same.
+ (func $iff-nextor (export "iff-nextor") (param $list (ref $A)) (result i32)
+ (call $iter
+ (local.get $list)
+ (ref.func $do-iff-nextor)
+ )
+ )
+ (func $do-iff-nextor (param $list (ref $A)) (result i32)
+ (if (result i32)
+ (ref.is_null
+ (struct.get $A $next
+ (local.get $list)
+ )
+ )
+ (then
+ (i32.const 0)
+ )
+ (else
+ (ref.test (ref $B)
+ (local.get $list)
+ )
+ )
+ )
+ )
+
+ ;; Use an if over three tests: "test if next is B or C depending on if curr is
+ ;; B."
+ (func $iff-three (export "iff-three") (param $list (ref $A)) (result i32)
+ (call $iter
+ (local.get $list)
+ (ref.func $do-iff-three)
+ )
+ )
+ (func $do-iff-three (param $list (ref $A)) (result i32)
+ (local $next (ref null $A))
+ (local.set $next
+ (struct.get $A $next
+ (local.get $list)
+ )
+ )
+ (if (result i32)
+ (ref.test (ref $B)
+ (local.get $list)
+ )
+ (then
+ (ref.test (ref $C)
+ (local.get $next)
+ )
+ )
+ (else
+ (ref.test (ref $B)
+ (local.get $next)
+ )
+ )
+ )
+ )
+
+ ;; Use a select for the same.
+ (func $select-three (export "select-three") (param $list (ref $A)) (result i32)
+ (call $iter
+ (local.get $list)
+ (ref.func $do-select-three)
+ )
+ )
+ (func $do-select-three (param $list (ref $A)) (result i32)
+ (local $next (ref null $A))
+ (local.set $next
+ (struct.get $A $next
+ (local.get $list)
+ )
+ )
+ (select
+ (ref.test (ref $C)
+ (local.get $next)
+ )
+ (ref.test (ref $B)
+ (local.get $next)
+ )
+ (ref.test (ref $B)
+ (local.get $list)
+ )
+ )
+ )
+
+ ;; Creation functions.
+
+ (func $makeA (export "makeA") (param $next (ref null $A)) (result anyref)
+ (struct.new $A
+ (local.get $next)
+ )
+ )
+
+ (func $makeB (export "makeB") (param $next (ref null $A)) (result anyref)
+ (struct.new $B
+ (local.get $next)
+ )
+ )
+
+ (func $makeC (export "makeC") (param $next (ref null $A)) (result anyref)
+ ;; This function is not used in benchmarks yet, but it keeps the type $C
+ ;; alive, which prevents $B from looking like it could be final, which might
+ ;; allow the optimizer to simplify more than we want.
+ (struct.new $C
+ (local.get $next)
+ )
+ )
+)
+