diff options
author | Alon Zakai <azakai@google.com> | 2023-10-03 16:39:12 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-03 16:39:12 -0700 |
commit | b2e096d79c36daa2cbfb7dc3db31af76e9f45cc8 (patch) | |
tree | 52f962128c9fe6c1b8cc8b847e5af40f33117c40 | |
parent | 24779b2a3fe5e5c7cc6b1da3661d346cd9c129ae (diff) | |
download | binaryen-b2e096d79c36daa2cbfb7dc3db31af76e9f45cc8.tar.gz binaryen-b2e096d79c36daa2cbfb7dc3db31af76e9f45cc8.tar.bz2 binaryen-b2e096d79c36daa2cbfb7dc3db31af76e9f45cc8.zip |
RemoveUnusedBrs: Allow less unconditional work and in particular division (#5989)
Fixes #5983: The testcase from there is used here in a new testcase
remove-unused-brs_levels in which we check if we are willing to unconditionally
do a division operation. Turning an if with an arm that does a division into a
select, which always does the division, is almost 5x slower, so we should probably
be extremely careful about doing that.
I took some measurements and have some suggestions for changes in this PR:
* Raise the cost of div/rem to what I measure on my machine, which is 5x slower
than an add, or worse.
* For some reason we added the if arms rather than take the max of them, so
fix that. This does not help the issue, but was confusing.
* Adjust TooCostlyToRunUnconditionally in the pass from 9 to 8 (this helps
balance the last point).
* Use half that value when not optimizing for size. That is, we allow only 4 extra
unconditional work normally, and 8 in -Os, and when -Oz then we allow any
extra amount.
Aside from the new testcases, some existing ones changed. They all appear to
change in a reasonable way, to me.
We should perhaps go even further than this, and not even run a division
unconditionally in -Os, but I wasn't sure it makes sense to go that far as
other benchmarks may be affected. For now, this makes the benchmark in
#5983 run at full speed in -O3 or -Os, and it remains slow in -Oz. The
modified version of the benchmark that only divides in the if (no other
operations) is still fast in -O3, but it become slow in -Os as we do turn that
if into a select (but again, I didn't want to go that far as to overfit on that one
benchmark).
-rw-r--r-- | src/ir/cost.h | 4 | ||||
-rw-r--r-- | src/passes/RemoveUnusedBrs.cpp | 62 | ||||
-rw-r--r-- | test/lit/passes/remove-unused-brs-gc.wast | 22 | ||||
-rw-r--r-- | test/lit/passes/remove-unused-brs.wast | 10 | ||||
-rw-r--r-- | test/lit/passes/remove-unused-brs_levels.wast | 129 | ||||
-rw-r--r-- | test/passes/remove-unused-brs_enable-multivalue.txt | 7 | ||||
-rw-r--r-- | test/wasm2js/conversions-modified.2asm.js.opt | 22 | ||||
-rw-r--r-- | test/wasm2js/float-ops.2asm.js.opt | 16 |
8 files changed, 223 insertions, 49 deletions
diff --git a/src/ir/cost.h b/src/ir/cost.h index 73d027788..efc50eb1b 100644 --- a/src/ir/cost.h +++ b/src/ir/cost.h @@ -263,7 +263,7 @@ struct CostAnalyzer : public OverriddenVisitor<CostAnalyzer, CostType> { case DivUInt32: case RemSInt32: case RemUInt32: - ret = curr->right->is<Const>() ? 2 : 3; + ret = curr->right->is<Const>() ? 5 : 6; break; case AndInt32: case OrInt32: @@ -284,7 +284,7 @@ struct CostAnalyzer : public OverriddenVisitor<CostAnalyzer, CostType> { case DivUInt64: case RemSInt64: case RemUInt64: - ret = curr->right->is<Const>() ? 3 : 4; + ret = curr->right->is<Const>() ? 7 : 8; break; case AndInt64: case OrInt64: diff --git a/src/passes/RemoveUnusedBrs.cpp b/src/passes/RemoveUnusedBrs.cpp index fca5b7db4..576f2fe91 100644 --- a/src/passes/RemoveUnusedBrs.cpp +++ b/src/passes/RemoveUnusedBrs.cpp @@ -80,38 +80,64 @@ static bool canTurnIfIntoBrIf(Expression* ifCondition, return !EffectAnalyzer(options, wasm, ifCondition).invalidates(value); } -// This leads to similar choices as LLVM does. -// See https://github.com/WebAssembly/binaryen/pull/4228 -// It can be tuned more later. -const Index TooCostlyToRunUnconditionally = 9; +// This leads to similar choices as LLVM does in some cases, by balancing the +// extra work of code that is run unconditionally with the speedup from not +// branching to decide whether to run it or not. +// See: +// * https://github.com/WebAssembly/binaryen/pull/4228 +// * https://github.com/WebAssembly/binaryen/issues/5983 +const Index TooCostlyToRunUnconditionally = 8; static_assert(TooCostlyToRunUnconditionally < CostAnalyzer::Unacceptable, "We never run code unconditionally if it has unacceptable cost"); -// Check if it is not worth it to run code unconditionally. This -// assumes we are trying to run two expressions where previously -// only one of the two might have executed. We assume here that -// executing both is good for code size. static bool tooCostlyToRunUnconditionally(const PassOptions& passOptions, - Expression* one, - Expression* two) { - // If we care mostly about code size, just do it for that reason. - if (passOptions.shrinkLevel) { - return false; + Index cost) { + if (passOptions.shrinkLevel == 0) { + // We are focused on speed. Any extra cost is risky, but allow a small + // amount. + return cost > TooCostlyToRunUnconditionally / 2; + } else if (passOptions.shrinkLevel == 1) { + // We are optimizing for size in a balanced manner. Allow some extra + // overhead here. + return cost >= TooCostlyToRunUnconditionally; + } else { + // We should have already decided what to do if shrink_level=2 and not + // gotten here, and other values are invalid. + WASM_UNREACHABLE("bad shrink level"); } - // Consider the cost of executing all the code unconditionally. - auto total = CostAnalyzer(one).cost + CostAnalyzer(two).cost; - return total >= TooCostlyToRunUnconditionally; } // As above, but a single expression that we are considering moving to a place // where it executes unconditionally. static bool tooCostlyToRunUnconditionally(const PassOptions& passOptions, Expression* curr) { - if (passOptions.shrinkLevel) { + // If we care entirely about code size, just do it for that reason (early + // exit to avoid work). + if (passOptions.shrinkLevel >= 2) { + return false; + } + auto cost = CostAnalyzer(curr).cost; + return tooCostlyToRunUnconditionally(passOptions, cost); +} + +// Check if it is not worth it to run code unconditionally. This +// assumes we are trying to run two expressions where previously +// only one of the two might have executed. We assume here that +// executing both is good for code size. +static bool tooCostlyToRunUnconditionally(const PassOptions& passOptions, + Expression* one, + Expression* two) { + // If we care entirely about code size, just do it for that reason (early + // exit to avoid work). + if (passOptions.shrinkLevel >= 2) { return false; } - return CostAnalyzer(curr).cost >= TooCostlyToRunUnconditionally; + + // Consider the cost of executing all the code unconditionally, which adds + // either the cost of running one or two, so the maximum is the worst case. + auto max = std::max(CostAnalyzer(one).cost, CostAnalyzer(two).cost); + return tooCostlyToRunUnconditionally(passOptions, max); } struct RemoveUnusedBrs : public WalkerPass<PostWalker<RemoveUnusedBrs>> { diff --git a/test/lit/passes/remove-unused-brs-gc.wast b/test/lit/passes/remove-unused-brs-gc.wast index 53100cc91..7a620193e 100644 --- a/test/lit/passes/remove-unused-brs-gc.wast +++ b/test/lit/passes/remove-unused-brs-gc.wast @@ -655,22 +655,20 @@ ;; CHECK-NEXT: ) ;; CHECK-NEXT: ) ;; CHECK-NEXT: (drop - ;; CHECK-NEXT: (select (result (ref null $struct)) - ;; CHECK-NEXT: (block (result (ref null $struct)) - ;; CHECK-NEXT: (block $something (result (ref null $struct)) - ;; CHECK-NEXT: (drop - ;; CHECK-NEXT: (block (result nullref) - ;; CHECK-NEXT: (br_on_non_null $something - ;; CHECK-NEXT: (local.get $struct) - ;; CHECK-NEXT: ) - ;; CHECK-NEXT: (ref.null none) + ;; CHECK-NEXT: (if (result (ref null $struct)) + ;; CHECK-NEXT: (local.get $x) + ;; CHECK-NEXT: (block $something (result (ref null $struct)) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (block (result nullref) + ;; CHECK-NEXT: (br_on_non_null $something + ;; CHECK-NEXT: (local.get $struct) ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (ref.null none) ;; CHECK-NEXT: ) - ;; CHECK-NEXT: (ref.null none) ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (ref.null none) ;; CHECK-NEXT: ) ;; CHECK-NEXT: (ref.null none) - ;; CHECK-NEXT: (local.get $x) ;; CHECK-NEXT: ) ;; CHECK-NEXT: ) ;; CHECK-NEXT: (drop @@ -716,6 +714,8 @@ ) ) ) + ;; We do not selectify here because the amount of work in the if is + ;; significant (there is a cast and a branch). (drop (if (result anyref) (local.get $x) diff --git a/test/lit/passes/remove-unused-brs.wast b/test/lit/passes/remove-unused-brs.wast index 82963f792..93cf4cbd2 100644 --- a/test/lit/passes/remove-unused-brs.wast +++ b/test/lit/passes/remove-unused-brs.wast @@ -31,10 +31,7 @@ ;; CHECK-NEXT: (i32.const 1) ;; CHECK-NEXT: (i32.lt_u ;; CHECK-NEXT: (i32.sub - ;; CHECK-NEXT: (i32.or - ;; CHECK-NEXT: (local.get $0) - ;; CHECK-NEXT: (i32.const 32) - ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (local.get $0) ;; CHECK-NEXT: (i32.const 97) ;; CHECK-NEXT: ) ;; CHECK-NEXT: (i32.const 6) @@ -60,10 +57,7 @@ (i32.const 1) (i32.lt_u (i32.sub - (i32.or - (local.get $0) - (i32.const 32) - ) + (local.get $0) (i32.const 97) ) (i32.const 6) diff --git a/test/lit/passes/remove-unused-brs_levels.wast b/test/lit/passes/remove-unused-brs_levels.wast new file mode 100644 index 000000000..f4737627e --- /dev/null +++ b/test/lit/passes/remove-unused-brs_levels.wast @@ -0,0 +1,129 @@ +;; NOTE: Assertions have been generated by update_lit_checks.py and should not be edited. +;; RUN: wasm-opt %s --remove-unused-brs -all -S --shrink-level=0 -o - | filecheck %s --check-prefix=SHRINK_0 +;; RUN: wasm-opt %s --remove-unused-brs -all -S --shrink-level=1 -o - | filecheck %s --check-prefix=SHRINK_1 +;; RUN: wasm-opt %s --remove-unused-brs -all -S --shrink-level=2 -o - | filecheck %s --check-prefix=SHRINK_2 + + +(module + ;; SHRINK_0: (func $selectify-division (type $0) (param $x i32) (result i32) + ;; SHRINK_0-NEXT: (if (result i32) + ;; SHRINK_0-NEXT: (i32.eq + ;; SHRINK_0-NEXT: (local.get $x) + ;; SHRINK_0-NEXT: (i32.const 53498923) + ;; SHRINK_0-NEXT: ) + ;; SHRINK_0-NEXT: (i32.div_s + ;; SHRINK_0-NEXT: (local.get $x) + ;; SHRINK_0-NEXT: (i32.const 13) + ;; SHRINK_0-NEXT: ) + ;; SHRINK_0-NEXT: (local.get $x) + ;; SHRINK_0-NEXT: ) + ;; SHRINK_0-NEXT: ) + ;; SHRINK_1: (func $selectify-division (type $0) (param $x i32) (result i32) + ;; SHRINK_1-NEXT: (select + ;; SHRINK_1-NEXT: (i32.div_s + ;; SHRINK_1-NEXT: (local.get $x) + ;; SHRINK_1-NEXT: (i32.const 13) + ;; SHRINK_1-NEXT: ) + ;; SHRINK_1-NEXT: (local.get $x) + ;; SHRINK_1-NEXT: (i32.eq + ;; SHRINK_1-NEXT: (local.get $x) + ;; SHRINK_1-NEXT: (i32.const 53498923) + ;; SHRINK_1-NEXT: ) + ;; SHRINK_1-NEXT: ) + ;; SHRINK_1-NEXT: ) + ;; SHRINK_2: (func $selectify-division (type $0) (param $x i32) (result i32) + ;; SHRINK_2-NEXT: (select + ;; SHRINK_2-NEXT: (i32.div_s + ;; SHRINK_2-NEXT: (local.get $x) + ;; SHRINK_2-NEXT: (i32.const 13) + ;; SHRINK_2-NEXT: ) + ;; SHRINK_2-NEXT: (local.get $x) + ;; SHRINK_2-NEXT: (i32.eq + ;; SHRINK_2-NEXT: (local.get $x) + ;; SHRINK_2-NEXT: (i32.const 53498923) + ;; SHRINK_2-NEXT: ) + ;; SHRINK_2-NEXT: ) + ;; SHRINK_2-NEXT: ) + (func $selectify-division (param $x i32) (result i32) + ;; See #5983: this if, if turned into a select, becomes almost 5x slower. + ;; We only want to selectify here when the shrink level is 1 or 2. + (if (result i32) + (i32.eq + (local.get $x) + (i32.const 53498923) + ) + (i32.div_s + (local.get $x) + (i32.const 13) + ) + (local.get $x) + ) + ) + + ;; SHRINK_0: (func $selectify-division2 (type $0) (param $x i32) (result i32) + ;; SHRINK_0-NEXT: (if (result i32) + ;; SHRINK_0-NEXT: (i32.eq + ;; SHRINK_0-NEXT: (local.get $x) + ;; SHRINK_0-NEXT: (i32.const 53498923) + ;; SHRINK_0-NEXT: ) + ;; SHRINK_0-NEXT: (i32.div_s + ;; SHRINK_0-NEXT: (i32.div_s + ;; SHRINK_0-NEXT: (local.get $x) + ;; SHRINK_0-NEXT: (i32.const 13) + ;; SHRINK_0-NEXT: ) + ;; SHRINK_0-NEXT: (i32.const 13) + ;; SHRINK_0-NEXT: ) + ;; SHRINK_0-NEXT: (local.get $x) + ;; SHRINK_0-NEXT: ) + ;; SHRINK_0-NEXT: ) + ;; SHRINK_1: (func $selectify-division2 (type $0) (param $x i32) (result i32) + ;; SHRINK_1-NEXT: (if (result i32) + ;; SHRINK_1-NEXT: (i32.eq + ;; SHRINK_1-NEXT: (local.get $x) + ;; SHRINK_1-NEXT: (i32.const 53498923) + ;; SHRINK_1-NEXT: ) + ;; SHRINK_1-NEXT: (i32.div_s + ;; SHRINK_1-NEXT: (i32.div_s + ;; SHRINK_1-NEXT: (local.get $x) + ;; SHRINK_1-NEXT: (i32.const 13) + ;; SHRINK_1-NEXT: ) + ;; SHRINK_1-NEXT: (i32.const 13) + ;; SHRINK_1-NEXT: ) + ;; SHRINK_1-NEXT: (local.get $x) + ;; SHRINK_1-NEXT: ) + ;; SHRINK_1-NEXT: ) + ;; SHRINK_2: (func $selectify-division2 (type $0) (param $x i32) (result i32) + ;; SHRINK_2-NEXT: (select + ;; SHRINK_2-NEXT: (i32.div_s + ;; SHRINK_2-NEXT: (i32.div_s + ;; SHRINK_2-NEXT: (local.get $x) + ;; SHRINK_2-NEXT: (i32.const 13) + ;; SHRINK_2-NEXT: ) + ;; SHRINK_2-NEXT: (i32.const 13) + ;; SHRINK_2-NEXT: ) + ;; SHRINK_2-NEXT: (local.get $x) + ;; SHRINK_2-NEXT: (i32.eq + ;; SHRINK_2-NEXT: (local.get $x) + ;; SHRINK_2-NEXT: (i32.const 53498923) + ;; SHRINK_2-NEXT: ) + ;; SHRINK_2-NEXT: ) + ;; SHRINK_2-NEXT: ) + (func $selectify-division2 (param $x i32) (result i32) + ;; As above, but now only with a shrink level of 2 should we selectify, as + ;; there are two divisions. + (if (result i32) + (i32.eq + (local.get $x) + (i32.const 53498923) + ) + (i32.div_s + (i32.div_s + (local.get $x) + (i32.const 13) + ) + (i32.const 13) + ) + (local.get $x) + ) + ) +) diff --git a/test/passes/remove-unused-brs_enable-multivalue.txt b/test/passes/remove-unused-brs_enable-multivalue.txt index 9df644daa..0a66153e9 100644 --- a/test/passes/remove-unused-brs_enable-multivalue.txt +++ b/test/passes/remove-unused-brs_enable-multivalue.txt @@ -2502,8 +2502,9 @@ (i32.const 0) ) (func $ifs-copies-recursive (param $20 i32) (result i32) - (local.set $20 - (select + (if + (i32.const 1) + (local.set $20 (select (select (i32.const 4) @@ -2513,8 +2514,6 @@ (local.get $20) (i32.const 2) ) - (local.get $20) - (i32.const 1) ) ) (local.get $20) diff --git a/test/wasm2js/conversions-modified.2asm.js.opt b/test/wasm2js/conversions-modified.2asm.js.opt index 811845f9d..5eb7435ee 100644 --- a/test/wasm2js/conversions-modified.2asm.js.opt +++ b/test/wasm2js/conversions-modified.2asm.js.opt @@ -121,15 +121,29 @@ function asmFunc(imports) { } function legalstub$7($0) { - i64toi32_i32$HIGH_BITS = Math_fround(Math_abs($0)) >= Math_fround(1.0) ? ~~($0 > Math_fround(0.0) ? Math_fround(Math_min(Math_fround(Math_floor(Math_fround($0 * Math_fround(2.3283064365386963e-10)))), Math_fround(4294967296.0))) : Math_fround(Math_ceil(Math_fround(Math_fround($0 - Math_fround(~~$0 >>> 0 >>> 0)) * Math_fround(2.3283064365386963e-10))))) >>> 0 : 0; + var $1 = 0, $2 = 0; + $2 = ~~$0 >>> 0; + if (Math_fround(Math_abs($0)) >= Math_fround(1.0)) { + $1 = ~~($0 > Math_fround(0.0) ? Math_fround(Math_min(Math_fround(Math_floor(Math_fround($0 * Math_fround(2.3283064365386963e-10)))), Math_fround(4294967296.0))) : Math_fround(Math_ceil(Math_fround(Math_fround($0 - Math_fround(~~$0 >>> 0 >>> 0)) * Math_fround(2.3283064365386963e-10))))) >>> 0 + } else { + $1 = 0 + } + i64toi32_i32$HIGH_BITS = $1; setTempRet0(i64toi32_i32$HIGH_BITS | 0); - return ~~$0 >>> 0; + return $2; } function legalstub$9($0) { - i64toi32_i32$HIGH_BITS = Math_abs($0) >= 1.0 ? ~~($0 > 0.0 ? Math_min(Math_floor($0 * 2.3283064365386963e-10), 4294967295.0) : Math_ceil(($0 - +(~~$0 >>> 0 >>> 0)) * 2.3283064365386963e-10)) >>> 0 : 0; + var $1 = 0, $2 = 0; + $2 = ~~$0 >>> 0; + if (Math_abs($0) >= 1.0) { + $1 = ~~($0 > 0.0 ? Math_min(Math_floor($0 * 2.3283064365386963e-10), 4294967295.0) : Math_ceil(($0 - +(~~$0 >>> 0 >>> 0)) * 2.3283064365386963e-10)) >>> 0 + } else { + $1 = 0 + } + i64toi32_i32$HIGH_BITS = $1; setTempRet0(i64toi32_i32$HIGH_BITS | 0); - return ~~$0 >>> 0; + return $2; } function legalstub$12($0, $1) { diff --git a/test/wasm2js/float-ops.2asm.js.opt b/test/wasm2js/float-ops.2asm.js.opt index 204774e5c..e00c51801 100644 --- a/test/wasm2js/float-ops.2asm.js.opt +++ b/test/wasm2js/float-ops.2asm.js.opt @@ -236,12 +236,24 @@ function asmFunc(imports) { function $47($0) { $0 = Math_fround($0); - return !(~~$0 >>> 0 | (Math_fround(Math_abs($0)) >= Math_fround(1.0) ? ~~($0 > Math_fround(0.0) ? Math_fround(Math_min(Math_fround(Math_floor(Math_fround($0 * Math_fround(2.3283064365386963e-10)))), Math_fround(4294967296.0))) : Math_fround(Math_ceil(Math_fround(Math_fround($0 - Math_fround(~~$0 >>> 0 >>> 0)) * Math_fround(2.3283064365386963e-10))))) >>> 0 : 0)) | 0; + var $1_1 = 0; + if (Math_fround(Math_abs($0)) >= Math_fround(1.0)) { + $1_1 = ~~($0 > Math_fround(0.0) ? Math_fround(Math_min(Math_fround(Math_floor(Math_fround($0 * Math_fround(2.3283064365386963e-10)))), Math_fround(4294967296.0))) : Math_fround(Math_ceil(Math_fround(Math_fround($0 - Math_fround(~~$0 >>> 0 >>> 0)) * Math_fround(2.3283064365386963e-10))))) >>> 0 + } else { + $1_1 = 0 + } + return !($1_1 | ~~$0 >>> 0) | 0; } function $48($0) { $0 = +$0; - return !(~~$0 >>> 0 | (Math_abs($0) >= 1.0 ? ~~($0 > 0.0 ? Math_min(Math_floor($0 * 2.3283064365386963e-10), 4294967295.0) : Math_ceil(($0 - +(~~$0 >>> 0 >>> 0)) * 2.3283064365386963e-10)) >>> 0 : 0)) | 0; + var $1_1 = 0; + if (Math_abs($0) >= 1.0) { + $1_1 = ~~($0 > 0.0 ? Math_min(Math_floor($0 * 2.3283064365386963e-10), 4294967295.0) : Math_ceil(($0 - +(~~$0 >>> 0 >>> 0)) * 2.3283064365386963e-10)) >>> 0 + } else { + $1_1 = 0 + } + return !($1_1 | ~~$0 >>> 0) | 0; } function legalstub$43($0, $1_1) { |