RemoveUnusedBrs: Allow less unconditional work and in particular division (#5989)

Fixes #5983: The testcase from there is used here in a new testcase remove-unused-brs_levels in which we check if we are willing to unconditionally do a division operation. Turning an if with an arm that does a division into a select, which always does the division, is almost 5x slower, so we should probably be extremely careful about doing that. I took some measurements and have some suggestions for changes in this PR: * Raise the cost of div/rem to what I measure on my machine, which is 5x slower than an add, or worse. * For some reason we added the if arms rather than take the max of them, so fix that. This does not help the issue, but was confusing. * Adjust TooCostlyToRunUnconditionally in the pass from 9 to 8 (this helps balance the last point). * Use half that value when not optimizing for size. That is, we allow only 4 extra unconditional work normally, and 8 in -Os, and when -Oz then we allow any extra amount. Aside from the new testcases, some existing ones changed. They all appear to change in a reasonable way, to me. We should perhaps go even further than this, and not even run a division unconditionally in -Os, but I wasn't sure it makes sense to go that far as other benchmarks may be affected. For now, this makes the benchmark in #5983 run at full speed in -O3 or -Os, and it remains slow in -Oz. The modified version of the benchmark that only divides in the if (no other operations) is still fast in -O3, but it become slow in -Os as we do turn that if into a select (but again, I didn't want to go that far as to overfit on that one benchmark).
author: Alon Zakai <azakai@google.com> 2023-10-03 16:39:12 -0700
committer: GitHub <noreply@github.com> 2023-10-03 16:39:12 -0700
commit: b2e096d79c36daa2cbfb7dc3db31af76e9f45cc8 (patch)
tree: 52f962128c9fe6c1b8cc8b847e5af40f33117c40
parent: 24779b2a3fe5e5c7cc6b1da3661d346cd9c129ae (diff)
download: binaryen-b2e096d79c36daa2cbfb7dc3db31af76e9f45cc8.tar.gz
binaryen-b2e096d79c36daa2cbfb7dc3db31af76e9f45cc8.tar.bz2
binaryen-b2e096d79c36daa2cbfb7dc3db31af76e9f45cc8.zip
8 files changed, 223 insertions, 49 deletions
diff --git a/src/ir/cost.h b/src/ir/cost.h
index 73d027788..efc50eb1b 100644
--- a/src/ir/cost.h
+++ b/src/ir/cost.h
@@ -263,7 +263,7 @@ struct CostAnalyzer : public OverriddenVisitor<CostAnalyzer, CostType> {
       case DivUInt32:
       case RemSInt32:
       case RemUInt32:
-        ret = curr->right->is<Const>() ? 2 : 3;
+        ret = curr->right->is<Const>() ? 5 : 6;
         break;
       case AndInt32:
       case OrInt32:
@@ -284,7 +284,7 @@ struct CostAnalyzer : public OverriddenVisitor<CostAnalyzer, CostType> {
       case DivUInt64:
       case RemSInt64:
       case RemUInt64:
-        ret = curr->right->is<Const>() ? 3 : 4;
+        ret = curr->right->is<Const>() ? 7 : 8;
         break;
       case AndInt64:
       case OrInt64:
diff --git a/src/passes/RemoveUnusedBrs.cpp b/src/passes/RemoveUnusedBrs.cpp
index fca5b7db4..576f2fe91 100644
--- a/src/passes/RemoveUnusedBrs.cpp
+++ b/src/passes/RemoveUnusedBrs.cpp
@@ -80,38 +80,64 @@ static bool canTurnIfIntoBrIf(Expression* ifCondition,
   return !EffectAnalyzer(options, wasm, ifCondition).invalidates(value);
 }
 
-// This leads to similar choices as LLVM does.
-// See https://github.com/WebAssembly/binaryen/pull/4228
-// It can be tuned more later.
-const Index TooCostlyToRunUnconditionally = 9;
+// This leads to similar choices as LLVM does in some cases, by balancing the
+// extra work of code that is run unconditionally with the speedup from not
+// branching to decide whether to run it or not.
+// See:
+//  * https://github.com/WebAssembly/binaryen/pull/4228
+//  * https://github.com/WebAssembly/binaryen/issues/5983
+const Index TooCostlyToRunUnconditionally = 8;
 
 static_assert(TooCostlyToRunUnconditionally < CostAnalyzer::Unacceptable,
               "We never run code unconditionally if it has unacceptable cost");
 
-// Check if it is not worth it to run code unconditionally. This
-// assumes we are trying to run two expressions where previously
-// only one of the two might have executed. We assume here that
-// executing both is good for code size.
 static bool tooCostlyToRunUnconditionally(const PassOptions& passOptions,
-                                          Expression* one,
-                                          Expression* two) {
-  // If we care mostly about code size, just do it for that reason.
-  if (passOptions.shrinkLevel) {
-    return false;
+                                          Index cost) {
+  if (passOptions.shrinkLevel == 0) {
+    // We are focused on speed. Any extra cost is risky, but allow a small
+    // amount.
+    return cost > TooCostlyToRunUnconditionally / 2;
+  } else if (passOptions.shrinkLevel == 1) {
+    // We are optimizing for size in a balanced manner. Allow some extra
+    // overhead here.
+    return cost >= TooCostlyToRunUnconditionally;
+  } else {
+    // We should have already decided what to do if shrink_level=2 and not
+    // gotten here, and other values are invalid.
+    WASM_UNREACHABLE("bad shrink level");
   }
-  // Consider the cost of executing all the code unconditionally.
-  auto total = CostAnalyzer(one).cost + CostAnalyzer(two).cost;
-  return total >= TooCostlyToRunUnconditionally;
 }
 
 // As above, but a single expression that we are considering moving to a place
 // where it executes unconditionally.
 static bool tooCostlyToRunUnconditionally(const PassOptions& passOptions,
                                           Expression* curr) {
-  if (passOptions.shrinkLevel) {
+  // If we care entirely about code size, just do it for that reason (early
+  // exit to avoid work).
+  if (passOptions.shrinkLevel >= 2) {
+    return false;
+  }
+  auto cost = CostAnalyzer(curr).cost;
+  return tooCostlyToRunUnconditionally(passOptions, cost);
+}
+
+// Check if it is not worth it to run code unconditionally. This
+// assumes we are trying to run two expressions where previously
+// only one of the two might have executed. We assume here that
+// executing both is good for code size.
+static bool tooCostlyToRunUnconditionally(const PassOptions& passOptions,
+                                          Expression* one,
+                                          Expression* two) {
+  // If we care entirely about code size, just do it for that reason (early
+  // exit to avoid work).
+  if (passOptions.shrinkLevel >= 2) {
     return false;
   }
-  return CostAnalyzer(curr).cost >= TooCostlyToRunUnconditionally;
+
+  // Consider the cost of executing all the code unconditionally, which adds
+  // either the cost of running one or two, so the maximum is the worst case.
+  auto max = std::max(CostAnalyzer(one).cost, CostAnalyzer(two).cost);
+  return tooCostlyToRunUnconditionally(passOptions, max);
 }
 
 struct RemoveUnusedBrs : public WalkerPass<PostWalker<RemoveUnusedBrs>> {
diff --git a/test/lit/passes/remove-unused-brs-gc.wast b/test/lit/passes/remove-unused-brs-gc.wast
index 53100cc91..7a620193e 100644
--- a/test/lit/passes/remove-unused-brs-gc.wast
+++ b/test/lit/passes/remove-unused-brs-gc.wast
@@ -655,22 +655,20 @@
  ;; CHECK-NEXT:   )
  ;; CHECK-NEXT:  )
  ;; CHECK-NEXT:  (drop
- ;; CHECK-NEXT:   (select (result (ref null $struct))
- ;; CHECK-NEXT:    (block (result (ref null $struct))
- ;; CHECK-NEXT:     (block $something (result (ref null $struct))
- ;; CHECK-NEXT:      (drop
- ;; CHECK-NEXT:       (block (result nullref)
- ;; CHECK-NEXT:        (br_on_non_null $something
- ;; CHECK-NEXT:         (local.get $struct)
- ;; CHECK-NEXT:        )
- ;; CHECK-NEXT:        (ref.null none)
+ ;; CHECK-NEXT:   (if (result (ref null $struct))
+ ;; CHECK-NEXT:    (local.get $x)
+ ;; CHECK-NEXT:    (block $something (result (ref null $struct))
+ ;; CHECK-NEXT:     (drop
+ ;; CHECK-NEXT:      (block (result nullref)
+ ;; CHECK-NEXT:       (br_on_non_null $something
+ ;; CHECK-NEXT:        (local.get $struct)
  ;; CHECK-NEXT:       )
+ ;; CHECK-NEXT:       (ref.null none)
  ;; CHECK-NEXT:      )
- ;; CHECK-NEXT:      (ref.null none)
  ;; CHECK-NEXT:     )
+ ;; CHECK-NEXT:     (ref.null none)
  ;; CHECK-NEXT:    )
  ;; CHECK-NEXT:    (ref.null none)
- ;; CHECK-NEXT:    (local.get $x)
  ;; CHECK-NEXT:   )
  ;; CHECK-NEXT:  )
  ;; CHECK-NEXT:  (drop
@@ -716,6 +714,8 @@
     )
    )
   )
+  ;; We do not selectify here because the amount of work in the if is
+  ;; significant (there is a cast and a branch).
   (drop
    (if (result anyref)
     (local.get $x)
diff --git a/test/lit/passes/remove-unused-brs.wast b/test/lit/passes/remove-unused-brs.wast
index 82963f792..93cf4cbd2 100644
--- a/test/lit/passes/remove-unused-brs.wast
+++ b/test/lit/passes/remove-unused-brs.wast
@@ -31,10 +31,7 @@
   ;; CHECK-NEXT:   (i32.const 1)
   ;; CHECK-NEXT:   (i32.lt_u
   ;; CHECK-NEXT:    (i32.sub
-  ;; CHECK-NEXT:     (i32.or
-  ;; CHECK-NEXT:      (local.get $0)
-  ;; CHECK-NEXT:      (i32.const 32)
-  ;; CHECK-NEXT:     )
+  ;; CHECK-NEXT:     (local.get $0)
   ;; CHECK-NEXT:     (i32.const 97)
   ;; CHECK-NEXT:    )
   ;; CHECK-NEXT:    (i32.const 6)
@@ -60,10 +57,7 @@
       (i32.const 1)
       (i32.lt_u
         (i32.sub
-          (i32.or
-            (local.get $0)
-            (i32.const 32)
-          )
+          (local.get $0)
           (i32.const 97)
         )
         (i32.const 6)
diff --git a/test/lit/passes/remove-unused-brs_levels.wast b/test/lit/passes/remove-unused-brs_levels.wast
new file mode 100644
index 000000000..f4737627e
--- /dev/null
+++ b/test/lit/passes/remove-unused-brs_levels.wast
@@ -0,0 +1,129 @@
+;; NOTE: Assertions have been generated by update_lit_checks.py and should not be edited.
+;; RUN: wasm-opt %s --remove-unused-brs -all -S --shrink-level=0 -o - | filecheck %s --check-prefix=SHRINK_0
+;; RUN: wasm-opt %s --remove-unused-brs -all -S --shrink-level=1 -o - | filecheck %s --check-prefix=SHRINK_1
+;; RUN: wasm-opt %s --remove-unused-brs -all -S --shrink-level=2 -o - | filecheck %s --check-prefix=SHRINK_2
+
+
+(module
+  ;; SHRINK_0:      (func $selectify-division (type $0) (param $x i32) (result i32)
+  ;; SHRINK_0-NEXT:  (if (result i32)
+  ;; SHRINK_0-NEXT:   (i32.eq
+  ;; SHRINK_0-NEXT:    (local.get $x)
+  ;; SHRINK_0-NEXT:    (i32.const 53498923)
+  ;; SHRINK_0-NEXT:   )
+  ;; SHRINK_0-NEXT:   (i32.div_s
+  ;; SHRINK_0-NEXT:    (local.get $x)
+  ;; SHRINK_0-NEXT:    (i32.const 13)
+  ;; SHRINK_0-NEXT:   )
+  ;; SHRINK_0-NEXT:   (local.get $x)
+  ;; SHRINK_0-NEXT:  )
+  ;; SHRINK_0-NEXT: )
+  ;; SHRINK_1:      (func $selectify-division (type $0) (param $x i32) (result i32)
+  ;; SHRINK_1-NEXT:  (select
+  ;; SHRINK_1-NEXT:   (i32.div_s
+  ;; SHRINK_1-NEXT:    (local.get $x)
+  ;; SHRINK_1-NEXT:    (i32.const 13)
+  ;; SHRINK_1-NEXT:   )
+  ;; SHRINK_1-NEXT:   (local.get $x)
+  ;; SHRINK_1-NEXT:   (i32.eq
+  ;; SHRINK_1-NEXT:    (local.get $x)
+  ;; SHRINK_1-NEXT:    (i32.const 53498923)
+  ;; SHRINK_1-NEXT:   )
+  ;; SHRINK_1-NEXT:  )
+  ;; SHRINK_1-NEXT: )
+  ;; SHRINK_2:      (func $selectify-division (type $0) (param $x i32) (result i32)
+  ;; SHRINK_2-NEXT:  (select
+  ;; SHRINK_2-NEXT:   (i32.div_s
+  ;; SHRINK_2-NEXT:    (local.get $x)
+  ;; SHRINK_2-NEXT:    (i32.const 13)
+  ;; SHRINK_2-NEXT:   )
+  ;; SHRINK_2-NEXT:   (local.get $x)
+  ;; SHRINK_2-NEXT:   (i32.eq
+  ;; SHRINK_2-NEXT:    (local.get $x)
+  ;; SHRINK_2-NEXT:    (i32.const 53498923)
+  ;; SHRINK_2-NEXT:   )
+  ;; SHRINK_2-NEXT:  )
+  ;; SHRINK_2-NEXT: )
+  (func $selectify-division (param $x i32) (result i32)
+    ;; See #5983: this if, if turned into a select, becomes almost 5x slower.
+    ;; We only want to selectify here when the shrink level is 1 or 2.
+    (if (result i32)
+      (i32.eq
+        (local.get $x)
+        (i32.const 53498923)
+      )
+      (i32.div_s
+        (local.get $x)
+        (i32.const 13)
+      )
+      (local.get $x)
+    )
+  )
+
+  ;; SHRINK_0:      (func $selectify-division2 (type $0) (param $x i32) (result i32)
+  ;; SHRINK_0-NEXT:  (if (result i32)
+  ;; SHRINK_0-NEXT:   (i32.eq
+  ;; SHRINK_0-NEXT:    (local.get $x)
+  ;; SHRINK_0-NEXT:    (i32.const 53498923)
+  ;; SHRINK_0-NEXT:   )
+  ;; SHRINK_0-NEXT:   (i32.div_s
+  ;; SHRINK_0-NEXT:    (i32.div_s
+  ;; SHRINK_0-NEXT:     (local.get $x)
+  ;; SHRINK_0-NEXT:     (i32.const 13)
+  ;; SHRINK_0-NEXT:    )
+  ;; SHRINK_0-NEXT:    (i32.const 13)
+  ;; SHRINK_0-NEXT:   )
+  ;; SHRINK_0-NEXT:   (local.get $x)
+  ;; SHRINK_0-NEXT:  )
+  ;; SHRINK_0-NEXT: )
+  ;; SHRINK_1:      (func $selectify-division2 (type $0) (param $x i32) (result i32)
+  ;; SHRINK_1-NEXT:  (if (result i32)
+  ;; SHRINK_1-NEXT:   (i32.eq
+  ;; SHRINK_1-NEXT:    (local.get $x)
+  ;; SHRINK_1-NEXT:    (i32.const 53498923)
+  ;; SHRINK_1-NEXT:   )
+  ;; SHRINK_1-NEXT:   (i32.div_s
+  ;; SHRINK_1-NEXT:    (i32.div_s
+  ;; SHRINK_1-NEXT:     (local.get $x)
+  ;; SHRINK_1-NEXT:     (i32.const 13)
+  ;; SHRINK_1-NEXT:    )
+  ;; SHRINK_1-NEXT:    (i32.const 13)
+  ;; SHRINK_1-NEXT:   )
+  ;; SHRINK_1-NEXT:   (local.get $x)
+  ;; SHRINK_1-NEXT:  )
+  ;; SHRINK_1-NEXT: )
+  ;; SHRINK_2:      (func $selectify-division2 (type $0) (param $x i32) (result i32)
+  ;; SHRINK_2-NEXT:  (select
+  ;; SHRINK_2-NEXT:   (i32.div_s
+  ;; SHRINK_2-NEXT:    (i32.div_s
+  ;; SHRINK_2-NEXT:     (local.get $x)
+  ;; SHRINK_2-NEXT:     (i32.const 13)
+  ;; SHRINK_2-NEXT:    )
+  ;; SHRINK_2-NEXT:    (i32.const 13)
+  ;; SHRINK_2-NEXT:   )
+  ;; SHRINK_2-NEXT:   (local.get $x)
+  ;; SHRINK_2-NEXT:   (i32.eq
+  ;; SHRINK_2-NEXT:    (local.get $x)
+  ;; SHRINK_2-NEXT:    (i32.const 53498923)
+  ;; SHRINK_2-NEXT:   )
+  ;; SHRINK_2-NEXT:  )
+  ;; SHRINK_2-NEXT: )
+  (func $selectify-division2 (param $x i32) (result i32)
+    ;; As above, but now only with a shrink level of 2 should we selectify, as
+    ;; there are two divisions.
+    (if (result i32)
+      (i32.eq
+        (local.get $x)
+        (i32.const 53498923)
+      )
+      (i32.div_s
+        (i32.div_s
+          (local.get $x)
+          (i32.const 13)
+        )
+        (i32.const 13)
+      )
+      (local.get $x)
+    )
+  )
+)
diff --git a/test/passes/remove-unused-brs_enable-multivalue.txt b/test/passes/remove-unused-brs_enable-multivalue.txt
index 9df644daa..0a66153e9 100644
--- a/test/passes/remove-unused-brs_enable-multivalue.txt
+++ b/test/passes/remove-unused-brs_enable-multivalue.txt
@@ -2502,8 +2502,9 @@
   (i32.const 0)
  )
  (func $ifs-copies-recursive (param $20 i32) (result i32)
-  (local.set $20
-   (select
+  (if
+   (i32.const 1)
+   (local.set $20
     (select
      (select
       (i32.const 4)
@@ -2513,8 +2514,6 @@
      (local.get $20)
      (i32.const 2)
     )
-    (local.get $20)
-    (i32.const 1)
    )
   )
   (local.get $20)
diff --git a/test/wasm2js/conversions-modified.2asm.js.opt b/test/wasm2js/conversions-modified.2asm.js.opt
index 811845f9d..5eb7435ee 100644
--- a/test/wasm2js/conversions-modified.2asm.js.opt
+++ b/test/wasm2js/conversions-modified.2asm.js.opt
@@ -121,15 +121,29 @@ function asmFunc(imports) {
  }
  
  function legalstub$7($0) {
-  i64toi32_i32$HIGH_BITS = Math_fround(Math_abs($0)) >= Math_fround(1.0) ? ~~($0 > Math_fround(0.0) ? Math_fround(Math_min(Math_fround(Math_floor(Math_fround($0 * Math_fround(2.3283064365386963e-10)))), Math_fround(4294967296.0))) : Math_fround(Math_ceil(Math_fround(Math_fround($0 - Math_fround(~~$0 >>> 0 >>> 0)) * Math_fround(2.3283064365386963e-10))))) >>> 0 : 0;
+  var $1 = 0, $2 = 0;
+  $2 = ~~$0 >>> 0;
+  if (Math_fround(Math_abs($0)) >= Math_fround(1.0)) {
+   $1 = ~~($0 > Math_fround(0.0) ? Math_fround(Math_min(Math_fround(Math_floor(Math_fround($0 * Math_fround(2.3283064365386963e-10)))), Math_fround(4294967296.0))) : Math_fround(Math_ceil(Math_fround(Math_fround($0 - Math_fround(~~$0 >>> 0 >>> 0)) * Math_fround(2.3283064365386963e-10))))) >>> 0
+  } else {
+   $1 = 0
+  }
+  i64toi32_i32$HIGH_BITS = $1;
   setTempRet0(i64toi32_i32$HIGH_BITS | 0);
-  return ~~$0 >>> 0;
+  return $2;
  }
  
  function legalstub$9($0) {
-  i64toi32_i32$HIGH_BITS = Math_abs($0) >= 1.0 ? ~~($0 > 0.0 ? Math_min(Math_floor($0 * 2.3283064365386963e-10), 4294967295.0) : Math_ceil(($0 - +(~~$0 >>> 0 >>> 0)) * 2.3283064365386963e-10)) >>> 0 : 0;
+  var $1 = 0, $2 = 0;
+  $2 = ~~$0 >>> 0;
+  if (Math_abs($0) >= 1.0) {
+   $1 = ~~($0 > 0.0 ? Math_min(Math_floor($0 * 2.3283064365386963e-10), 4294967295.0) : Math_ceil(($0 - +(~~$0 >>> 0 >>> 0)) * 2.3283064365386963e-10)) >>> 0
+  } else {
+   $1 = 0
+  }
+  i64toi32_i32$HIGH_BITS = $1;
   setTempRet0(i64toi32_i32$HIGH_BITS | 0);
-  return ~~$0 >>> 0;
+  return $2;
  }
  
  function legalstub$12($0, $1) {
diff --git a/test/wasm2js/float-ops.2asm.js.opt b/test/wasm2js/float-ops.2asm.js.opt
index 204774e5c..e00c51801 100644
--- a/test/wasm2js/float-ops.2asm.js.opt
+++ b/test/wasm2js/float-ops.2asm.js.opt
@@ -236,12 +236,24 @@ function asmFunc(imports) {
  
  function $47($0) {
   $0 = Math_fround($0);
-  return !(~~$0 >>> 0 | (Math_fround(Math_abs($0)) >= Math_fround(1.0) ? ~~($0 > Math_fround(0.0) ? Math_fround(Math_min(Math_fround(Math_floor(Math_fround($0 * Math_fround(2.3283064365386963e-10)))), Math_fround(4294967296.0))) : Math_fround(Math_ceil(Math_fround(Math_fround($0 - Math_fround(~~$0 >>> 0 >>> 0)) * Math_fround(2.3283064365386963e-10))))) >>> 0 : 0)) | 0;
+  var $1_1 = 0;
+  if (Math_fround(Math_abs($0)) >= Math_fround(1.0)) {
+   $1_1 = ~~($0 > Math_fround(0.0) ? Math_fround(Math_min(Math_fround(Math_floor(Math_fround($0 * Math_fround(2.3283064365386963e-10)))), Math_fround(4294967296.0))) : Math_fround(Math_ceil(Math_fround(Math_fround($0 - Math_fround(~~$0 >>> 0 >>> 0)) * Math_fround(2.3283064365386963e-10))))) >>> 0
+  } else {
+   $1_1 = 0
+  }
+  return !($1_1 | ~~$0 >>> 0) | 0;
  }
  
  function $48($0) {
   $0 = +$0;
-  return !(~~$0 >>> 0 | (Math_abs($0) >= 1.0 ? ~~($0 > 0.0 ? Math_min(Math_floor($0 * 2.3283064365386963e-10), 4294967295.0) : Math_ceil(($0 - +(~~$0 >>> 0 >>> 0)) * 2.3283064365386963e-10)) >>> 0 : 0)) | 0;
+  var $1_1 = 0;
+  if (Math_abs($0) >= 1.0) {
+   $1_1 = ~~($0 > 0.0 ? Math_min(Math_floor($0 * 2.3283064365386963e-10), 4294967295.0) : Math_ceil(($0 - +(~~$0 >>> 0 >>> 0)) * 2.3283064365386963e-10)) >>> 0
+  } else {
+   $1_1 = 0
+  }
+  return !($1_1 | ~~$0 >>> 0) | 0;
  }
  
  function legalstub$43($0, $1_1) {
author	Alon Zakai <azakai@google.com>	2023-10-03 16:39:12 -0700
committer	GitHub <noreply@github.com>	2023-10-03 16:39:12 -0700
commit	b2e096d79c36daa2cbfb7dc3db31af76e9f45cc8 (patch)
tree	52f962128c9fe6c1b8cc8b847e5af40f33117c40
parent	24779b2a3fe5e5c7cc6b1da3661d346cd9c129ae (diff)
download	binaryen-b2e096d79c36daa2cbfb7dc3db31af76e9f45cc8.tar.gz binaryen-b2e096d79c36daa2cbfb7dc3db31af76e9f45cc8.tar.bz2 binaryen-b2e096d79c36daa2cbfb7dc3db31af76e9f45cc8.zip