[NFC-ish] Avoid repeated ReFinalize etc. when inlining (#6967)

We may inline multiple times into a single function. Previously, if we did so, we did the "fixups" such as ReFinalize and non-nullable local fixes once per such inlining. But that is wasteful as each ReFinalize etc. scans the whole function, and could be done after we copy all the code from all the inlinings, which is what this PR does: it splits doInlining() into one function that inlines code and one that does the updates after, and the update is done after all inlinings. This turns out to be very important, a 5x speedup on two large real-world wasm files I am looking at. The reason is that we actually inline more than once in half the cases, and sometimes far more - in one case we inline over 1,000 times into a function! (and ReFinalized 1,000 times too many) This is practically NFC, but it turns out that there are some tiny noticeable differences between running ReFinalize once at the end vs. once after each inlining. These differences are not really functional or observable in the behavior of the code, and optimizations would remove them anyhow, but they are noticeable in two tests here. The changes to tests are, in order: * Different block names, just because the counter we use sees more things. * In a testcase with unreachable code, we inline twice into a function, and the first inlining brings in an unreachable, and ReFinalizing early will lead to it propagating differently than if we wait to ReFinalize. (It actually leads to another cycle of inlining in that case, as a fluke.)
author: Alon Zakai <azakai@google.com> 2024-09-24 16:15:06 -0700
committer: GitHub <noreply@github.com> 2024-09-24 16:15:06 -0700
commit: ccef354cc8c0379e91b4dffb365fc69f616d6e25 (patch)
tree: 21a4c7d7ed3df24696a9bab7b92cace4bed45354
parent: 4d906204ebd3ad88a48350f008c7b72a0844e1da (diff)
download: binaryen-ccef354cc8c0379e91b4dffb365fc69f616d6e25.tar.gz
binaryen-ccef354cc8c0379e91b4dffb365fc69f616d6e25.tar.bz2
binaryen-ccef354cc8c0379e91b4dffb365fc69f616d6e25.zip
3 files changed, 48 insertions, 31 deletions
diff --git a/src/passes/Inlining.cpp b/src/passes/Inlining.cpp
index 30bb5c23f..cbee7e77f 100644
--- a/src/passes/Inlining.cpp
+++ b/src/passes/Inlining.cpp
@@ -446,11 +446,11 @@ struct Updater : public TryDepthWalker<Updater> {
 };
 
 // Core inlining logic. Modifies the outside function (adding locals as
-// needed).
-static void doInlining(Module* module,
-                       Function* into,
-                       const InliningAction& action,
-                       PassOptions& options) {
+// needed) by copying the inlined code into it.
+static void doCodeInlining(Module* module,
+                           Function* into,
+                           const InliningAction& action,
+                           PassOptions& options) {
   Function* from = action.contents;
   auto* call = (*action.callSite)->cast<Call>();
 
@@ -622,6 +622,12 @@ static void doInlining(Module* module,
     }
     *action.callSite = builder.makeSequence(old, builder.makeUnreachable());
   }
+}
+
+// Updates the outer function after we inline into it. This is a general
+// operation that does not depend on what we inlined, it just makes sure that we
+// refinalize everything, have no duplicate break labels, etc.
+static void updateAfterInlining(Module* module, Function* into) {
   // Anything we inlined into may now have non-unique label names, fix it up.
   // Note that we must do this before refinalization, as otherwise duplicate
   // block labels can lead to errors (the IR must be valid before we
@@ -635,6 +641,14 @@ static void doInlining(Module* module,
   TypeUpdating::handleNonDefaultableLocals(into, *module);
 }
 
+static void doInlining(Module* module,
+                       Function* into,
+                       const InliningAction& action,
+                       PassOptions& options) {
+  doCodeInlining(module, into, action, options);
+  updateAfterInlining(module, into);
+}
+
 // A map of function names to the inlining actions we've decided to actually
 // perform in them.
 using ChosenActions = std::unordered_map<Name, std::vector<InliningAction>>;
@@ -657,9 +671,13 @@ struct DoInlining : public Pass {
     assert(iter != chosenActions.end());
     const auto& actions = iter->second;
     assert(!actions.empty());
+
+    // Inline all the code first, then update func once at the end (which saves
+    // e.g. running ReFinalize after each action, of which there might be many).
     for (auto action : actions) {
-      doInlining(module, func, action, getPassOptions());
+      doCodeInlining(module, func, action, getPassOptions());
     }
+    updateAfterInlining(module, func);
   }
 
 private:
diff --git a/test/lit/passes/inlining-optimizing_optimize-level=3.wast b/test/lit/passes/inlining-optimizing_optimize-level=3.wast
index fdfbf7b35..f6df73056 100644
--- a/test/lit/passes/inlining-optimizing_optimize-level=3.wast
+++ b/test/lit/passes/inlining-optimizing_optimize-level=3.wast
@@ -8820,7 +8820,7 @@
  ;; CHECK-NEXT:                      (br $__rjti$8)
  ;; CHECK-NEXT:                     )
  ;; CHECK-NEXT:                     (block $label$break$L8
- ;; CHECK-NEXT:                      (block $__rjti$20
+ ;; CHECK-NEXT:                      (block $__rjti$23
  ;; CHECK-NEXT:                       (if
  ;; CHECK-NEXT:                        (i32.and
  ;; CHECK-NEXT:                         (local.tee $9
@@ -8845,7 +8845,7 @@
  ;; CHECK-NEXT:                          (local.get $5)
  ;; CHECK-NEXT:                         )
  ;; CHECK-NEXT:                         (loop $while-in5
- ;; CHECK-NEXT:                          (br_if $__rjti$20
+ ;; CHECK-NEXT:                          (br_if $__rjti$23
  ;; CHECK-NEXT:                           (i32.eqz
  ;; CHECK-NEXT:                            (i32.load8_u
  ;; CHECK-NEXT:                             (local.get $8)
@@ -8890,7 +8890,7 @@
  ;; CHECK-NEXT:                         )
  ;; CHECK-NEXT:                        )
  ;; CHECK-NEXT:                       )
- ;; CHECK-NEXT:                       (br_if $__rjti$20
+ ;; CHECK-NEXT:                       (br_if $__rjti$23
  ;; CHECK-NEXT:                        (local.get $9)
  ;; CHECK-NEXT:                       )
  ;; CHECK-NEXT:                       (local.set $9
diff --git a/test/lit/passes/inlining_optimize-level=3.wast b/test/lit/passes/inlining_optimize-level=3.wast
index 1064ba04d..ab9386533 100644
--- a/test/lit/passes/inlining_optimize-level=3.wast
+++ b/test/lit/passes/inlining_optimize-level=3.wast
@@ -503,23 +503,6 @@
 ;; (That avoids possible validation problems, and maximizes DCE.) To keep it
 ;; unreachable we'll add an unreachable instruction after the inlined code.
 (module
- ;; CHECK:      (type $0 (func (param f32)))
-
- ;; CHECK:      (type $1 (func))
-
- ;; CHECK:      (func $A (param $0 f32)
- ;; CHECK-NEXT:  (local $1 f32)
- ;; CHECK-NEXT:  (drop
- ;; CHECK-NEXT:   (block (result f32)
- ;; CHECK-NEXT:    (block $__inlined_func$C (result f32)
- ;; CHECK-NEXT:     (local.set $1
- ;; CHECK-NEXT:      (local.get $0)
- ;; CHECK-NEXT:     )
- ;; CHECK-NEXT:     (local.get $1)
- ;; CHECK-NEXT:    )
- ;; CHECK-NEXT:   )
- ;; CHECK-NEXT:  )
- ;; CHECK-NEXT: )
  (func $A (param $0 f32)
   (drop
    (call $C
@@ -527,12 +510,16 @@
    )
   )
  )
+ ;; CHECK:      (type $0 (func))
+
  ;; CHECK:      (func $B
  ;; CHECK-NEXT:  (local $0 f32)
- ;; CHECK-NEXT:  (call $A
- ;; CHECK-NEXT:   (block
- ;; CHECK-NEXT:    (block
- ;; CHECK-NEXT:     (drop
+ ;; CHECK-NEXT:  (local $1 f32)
+ ;; CHECK-NEXT:  (local $2 f32)
+ ;; CHECK-NEXT:  (block
+ ;; CHECK-NEXT:   (block $__inlined_func$A$3
+ ;; CHECK-NEXT:    (local.set $1
+ ;; CHECK-NEXT:     (block (result f32)
  ;; CHECK-NEXT:      (block $__inlined_func$C$2 (result f32)
  ;; CHECK-NEXT:       (local.tee $0
  ;; CHECK-NEXT:        (block
@@ -544,7 +531,19 @@
  ;; CHECK-NEXT:       (local.get $0)
  ;; CHECK-NEXT:      )
  ;; CHECK-NEXT:     )
- ;; CHECK-NEXT:     (unreachable)
+ ;; CHECK-NEXT:    )
+ ;; CHECK-NEXT:    (local.set $2
+ ;; CHECK-NEXT:     (f32.const 0)
+ ;; CHECK-NEXT:    )
+ ;; CHECK-NEXT:    (drop
+ ;; CHECK-NEXT:     (block (result f32)
+ ;; CHECK-NEXT:      (block $__inlined_func$C (result f32)
+ ;; CHECK-NEXT:       (local.set $2
+ ;; CHECK-NEXT:        (local.get $1)
+ ;; CHECK-NEXT:       )
+ ;; CHECK-NEXT:       (local.get $2)
+ ;; CHECK-NEXT:      )
+ ;; CHECK-NEXT:     )
  ;; CHECK-NEXT:    )
  ;; CHECK-NEXT:   )
  ;; CHECK-NEXT:  )
author	Alon Zakai <azakai@google.com>	2024-09-24 16:15:06 -0700
committer	GitHub <noreply@github.com>	2024-09-24 16:15:06 -0700
commit	ccef354cc8c0379e91b4dffb365fc69f616d6e25 (patch)
tree	21a4c7d7ed3df24696a9bab7b92cace4bed45354
parent	4d906204ebd3ad88a48350f008c7b72a0844e1da (diff)
download	binaryen-ccef354cc8c0379e91b4dffb365fc69f616d6e25.tar.gz binaryen-ccef354cc8c0379e91b4dffb365fc69f616d6e25.tar.bz2 binaryen-ccef354cc8c0379e91b4dffb365fc69f616d6e25.zip