From bfd01369a6dbb4629e88d227f085f959549e3dd5 Mon Sep 17 00:00:00 2001
From: Alon Zakai <azakai@google.com>
Date: Wed, 12 May 2021 07:43:35 -0700
Subject: Heap2Local: Use escape analysis to turn heap allocations into local
 data (#3866)

If we allocate some GC data, and do not let the reference escape, then we can
replace the allocation with locals, one local for each field in the allocation
basically. This avoids the allocation, and also allows us to optimize the locals
further.

On the Dart DeltaBlue benchmark, this is a 24% speedup (making it faster than
the JS version, incidentally), and also a 6% reduction in code size.

The tests are not the best way to show what this does, as the pass assumes
other passes will clean up after it. Here is an example to clarify. First, in
pseudocode:

ref = new Int(42)
do {
  ref.set(ref.get() + 1)
} while (import(ref.get()))

That is, we allocate an int on the heap and use it as a counter. Unnecessarily,
as it could be a normal int on the stack.

Wat:

(module
 ;; A boxed integer: an entire struct just to hold an int.
 (type $boxed-int (struct (field (mut i32))))

 ;; External function from "env": takes an i32 and returns an i32. Its result
 ;; is used as the loop condition below.
 (import "env" "import" (func $import (param i32) (result i32)))

 (func "example"
  ;; The reference is stored only in this local, and below it is used only as
  ;; the operand of struct.get/struct.set; it is never itself passed to a call.
  (local $ref (ref null $boxed-int))

  ;; Allocate a boxed integer of 42 and save the reference to it.
  (local.set $ref
   (struct.new_with_rtt $boxed-int
    (i32.const 42)
    (rtt.canon $boxed-int)
   )
  )

  ;; Increment the integer in a loop, looking for some condition.
  (loop $loop
   ;; field 0 = field 0 + 1
   (struct.set $boxed-int 0
    (local.get $ref)
    (i32.add
     (struct.get $boxed-int 0
      (local.get $ref)
     )
     (i32.const 1)
    )
   )
   ;; Pass the updated value (not the reference) to the import, and branch back
   ;; to the top of the loop if it returns a nonzero value.
   (br_if $loop
    (call $import
     (struct.get $boxed-int 0
      (local.get $ref)
     )
    )
   )
  )
 )
)

Before this pass, the optimizer could do essentially nothing with this.
Even with this pass, running -O1 has no effect, as the pass is only
used in -O2+. However, running --heap2local -O1 leads to this:

 (func $0
  (local $0 i32)
  (local.set $0
   (i32.const 42)
  )
  (loop $loop
   (br_if $loop
    (call $import
     (local.tee $0
      (i32.add
       (local.get $0)
       (i32.const 1)
      )
     )
    )
   )
  )
 )

All the GC heap operations have been removed, and we just
have a plain int now, allowing a bunch of other opts to run. That
output is basically the optimal code, I think.
---
 src/passes/pass.cpp | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'src/passes/pass.cpp')

diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp
index 7790c3972..0d9bd62e5 100644
--- a/src/passes/pass.cpp
+++ b/src/passes/pass.cpp
@@ -148,6 +148,8 @@ void PassRegistry::registerPasses() {
     createGenerateI64DynCallsPass);
   registerPass(
     "generate-stack-ir", "generate Stack IR", createGenerateStackIRPass);
+  registerPass(
+    "heap2local", "replace GC allocations with locals", createHeap2LocalPass);
   registerPass(
     "inline-main", "inline __original_main into main", createInlineMainPass);
   registerPass("inlining",
@@ -434,6 +436,9 @@ void PassRunner::addDefaultFunctionOptimizationPasses() {
   addIfNoDWARFIssues("reorder-locals");
   // simplify-locals opens opportunities for optimizations
   addIfNoDWARFIssues("remove-unused-brs");
+  if (options.optimizeLevel > 1 && wasm->features.hasGC()) {
+    addIfNoDWARFIssues("heap2local");
+  }
   // if we are willing to work hard, also optimize copies before coalescing
   if (options.optimizeLevel >= 3 || options.shrinkLevel >= 2) {
     addIfNoDWARFIssues("merge-locals"); // very slow on e.g. sqlite
-- 
cgit v1.2.3