From bfd01369a6dbb4629e88d227f085f959549e3dd5 Mon Sep 17 00:00:00 2001 From: Alon Zakai Date: Wed, 12 May 2021 07:43:35 -0700 Subject: Heap2Local: Use escape analysis to turn heap allocations into local data (#3866) If we allocate some GC data, and do not let the reference escape, then we can replace the allocation with locals, one local for each field in the allocation basically. This avoids the allocation, and also allows us to optimize the locals further. On the Dart DeltaBlue benchmark, this is a 24% speedup (making it faster than the JS version, incidentally), and also a 6% reduction in code size. The tests are not the best way to show what this does, as the pass assumes other passes will clean up after. Here is an example to clarify. First, in pseudocode: ref = new Int(42) do { ref.set(ref.get() + 1) } while (import(ref.get())) That is, we allocate an int on the heap and use it as a counter. Unnecessarily, as it could be a normal int on the stack. Wat: (module ;; A boxed integer: an entire struct just to hold an int. (type $boxed-int (struct (field (mut i32)))) (import "env" "import" (func $import (param i32) (result i32))) (func "example" (local $ref (ref null $boxed-int)) ;; Allocate a boxed integer of 42 and save the reference to it. (local.set $ref (struct.new_with_rtt $boxed-int (i32.const 42) (rtt.canon $boxed-int) ) ) ;; Increment the integer in a loop, looking for some condition. (loop $loop (struct.set $boxed-int 0 (local.get $ref) (i32.add (struct.get $boxed-int 0 (local.get $ref) ) (i32.const 1) ) ) (br_if $loop (call $import (struct.get $boxed-int 0 (local.get $ref) ) ) ) ) ) ) Before this pass, the optimizer could do essentially nothing with this. Even with this pass, running -O1 has no effect, as the pass is only used in -O2+. 
However, running --heap2local -O1 leads to this: (func $0 (local $0 i32) (local.set $0 (i32.const 42) ) (loop $loop (br_if $loop (call $import (local.tee $0 (i32.add (local.get $0) (i32.const 1) ) ) ) ) ) ) All the GC heap operations have been removed, and we just have a plain int now, allowing a bunch of other opts to run. That output is basically the optimal code, I think. --- src/passes/pass.cpp | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'src/passes/pass.cpp') diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp index 7790c3972..0d9bd62e5 100644 --- a/src/passes/pass.cpp +++ b/src/passes/pass.cpp @@ -148,6 +148,8 @@ void PassRegistry::registerPasses() { createGenerateI64DynCallsPass); registerPass( "generate-stack-ir", "generate Stack IR", createGenerateStackIRPass); + registerPass( + "heap2local", "replace GC allocations with locals", createHeap2LocalPass); registerPass( "inline-main", "inline __original_main into main", createInlineMainPass); registerPass("inlining", @@ -434,6 +436,9 @@ void PassRunner::addDefaultFunctionOptimizationPasses() { addIfNoDWARFIssues("reorder-locals"); // simplify-locals opens opportunities for optimizations addIfNoDWARFIssues("remove-unused-brs"); + if (options.optimizeLevel > 1 && wasm->features.hasGC()) { + addIfNoDWARFIssues("heap2local"); + } // if we are willing to work hard, also optimize copies before coalescing if (options.optimizeLevel >= 3 || options.shrinkLevel >= 2) { addIfNoDWARFIssues("merge-locals"); // very slow on e.g. sqlite -- cgit v1.2.3