Diffstat (limited to 'src')
-rw-r--r--  src/passes/CMakeLists.txt |   1
-rw-r--r--  src/passes/Heap2Local.cpp | 701
-rw-r--r--  src/passes/pass.cpp       |   5
-rw-r--r--  src/passes/passes.h       |   1
-rw-r--r--  src/wasm-interpreter.h    |   2
5 files changed, 709 insertions, 1 deletion
diff --git a/src/passes/CMakeLists.txt b/src/passes/CMakeLists.txt index 69fbfa672..2b8790a4f 100644 --- a/src/passes/CMakeLists.txt +++ b/src/passes/CMakeLists.txt @@ -29,6 +29,7 @@ set(passes_SOURCES Flatten.cpp FuncCastEmulation.cpp GenerateDynCalls.cpp + Heap2Local.cpp I64ToI32Lowering.cpp Inlining.cpp InstrumentLocals.cpp diff --git a/src/passes/Heap2Local.cpp b/src/passes/Heap2Local.cpp new file mode 100644 index 000000000..7e8f12990 --- /dev/null +++ b/src/passes/Heap2Local.cpp @@ -0,0 +1,701 @@ +/* + * Copyright 2021 WebAssembly Community Group participants + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// +// Find heap allocations that never escape the current function, and lower the +// allocation's data into locals. That is, avoid allocating a GC object, and +// instead use one local for each of its fields. +// +// To get a sense for what this pass does, here is an example to clarify. First, +// in pseudocode: +// +// ref = new Int(42) +// do { +// ref.set(ref.get() + 1) +// } while (import(ref.get()) +// +// That is, we allocate an int on the heap and use it as a counter. +// Unnecessarily, as it could be a normal int on the stack. +// +// Wat: +// +// (module +// ;; A boxed integer: an entire struct just to hold an int. +// (type $boxed-int (struct (field (mut i32)))) +// +// (import "env" "import" (func $import (param i32) (result i32))) +// +// (func "example" +// (local $ref (ref null $boxed-int)) +// +// ;; Allocate a boxed integer of 42 and save the reference to it. +// (local.set $ref +// (struct.new_with_rtt $boxed-int +// (i32.const 42) +// (rtt.canon $boxed-int) +// ) +// ) +// +// ;; Increment the integer in a loop, looking for some condition. +// (loop $loop +// (struct.set $boxed-int 0 +// (local.get $ref) +// (i32.add +// (struct.get $boxed-int 0 +// (local.get $ref) +// ) +// (i32.const 1) +// ) +// ) +// (br_if $loop +// (call $import +// (struct.get $boxed-int 0 +// (local.get $ref) +// ) +// ) +// ) +// ) +// ) +// ) +// +// Before this pass, the optimizer could do essentially nothing with this. Even +// with this pass, running -O1 has no effect, as the pass is only used in -O2+. +// However, running --heap2local -O1 leads to this: +// +// (func $0 +// (local $0 i32) +// (local.set $0 +// (i32.const 42) +// ) +// (loop $loop +// (br_if $loop +// (call $import +// (local.tee $0 +// (i32.add +// (local.get $0) +// (i32.const 1) +// ) +// ) +// ) +// ) +// ) +// ) +// +// All the GC heap operations have been removed, and we just have a plain int +// now, allowing a bunch of other opts to run. +// +// For us to replace an allocation with locals, we need to prove two things: +// +// * It must not escape from the function. If it escapes, we must pass out a +// reference anyhow. (In theory we could do a whole-program transformation +// to replace the reference with parameters in some cases, but inlining can +// hopefully let us optimize most cases.) +// * It must be used "exclusively", without overlap. 
That is, we cannot +// handle the case where a local.get might return our allocation, but might +// also get some other value. We also cannot handle a select where one arm +// is our allocation and another is something else. If the use is exclusive +// then we have a simple guarantee of being able to replace the heap +// allocation with the locals. +// +// Non-exclusive uses are optimizable too, but they require more work and add +// overhead. For example, here is a non-exclusive use: +// +// var x; +// if (..) { +// x = new Something(); // the allocation we want to optimize +// } else { +// x = something_else; +// } +// // 'x' here is not used exclusively by our allocation +// return x.field0; +// +// To optimize x.field0, we'd need to check if it contains our allocation or +// not, perhaps marking a boolean as true when it is, then doing an if on that +// local, etc.: +// +// var x_is_our_alloc; // whether x is our allocation +// var x; // keep x around for when it is not our allocation +// var x_field0; // the value of field0 on x, when x is our allocation +// if (..) { +// x_field0 = ..default value for the type.. +// x_is_our_alloc = true; +// } else { +// x = something_else; +// x_is_our_alloc = false; +// } +// return x_is_our_alloc ? x_field0 : x.field0; +// +// (node splitting/code duplication is another possible approach). On the other +// hand, if the allocation is used exclusively in all places (the if-else above +// does not have an else any more) then we can do this: +// +// var x_field0; // the value of field0 on x +// if (..) { +// x_field0 = ..default value for the type.. +// } +// return x_field0; +// +// This optimization focuses on such cases. +// + +#include "ir/branch-utils.h" +#include "ir/find_all.h" +#include "ir/local-graph.h" +#include "ir/parents.h" +#include "ir/properties.h" +#include "ir/type-updating.h" +#include "ir/utils.h" +#include "pass.h" +#include "support/unique_deferring_queue.h" +#include "wasm-builder.h" +#include "wasm.h" + +namespace wasm { + +namespace { + +struct Heap2LocalOptimizer { + Function* func; + Module* module; + const PassOptions& passOptions; + + // To find allocations that do not escape, we must track locals so that we + // can see how allocations flow from sets to gets and so forth. + // TODO: only scan reference types + LocalGraph localGraph; + + // To find what escapes, we need to follow where values flow, both up to + // parents, and via branches. + Parents parents; + BranchUtils::BranchTargets branchTargets; + + bool optimized = false; + + Heap2LocalOptimizer(Function* func, + Module* module, + const PassOptions& passOptions) + : func(func), module(module), passOptions(passOptions), localGraph(func), + parents(func->body), branchTargets(func->body) { + // We need to track what each set influences, to see where its value can + // flow to. + localGraph.computeSetInfluences(); + + // All the allocations in the function. + // TODO: Arrays (of constant size) as well. + FindAll<StructNew> allocations(func->body); + + for (auto* allocation : allocations.list) { + // The point of this optimization is to replace heap allocations with + // locals, so we must be able to place the data in locals. 
+ if (!canHandleAsLocals(allocation->type)) { + continue; + } + + if (convertToLocals(allocation)) { + optimized = true; + } + } + } + + bool canHandleAsLocals(Type type) { + if (type == Type::unreachable) { + return false; + } + auto& fields = type.getHeapType().getStruct().fields; + for (auto field : fields) { + if (!TypeUpdating::canHandleAsLocal(field.type)) { + return false; + } + if (field.isPacked()) { + // TODO: support packed fields by adding coercions/truncations. + return false; + } + } + return true; + } + + // Handles the rewriting that we do to perform the optimization. We store the + // data that rewriting will need here, while we analyze, and then if we can do + // the optimization, we tell it to run. + // + // TODO: Doing a single rewrite walk at the end would be more efficient, but + // it would need to be more complex. + struct Rewriter : PostWalker<Rewriter> { + StructNew* allocation; + Function* func; + Builder builder; + const FieldList& fields; + + Rewriter(StructNew* allocation, Function* func, Module* module) + : allocation(allocation), func(func), builder(*module), + fields(allocation->type.getHeapType().getStruct().fields) {} + + // We must track all the local.sets that write the allocation, to verify + // exclusivity. + std::unordered_set<LocalSet*> sets; + + // All the expressions we reached during the flow analysis. That is exactly + // all the places where our allocation is used. We track these so that we + // can fix them up at the end, if the optimization ends up possible. + std::unordered_set<Expression*> reached; + + // Maps indexes in the struct to the local index that will replace them. + std::vector<Index> localIndexes; + + void applyOptimization() { + // Allocate locals to store the allocation's fields in. + for (auto field : fields) { + localIndexes.push_back(builder.addVar(func, field.type)); + } + + // Replace the things we need to using the visit* methods. + walk(func->body); + } + + void visitLocalSet(LocalSet* curr) { + if (!reached.count(curr)) { + return; + } + + // We don't need any sets of the reference to any of the locals it + // originally was written to. + // + // Note that after we remove the sets, other passes can easily remove the + // gets, and so we do not bother to do anything for them. (Also, in + // general it is not trivial to replace the gets - we'd need something of + // the same type, but the type might be a non-nullable reference type in + // the case of a parameter, and in the future maybe of some locals.) + if (curr->isTee()) { + replaceCurrent(curr->value); + } else { + replaceCurrent(builder.makeDrop(curr->value)); + } + } + + void visitStructNew(StructNew* curr) { + if (curr != allocation) { + return; + } + + // We do not remove the allocation itself here, rather we make it + // unnecessary, and then depend on other optimizations to clean up. (We + // cannot simply remove it because we need to replace it with something of + // the same non-nullable type.) + + // First, assign the initial values to the new locals. + std::vector<Expression*> contents; + + if (!allocation->isWithDefault()) { + // We must assign the initial values to temp indexes, then copy them + // over all at once. If instead we did set them as we go, then we might + // hit a problem like this: + // + // (local.set X (new_X)) + // (local.set Y (block (result ..) + // (.. (local.get X) ..) 
;; returns new_X, wrongly + // (new_Y) + // ) + // + // Note how we assign to the local X and use it during the assignment to + // the local Y - but we should still see the old value of X, not new_X. + // Temp locals X', Y' can ensure that: + // + // (local.set X' (new_X)) + // (local.set Y' (block (result ..) + // (.. (local.get X) ..) ;; returns the proper, old X + // (new_Y) + // ) + // .. + // (local.set X (local.get X')) + // (local.set Y (local.get Y')) + std::vector<Index> tempIndexes; + + for (auto field : fields) { + tempIndexes.push_back(builder.addVar(func, field.type)); + } + + // Store the initial values into the temp locals. + for (Index i = 0; i < tempIndexes.size(); i++) { + contents.push_back( + builder.makeLocalSet(tempIndexes[i], allocation->operands[i])); + } + + // Copy them to the normal ones. + for (Index i = 0; i < tempIndexes.size(); i++) { + contents.push_back(builder.makeLocalSet( + localIndexes[i], + builder.makeLocalGet(tempIndexes[i], fields[i].type))); + } + + // Read the values in the allocation (we don't need to, as the + // allocation is not used after our optimization, but we need something + // with the right type anyhow). + for (Index i = 0; i < tempIndexes.size(); i++) { + allocation->operands[i] = + builder.makeLocalGet(localIndexes[i], fields[i].type); + } + + // TODO Check if the nondefault case does not increase code size in some + // cases. A heap allocation that implicitly sets the default values + // is smaller than multiple explicit settings of locals to + // defaults. + } else { + // Set the default values, and replace the allocation with a block that + // first does that, then contains the allocation. + // Note that we must assign the defaults because we might be in a loop, + // that is, there might be a previous value. + for (Index i = 0; i < localIndexes.size(); i++) { + contents.push_back(builder.makeLocalSet( + localIndexes[i], + builder.makeConstantExpression(Literal::makeZero(fields[i].type)))); + } + } + + // Put the allocation itself at the end of the block, so the block has the + // exact same type as the allocation it replaces. + contents.push_back(allocation); + replaceCurrent(builder.makeBlock(contents)); + } + + void visitStructSet(StructSet* curr) { + if (!reached.count(curr)) { + return; + } + + // Drop the ref (leaving it to other opts to remove, when possible), and + // write the data to the local instead of the heap allocation. + replaceCurrent(builder.makeSequence( + builder.makeDrop(curr->ref), + builder.makeLocalSet(localIndexes[curr->index], curr->value))); + } + + void visitStructGet(StructGet* curr) { + if (!reached.count(curr)) { + return; + } + + replaceCurrent( + builder.makeSequence(builder.makeDrop(curr->ref), + builder.makeLocalGet(localIndexes[curr->index], + fields[curr->index].type))); + } + }; + + enum class ParentChildInteraction { + // The parent lets the child escape. E.g. the parent is a call. + Escapes, + // The parent fully consumes the child in a safe, non-escaping way, and + // after consuming it nothing remains to flow further through the parent. + // E.g. the parent is a struct.get, which reads from the allocated heap + // value and does nothing more with the reference. + FullyConsumes, + // The parent flows the child out, that is, the child is the single value + // that can flow out from the parent. E.g. the parent is a block with no + // branches and the child is the final value that is returned. 
+ Flows, + // The parent does not consume the child completely, so the child's value + // can be used through it. However the child does not flow cleanly through. + // E.g. the parent is a block with branches, and the value on them may be + // returned from the block and not only the child. This means the allocation + // is not used in an exclusive way, and we cannot optimize it. + Mixes, + }; + + // Analyze an allocation to see if we can convert it from a heap allocation to + // locals. + bool convertToLocals(StructNew* allocation) { + Rewriter rewriter(allocation, func, module); + + // A queue of flows from children to parents. When something is in the queue + // here then it assumed that it is ok for the allocation to be at the child + // (that is, we have already checked the child before placing it in the + // queue), and we need to check if it is ok to be at the parent, and to flow + // from the child to the parent. We will analyze that (see + // ParentChildInteraction, above) and continue accordingly. + using ChildAndParent = std::pair<Expression*, Expression*>; + UniqueNonrepeatingDeferredQueue<ChildAndParent> flows; + + // Start the flow from the allocation itself to its parent. + flows.push({allocation, parents.getParent(allocation)}); + + // Keep flowing while we can. + while (!flows.empty()) { + auto flow = flows.pop(); + auto* child = flow.first; + auto* parent = flow.second; + + // If we've already seen an expression, stop since we cannot optimize + // things that overlap in any way (see the notes on exclusivity, above). + // Note that we use a nonrepeating queue here, so we already do not visit + // the same thing more than once; what this check does is verify we don't + // look at something that another allocation reached, which would be in a + // different call to this function and use a different queue (any overlap + // between calls would prove non-exclusivity). + if (seen.count(child)) { + return false; + } + seen.insert(child); + + switch (getParentChildInteraction(parent, child)) { + case ParentChildInteraction::Escapes: { + // If the parent may let us escape then we are done. + return false; + } + case ParentChildInteraction::FullyConsumes: { + // If the parent consumes us without letting us escape then all is + // well (and there is nothing flowing from the parent to check). + break; + } + case ParentChildInteraction::Flows: { + // The value flows through the parent; we need to look further at the + // grandparent. + flows.push({parent, parents.getParent(parent)}); + break; + } + case ParentChildInteraction::Mixes: { + // Our allocation is not used exclusively via the parent, as other + // values are mixed with it. Give up. + return false; + } + } + + if (auto* set = parent->dynCast<LocalSet>()) { + // This is one of the sets we are written to, and so we must check for + // exclusive use of our allocation by all the gets that read the value. + // Note the set, and we will check the gets at the end once we know all + // of our sets. + rewriter.sets.insert(set); + + // We must also look at how the value flows from those gets. + if (auto* getsReached = getGetsReached(set)) { + for (auto* get : *getsReached) { + flows.push({get, parents.getParent(get)}); + } + } + } + + // If the parent may send us on a branch, we will need to look at the flow + // to the branch target(s). 
+ for (auto name : branchesSentByParent(child, parent)) { + flows.push({parent, branchTargets.getTarget(name)}); + } + + // If we got to here, then we can continue to hope that we can optimize + // this allocation. Mark the parent as reached by it, and continue. + rewriter.reached.insert(parent); + } + + // We finished the loop over the flows. Do the final checks. + if (!getsAreExclusiveToSets(rewriter.sets)) { + return false; + } + + // We can do it, hurray! + rewriter.applyOptimization(); + + return true; + } + + // All the expressions we have already looked at. + std::unordered_set<Expression*> seen; + + ParentChildInteraction getParentChildInteraction(Expression* parent, + Expression* child) { + // If there is no parent then we are the body of the function, and that + // means we escape by flowing to the caller. + if (!parent) { + return ParentChildInteraction::Escapes; + } + + struct Checker : public Visitor<Checker> { + Expression* child; + + // Assume escaping (or some other problem we cannot analyze) unless we are + // certain otherwise. + bool escapes = true; + + // Assume we do not fully consume the value unless we are certain + // otherwise. If this is set to true, then we do not need to check any + // further. If it remains false, then we will analyze the value that + // falls through later to check for mixing. + // + // Note that this does not need to be set for expressions if their type + // proves that the value does not continue onwards (e.g. if their type is + // none, or not a reference type), but for clarity some do still mark this + // field as true when it is clearly so. + bool fullyConsumes = false; + + // General operations + void visitBlock(Block* curr) { + escapes = false; + // We do not mark fullyConsumes as the value may continue through this + // and other control flow structures. + } + // Note that If is not supported here, because for our value to flow + // through it there must be an if-else, and that means there is no single + // value falling through anyhow. + void visitLoop(Loop* curr) { escapes = false; } + void visitDrop(Drop* curr) { + escapes = false; + fullyConsumes = true; + } + void visitBreak(Break* curr) { escapes = false; } + void visitSwitch(Switch* curr) { escapes = false; } + + // Local operations. Locals by themselves do not escape; the analysis + // tracks where locals are used. + void visitLocalGet(LocalGet* curr) { escapes = false; } + void visitLocalSet(LocalSet* curr) { escapes = false; } + + // Reference operations. TODO add more + void visitRefAs(RefAs* curr) { + // TODO General OptimizeInstructions integration, that is, since we know + // that our allocation is what flows into this RefAs, we can + // know the exact outcome of the operation. + if (curr->op == RefAsNonNull) { + // As it is our allocation that flows through here, we know it is not + // null (so there is no trap), and we can continue to (hopefully) + // optimize this allocation. + escapes = false; + + // Note that while we can look through this operation, we cannot get + // rid of it later, as its parent might depend on receiving a + // non-nullable type. So we will leave the RefAsNonNull as it is, + // even if we do optimize the allocation, and we depend on other + // passes to remove the RefAsNonNull. + } + } + + // GC operations. + void visitStructSet(StructSet* curr) { + // The reference does not escape (but the value is stored to memory and + // therefore might). 
+ if (curr->ref == child) { + escapes = false; + fullyConsumes = true; + } + } + void visitStructGet(StructGet* curr) { + escapes = false; + fullyConsumes = true; + } + + // TODO Array and I31 operations + } checker; + + checker.child = child; + checker.visit(parent); + + if (checker.escapes) { + return ParentChildInteraction::Escapes; + } + + // If the parent returns a type that is not a reference, then by definition + // it fully consumes the value as it does not flow our allocation onward. + if (checker.fullyConsumes || !parent->type.isRef()) { + return ParentChildInteraction::FullyConsumes; + } + + // Finally, check for mixing. If the child is the immediate fallthrough + // of the parent then no other values can be mixed in. + // + // TODO: Also check if we are sent via a branch (and that branch sends the + // single value exiting the parent). + // TODO: Also check for safe merges where our allocation is in all places, + // like two if or select arms, or branches. + if (Properties::getImmediateFallthrough( + parent, passOptions, module->features) == child) { + return ParentChildInteraction::Flows; + } + + return ParentChildInteraction::Mixes; + } + + std::unordered_set<LocalGet*>* getGetsReached(LocalSet* set) { + auto iter = localGraph.setInfluences.find(set); + if (iter != localGraph.setInfluences.end()) { + return &iter->second; + } + return nullptr; + } + + BranchUtils::NameSet branchesSentByParent(Expression* child, + Expression* parent) { + BranchUtils::NameSet names; + BranchUtils::operateOnScopeNameUsesAndSentValues( + parent, [&](Name name, Expression* value) { + if (value == child) { + names.insert(name); + } + }); + return names; + } + + // Verify exclusivity of all the gets for a bunch of sets. That is, assuming + // the sets are exclusive (they all write exactly our allocation, and nothing + // else), we need to check whether all the gets that read that value cannot + // read anything else (which would be the case if another set writes to that + // local, in the right live range). + bool getsAreExclusiveToSets(const std::unordered_set<LocalSet*>& sets) { + // Find all the relevant gets (which may overlap between the sets). + std::unordered_set<LocalGet*> gets; + for (auto* set : sets) { + if (auto* getsReached = getGetsReached(set)) { + for (auto* get : *getsReached) { + gets.insert(get); + } + } + } + + // Check that the gets can only read from the specific known sets. + for (auto* get : gets) { + for (auto* set : localGraph.getSetses[get]) { + if (sets.count(set) == 0) { + return false; + } + } + } + + return true; + } +}; + +struct Heap2Local : public WalkerPass<PostWalker<Heap2Local>> { + bool isFunctionParallel() override { return true; } + + Pass* create() override { return new Heap2Local(); } + + void doWalkFunction(Function* func) { + // Multiple rounds of optimization may work in theory, as once we turn one + // allocation into locals, references written to its fields become + // references written to locals, which we may see do not escape. However, + // this does not work yet, since we do not remove the original allocation - + // we just "detach" it from other things and then depend on other + // optimizations to remove it. That means this pass must be interleaved with + // vacuum, in particular, to optimize such nested allocations. + // TODO Consider running multiple iterations here, and running vacuum in + // between them. 
+ if (Heap2LocalOptimizer(func, getModule(), getPassOptions()).optimized) { + TypeUpdating::handleNonDefaultableLocals(func, *getModule()); + } + } +}; + +} // anonymous namespace + +Pass* createHeap2LocalPass() { return new Heap2Local(); } + +} // namespace wasm diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp index 7790c3972..0d9bd62e5 100644 --- a/src/passes/pass.cpp +++ b/src/passes/pass.cpp @@ -149,6 +149,8 @@ void PassRegistry::registerPasses() { registerPass( "generate-stack-ir", "generate Stack IR", createGenerateStackIRPass); registerPass( + "heap2local", "replace GC allocations with locals", createHeap2LocalPass); + registerPass( "inline-main", "inline __original_main into main", createInlineMainPass); registerPass("inlining", "inline functions (you probably want inlining-optimizing)", @@ -434,6 +436,9 @@ void PassRunner::addDefaultFunctionOptimizationPasses() { addIfNoDWARFIssues("reorder-locals"); // simplify-locals opens opportunities for optimizations addIfNoDWARFIssues("remove-unused-brs"); + if (options.optimizeLevel > 1 && wasm->features.hasGC()) { + addIfNoDWARFIssues("heap2local"); + } // if we are willing to work hard, also optimize copies before coalescing if (options.optimizeLevel >= 3 || options.shrinkLevel >= 2) { addIfNoDWARFIssues("merge-locals"); // very slow on e.g. sqlite diff --git a/src/passes/passes.h b/src/passes/passes.h index 79d344f1a..4cc8040a4 100644 --- a/src/passes/passes.h +++ b/src/passes/passes.h @@ -49,6 +49,7 @@ Pass* createFunctionMetricsPass(); Pass* createGenerateDynCallsPass(); Pass* createGenerateI64DynCallsPass(); Pass* createGenerateStackIRPass(); +Pass* createHeap2LocalPass(); Pass* createI64ToI32LoweringPass(); Pass* createInlineMainPass(); Pass* createInliningPass(); diff --git a/src/wasm-interpreter.h b/src/wasm-interpreter.h index 4cb26d74a..b38272acd 100644 --- a/src/wasm-interpreter.h +++ b/src/wasm-interpreter.h @@ -1443,7 +1443,7 @@ public: // We must have a module in order to perform the cast, to get the type. If // we do not have one, or if the function is not present (which may happen // if we are optimizing a function before the entire module is built), - // then this is not something we cannot precompute. + // then this is something we cannot precompute. auto* func = module ? module->getFunctionOrNull(cast.originalRef.getFunc()) : nullptr; |
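
For anyone trying this commit out, here is a minimal sketch of driving the new pass programmatically from inside the Binaryen tree. It uses the existing PassRunner API and the "heap2local" name registered in pass.cpp above; the helper function and the follow-up "vacuum" step are illustrative, not part of this change (the pass itself only detaches allocations and, as its comments note, relies on later passes to actually remove the unused struct.new).

    #include "pass.h"
    #include "wasm.h"

    // Run Heap2Local on every function in a module, then run vacuum to
    // clean up the allocations that the pass left behind but detached.
    void runHeap2Local(wasm::Module& module) {
      wasm::PassRunner runner(&module);
      runner.add("heap2local"); // name registered in PassRegistry (pass.cpp)
      runner.add("vacuum");     // remove the now-unused allocations
      runner.run();
    }

From the command line the same pass is available as wasm-opt --heap2local (for example together with -O1, as the file's header comment shows), and per the pass.cpp hunk above it is added automatically to the default pipeline at -O2 and higher when the GC feature is enabled.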