Diffstat (limited to 'src/passes/Heap2Local.cpp')
-rw-r--r--  src/passes/Heap2Local.cpp  701
1 file changed, 701 insertions, 0 deletions
diff --git a/src/passes/Heap2Local.cpp b/src/passes/Heap2Local.cpp
new file mode 100644
index 000000000..7e8f12990
--- /dev/null
+++ b/src/passes/Heap2Local.cpp
@@ -0,0 +1,701 @@
+/*
+ * Copyright 2021 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//
+// Find heap allocations that never escape the current function, and lower the
+// allocation's data into locals. That is, avoid allocating a GC object, and
+// instead use one local for each of its fields.
+//
+// To get a sense for what this pass does, here is an example to clarify. First,
+// in pseudocode:
+//
+// ref = new Int(42)
+// do {
+// ref.set(ref.get() + 1)
+//  } while (import(ref.get()))
+//
+// That is, we allocate an int on the heap and use it as a counter.
+// Unnecessarily, as it could be a normal int on the stack.
+//
+// Wat:
+//
+// (module
+// ;; A boxed integer: an entire struct just to hold an int.
+// (type $boxed-int (struct (field (mut i32))))
+//
+// (import "env" "import" (func $import (param i32) (result i32)))
+//
+// (func "example"
+// (local $ref (ref null $boxed-int))
+//
+// ;; Allocate a boxed integer of 42 and save the reference to it.
+// (local.set $ref
+// (struct.new_with_rtt $boxed-int
+// (i32.const 42)
+// (rtt.canon $boxed-int)
+// )
+// )
+//
+// ;; Increment the integer in a loop, looking for some condition.
+// (loop $loop
+// (struct.set $boxed-int 0
+// (local.get $ref)
+// (i32.add
+// (struct.get $boxed-int 0
+// (local.get $ref)
+// )
+// (i32.const 1)
+// )
+// )
+// (br_if $loop
+// (call $import
+// (struct.get $boxed-int 0
+// (local.get $ref)
+// )
+// )
+// )
+// )
+// )
+// )
+//
+// Before this pass, the optimizer could do essentially nothing with this. Even
+// with this pass, running -O1 has no effect, as the pass is only used in -O2+.
+// However, running --heap2local -O1 leads to this:
+//
+// (func $0
+// (local $0 i32)
+// (local.set $0
+// (i32.const 42)
+// )
+// (loop $loop
+// (br_if $loop
+// (call $import
+// (local.tee $0
+// (i32.add
+// (local.get $0)
+// (i32.const 1)
+// )
+// )
+// )
+// )
+// )
+// )
+//
+// All the GC heap operations have been removed, and we just have a plain int
+// now, allowing a bunch of other opts to run.
+//
+// For us to replace an allocation with locals, we need to prove two things:
+//
+// * It must not escape from the function. If it escapes, we must pass out a
+// reference anyhow. (In theory we could do a whole-program transformation
+// to replace the reference with parameters in some cases, but inlining can
+// hopefully let us optimize most cases.)
+// * It must be used "exclusively", without overlap. That is, we cannot
+// handle the case where a local.get might return our allocation, but might
+// also get some other value. We also cannot handle a select where one arm
+// is our allocation and another is something else. If the use is exclusive
+// then we have a simple guarantee of being able to replace the heap
+// allocation with the locals.
+//
+// Non-exclusive uses are optimizable too, but they require more work and add
+// overhead. For example, here is a non-exclusive use:
+//
+// var x;
+// if (..) {
+// x = new Something(); // the allocation we want to optimize
+// } else {
+// x = something_else;
+// }
+// // 'x' here is not used exclusively by our allocation
+// return x.field0;
+//
+// To optimize x.field0, we'd need to check if it contains our allocation or
+// not, perhaps marking a boolean as true when it is, then doing an if on that
+// local, etc.:
+//
+// var x_is_our_alloc; // whether x is our allocation
+// var x; // keep x around for when it is not our allocation
+// var x_field0; // the value of field0 on x, when x is our allocation
+// if (..) {
+// x_field0 = ..default value for the type..
+// x_is_our_alloc = true;
+// } else {
+// x = something_else;
+// x_is_our_alloc = false;
+// }
+// return x_is_our_alloc ? x_field0 : x.field0;
+//
+// (node splitting/code duplication is another possible approach). On the other
+// hand, if the allocation is used exclusively in all places (the if-else above
+// does not have an else any more) then we can do this:
+//
+// var x_field0; // the value of field0 on x
+// if (..) {
+// x_field0 = ..default value for the type..
+// }
+// return x_field0;
+//
+// This optimization focuses on such cases.
+//
+
+#include "ir/branch-utils.h"
+#include "ir/find_all.h"
+#include "ir/local-graph.h"
+#include "ir/parents.h"
+#include "ir/properties.h"
+#include "ir/type-updating.h"
+#include "ir/utils.h"
+#include "pass.h"
+#include "support/unique_deferring_queue.h"
+#include "wasm-builder.h"
+#include "wasm.h"
+
+namespace wasm {
+
+namespace {
+
+struct Heap2LocalOptimizer {
+ Function* func;
+ Module* module;
+ const PassOptions& passOptions;
+
+ // To find allocations that do not escape, we must track locals so that we
+ // can see how allocations flow from sets to gets and so forth.
+ // TODO: only scan reference types
+ LocalGraph localGraph;
+
+ // To find what escapes, we need to follow where values flow, both up to
+ // parents, and via branches.
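+  //
+  // For example (a sketch; the names here are illustrative), in
+  //
+  //  (block $target (result (ref null $T))
+  //   (br $target
+  //    (local.get $ref)   ;; may hold our allocation
+  //   )
+  //  )
+  //
+  // the value sent on the br flows to the enclosing block, so the analysis
+  // must continue from that block and onward to its parent.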
+ Parents parents;
+ BranchUtils::BranchTargets branchTargets;
+
+ bool optimized = false;
+
+ Heap2LocalOptimizer(Function* func,
+ Module* module,
+ const PassOptions& passOptions)
+ : func(func), module(module), passOptions(passOptions), localGraph(func),
+ parents(func->body), branchTargets(func->body) {
+ // We need to track what each set influences, to see where its value can
+ // flow to.
+ localGraph.computeSetInfluences();
+
+ // All the allocations in the function.
+ // TODO: Arrays (of constant size) as well.
+ FindAll<StructNew> allocations(func->body);
+
+ for (auto* allocation : allocations.list) {
+ // The point of this optimization is to replace heap allocations with
+ // locals, so we must be able to place the data in locals.
+ if (!canHandleAsLocals(allocation->type)) {
+ continue;
+ }
+
+ if (convertToLocals(allocation)) {
+ optimized = true;
+ }
+ }
+ }
+
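+  // Whether we can lower a struct type's fields into locals. For example (a
+  // sketch; the types are illustrative), (struct (field (mut i32))) can be
+  // lowered to a single i32 local, while (struct (field (mut i8))) cannot be
+  // handled yet, as the field is packed (see the TODO below).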
+ bool canHandleAsLocals(Type type) {
+ if (type == Type::unreachable) {
+ return false;
+ }
+ auto& fields = type.getHeapType().getStruct().fields;
+ for (auto field : fields) {
+ if (!TypeUpdating::canHandleAsLocal(field.type)) {
+ return false;
+ }
+ if (field.isPacked()) {
+ // TODO: support packed fields by adding coercions/truncations.
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Handles the rewriting that we do to perform the optimization. We store the
+ // data that rewriting will need here, while we analyze, and then if we can do
+ // the optimization, we tell it to run.
+ //
+ // TODO: Doing a single rewrite walk at the end would be more efficient, but
+ // it would need to be more complex.
+ struct Rewriter : PostWalker<Rewriter> {
+ StructNew* allocation;
+ Function* func;
+ Builder builder;
+ const FieldList& fields;
+
+ Rewriter(StructNew* allocation, Function* func, Module* module)
+ : allocation(allocation), func(func), builder(*module),
+ fields(allocation->type.getHeapType().getStruct().fields) {}
+
+ // We must track all the local.sets that write the allocation, to verify
+ // exclusivity.
+ std::unordered_set<LocalSet*> sets;
+
+ // All the expressions we reached during the flow analysis. That is exactly
+ // all the places where our allocation is used. We track these so that we
+ // can fix them up at the end, if the optimization ends up possible.
+ std::unordered_set<Expression*> reached;
+
+ // Maps indexes in the struct to the local index that will replace them.
+ std::vector<Index> localIndexes;
+
+ void applyOptimization() {
+ // Allocate locals to store the allocation's fields in.
+ for (auto field : fields) {
+ localIndexes.push_back(builder.addVar(func, field.type));
+ }
+
+ // Replace the things we need to using the visit* methods.
+ walk(func->body);
+ }
+
+ void visitLocalSet(LocalSet* curr) {
+ if (!reached.count(curr)) {
+ return;
+ }
+
+      // We no longer need any of the sets that write the reference to the
+      // locals it was originally stored in.
+ //
+ // Note that after we remove the sets, other passes can easily remove the
+ // gets, and so we do not bother to do anything for them. (Also, in
+ // general it is not trivial to replace the gets - we'd need something of
+ // the same type, but the type might be a non-nullable reference type in
+ // the case of a parameter, and in the future maybe of some locals.)
+ if (curr->isTee()) {
+ replaceCurrent(curr->value);
+ } else {
+ replaceCurrent(builder.makeDrop(curr->value));
+ }
+ }
+
+ void visitStructNew(StructNew* curr) {
+ if (curr != allocation) {
+ return;
+ }
+
+ // We do not remove the allocation itself here, rather we make it
+ // unnecessary, and then depend on other optimizations to clean up. (We
+ // cannot simply remove it because we need to replace it with something of
+ // the same non-nullable type.)
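+      //
+      // For example (a sketch; the local names are illustrative), the
+      // $boxed-int allocation from the comment at the top of this file,
+      //
+      //  (struct.new_with_rtt $boxed-int
+      //   (i32.const 42)
+      //   (rtt.canon $boxed-int)
+      //  )
+      //
+      // ends up roughly as
+      //
+      //  (block (result (ref $boxed-int))
+      //   (local.set $temp (i32.const 42))
+      //   (local.set $field0 (local.get $temp))
+      //   (struct.new_with_rtt $boxed-int
+      //    (local.get $field0)
+      //    (rtt.canon $boxed-int)
+      //   )
+      //  )
+      //
+      // where the struct.new remains only to provide a value of the proper
+      // type, and later passes can remove it.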
+
+ // First, assign the initial values to the new locals.
+ std::vector<Expression*> contents;
+
+ if (!allocation->isWithDefault()) {
+ // We must assign the initial values to temp indexes, then copy them
+ // over all at once. If instead we did set them as we go, then we might
+ // hit a problem like this:
+ //
+ // (local.set X (new_X))
+ // (local.set Y (block (result ..)
+ // (.. (local.get X) ..) ;; returns new_X, wrongly
+ // (new_Y)
+        //  ))
+ //
+ // Note how we assign to the local X and use it during the assignment to
+ // the local Y - but we should still see the old value of X, not new_X.
+ // Temp locals X', Y' can ensure that:
+ //
+ // (local.set X' (new_X))
+ // (local.set Y' (block (result ..)
+ // (.. (local.get X) ..) ;; returns the proper, old X
+ // (new_Y)
+        //  ))
+ // ..
+ // (local.set X (local.get X'))
+ // (local.set Y (local.get Y'))
+ std::vector<Index> tempIndexes;
+
+ for (auto field : fields) {
+ tempIndexes.push_back(builder.addVar(func, field.type));
+ }
+
+ // Store the initial values into the temp locals.
+ for (Index i = 0; i < tempIndexes.size(); i++) {
+ contents.push_back(
+ builder.makeLocalSet(tempIndexes[i], allocation->operands[i]));
+ }
+
+ // Copy them to the normal ones.
+ for (Index i = 0; i < tempIndexes.size(); i++) {
+ contents.push_back(builder.makeLocalSet(
+ localIndexes[i],
+ builder.makeLocalGet(tempIndexes[i], fields[i].type)));
+ }
+
+        // Replace the allocation's operands with reads of the new locals. (We
+        // don't actually need the values, as the allocation is not used after
+        // our optimization, but its operands need something of the right type
+        // anyhow.)
+ for (Index i = 0; i < tempIndexes.size(); i++) {
+ allocation->operands[i] =
+ builder.makeLocalGet(localIndexes[i], fields[i].type);
+ }
+
+ // TODO Check if the nondefault case does not increase code size in some
+ // cases. A heap allocation that implicitly sets the default values
+ // is smaller than multiple explicit settings of locals to
+ // defaults.
+ } else {
+ // Set the default values, and replace the allocation with a block that
+ // first does that, then contains the allocation.
+ // Note that we must assign the defaults because we might be in a loop,
+ // that is, there might be a previous value.
+ for (Index i = 0; i < localIndexes.size(); i++) {
+ contents.push_back(builder.makeLocalSet(
+ localIndexes[i],
+ builder.makeConstantExpression(Literal::makeZero(fields[i].type))));
+ }
+ }
+
+ // Put the allocation itself at the end of the block, so the block has the
+ // exact same type as the allocation it replaces.
+ contents.push_back(allocation);
+ replaceCurrent(builder.makeBlock(contents));
+ }
+
+ void visitStructSet(StructSet* curr) {
+ if (!reached.count(curr)) {
+ return;
+ }
+
+ // Drop the ref (leaving it to other opts to remove, when possible), and
+ // write the data to the local instead of the heap allocation.
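+      //
+      // For example (a sketch; the local name $field0 is illustrative),
+      //
+      //  (struct.set $boxed-int 0
+      //   (local.get $ref)
+      //   (X)
+      //  )
+      //
+      // becomes roughly
+      //
+      //  (block
+      //   (drop (local.get $ref))
+      //   (local.set $field0 (X))
+      //  )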
+ replaceCurrent(builder.makeSequence(
+ builder.makeDrop(curr->ref),
+ builder.makeLocalSet(localIndexes[curr->index], curr->value)));
+ }
+
+ void visitStructGet(StructGet* curr) {
+ if (!reached.count(curr)) {
+ return;
+ }
+
+ replaceCurrent(
+ builder.makeSequence(builder.makeDrop(curr->ref),
+ builder.makeLocalGet(localIndexes[curr->index],
+ fields[curr->index].type)));
+ }
+ };
+
+ enum class ParentChildInteraction {
+ // The parent lets the child escape. E.g. the parent is a call.
+ Escapes,
+ // The parent fully consumes the child in a safe, non-escaping way, and
+ // after consuming it nothing remains to flow further through the parent.
+ // E.g. the parent is a struct.get, which reads from the allocated heap
+ // value and does nothing more with the reference.
+ FullyConsumes,
+ // The parent flows the child out, that is, the child is the single value
+ // that can flow out from the parent. E.g. the parent is a block with no
+ // branches and the child is the final value that is returned.
+ Flows,
+    // The parent does not consume the child completely, so the child's value
+    // can be used through it. However, the child does not flow cleanly
+    // through: e.g. the parent is a block with branches, and values sent on
+    // those branches may exit the block, not only the child. This means the
+    // allocation is not used in an exclusive way, and we cannot optimize it.
+ Mixes,
+ };
+
+ // Analyze an allocation to see if we can convert it from a heap allocation to
+ // locals.
+ bool convertToLocals(StructNew* allocation) {
+ Rewriter rewriter(allocation, func, module);
+
+    // A queue of flows from children to parents. When something is in this
+    // queue, it is assumed that it is ok for the allocation to be at the child
+    // (that is, we have already checked the child before placing it in the
+    // queue), and we need to check if it is ok for it to be at the parent, and
+    // to flow from the child to the parent. We will analyze that (see
+    // ParentChildInteraction, above) and continue accordingly.
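+    //
+    // For example, for the module in the comment at the top of this file, the
+    // flow proceeds roughly as follows: the struct.new flows to its parent,
+    // the local.set of $ref; the gets of that set are then enqueued, and each
+    // flows to its parent, a struct.get or struct.set, which fully consumes
+    // the reference. Nothing escapes, so we can optimize.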
+ using ChildAndParent = std::pair<Expression*, Expression*>;
+ UniqueNonrepeatingDeferredQueue<ChildAndParent> flows;
+
+ // Start the flow from the allocation itself to its parent.
+ flows.push({allocation, parents.getParent(allocation)});
+
+ // Keep flowing while we can.
+ while (!flows.empty()) {
+ auto flow = flows.pop();
+ auto* child = flow.first;
+ auto* parent = flow.second;
+
+ // If we've already seen an expression, stop since we cannot optimize
+ // things that overlap in any way (see the notes on exclusivity, above).
+ // Note that we use a nonrepeating queue here, so we already do not visit
+ // the same thing more than once; what this check does is verify we don't
+ // look at something that another allocation reached, which would be in a
+ // different call to this function and use a different queue (any overlap
+ // between calls would prove non-exclusivity).
+ if (seen.count(child)) {
+ return false;
+ }
+ seen.insert(child);
+
+ switch (getParentChildInteraction(parent, child)) {
+ case ParentChildInteraction::Escapes: {
+ // If the parent may let us escape then we are done.
+ return false;
+ }
+ case ParentChildInteraction::FullyConsumes: {
+ // If the parent consumes us without letting us escape then all is
+ // well (and there is nothing flowing from the parent to check).
+ break;
+ }
+ case ParentChildInteraction::Flows: {
+ // The value flows through the parent; we need to look further at the
+ // grandparent.
+ flows.push({parent, parents.getParent(parent)});
+ break;
+ }
+ case ParentChildInteraction::Mixes: {
+ // Our allocation is not used exclusively via the parent, as other
+ // values are mixed with it. Give up.
+ return false;
+ }
+ }
+
+ if (auto* set = parent->dynCast<LocalSet>()) {
+        // This is one of the sets that write our allocation, and so we must
+        // check for exclusive use of our allocation by all the gets that read
+        // the value. Note the set; we will check the gets at the end, once we
+        // know all of our sets.
+ rewriter.sets.insert(set);
+
+ // We must also look at how the value flows from those gets.
+ if (auto* getsReached = getGetsReached(set)) {
+ for (auto* get : *getsReached) {
+ flows.push({get, parents.getParent(get)});
+ }
+ }
+ }
+
+ // If the parent may send us on a branch, we will need to look at the flow
+ // to the branch target(s).
+ for (auto name : branchesSentByParent(child, parent)) {
+ flows.push({parent, branchTargets.getTarget(name)});
+ }
+
+ // If we got to here, then we can continue to hope that we can optimize
+ // this allocation. Mark the parent as reached by it, and continue.
+ rewriter.reached.insert(parent);
+ }
+
+ // We finished the loop over the flows. Do the final checks.
+ if (!getsAreExclusiveToSets(rewriter.sets)) {
+ return false;
+ }
+
+ // We can do it, hurray!
+ rewriter.applyOptimization();
+
+ return true;
+ }
+
+ // All the expressions we have already looked at.
+ std::unordered_set<Expression*> seen;
+
+ ParentChildInteraction getParentChildInteraction(Expression* parent,
+ Expression* child) {
+ // If there is no parent then we are the body of the function, and that
+ // means we escape by flowing to the caller.
+ if (!parent) {
+ return ParentChildInteraction::Escapes;
+ }
+
+ struct Checker : public Visitor<Checker> {
+ Expression* child;
+
+ // Assume escaping (or some other problem we cannot analyze) unless we are
+ // certain otherwise.
+ bool escapes = true;
+
+ // Assume we do not fully consume the value unless we are certain
+ // otherwise. If this is set to true, then we do not need to check any
+ // further. If it remains false, then we will analyze the value that
+ // falls through later to check for mixing.
+ //
+ // Note that this does not need to be set for expressions if their type
+ // proves that the value does not continue onwards (e.g. if their type is
+ // none, or not a reference type), but for clarity some do still mark this
+ // field as true when it is clearly so.
+ bool fullyConsumes = false;
+
+ // General operations
+ void visitBlock(Block* curr) {
+ escapes = false;
+ // We do not mark fullyConsumes as the value may continue through this
+ // and other control flow structures.
+ }
+ // Note that If is not supported here, because for our value to flow
+ // through it there must be an if-else, and that means there is no single
+ // value falling through anyhow.
+ void visitLoop(Loop* curr) { escapes = false; }
+ void visitDrop(Drop* curr) {
+ escapes = false;
+ fullyConsumes = true;
+ }
+ void visitBreak(Break* curr) { escapes = false; }
+ void visitSwitch(Switch* curr) { escapes = false; }
+
+ // Local operations. Locals by themselves do not escape; the analysis
+ // tracks where locals are used.
+ void visitLocalGet(LocalGet* curr) { escapes = false; }
+ void visitLocalSet(LocalSet* curr) { escapes = false; }
+
+ // Reference operations. TODO add more
+ void visitRefAs(RefAs* curr) {
+ // TODO General OptimizeInstructions integration, that is, since we know
+ // that our allocation is what flows into this RefAs, we can
+ // know the exact outcome of the operation.
+ if (curr->op == RefAsNonNull) {
+ // As it is our allocation that flows through here, we know it is not
+ // null (so there is no trap), and we can continue to (hopefully)
+ // optimize this allocation.
+ escapes = false;
+
+ // Note that while we can look through this operation, we cannot get
+ // rid of it later, as its parent might depend on receiving a
+ // non-nullable type. So we will leave the RefAsNonNull as it is,
+ // even if we do optimize the allocation, and we depend on other
+ // passes to remove the RefAsNonNull.
+ }
+ }
+
+ // GC operations.
+ void visitStructSet(StructSet* curr) {
+ // The reference does not escape (but the value is stored to memory and
+ // therefore might).
+ if (curr->ref == child) {
+ escapes = false;
+ fullyConsumes = true;
+ }
+ }
+ void visitStructGet(StructGet* curr) {
+ escapes = false;
+ fullyConsumes = true;
+ }
+
+ // TODO Array and I31 operations
+ } checker;
+
+ checker.child = child;
+ checker.visit(parent);
+
+ if (checker.escapes) {
+ return ParentChildInteraction::Escapes;
+ }
+
+ // If the parent returns a type that is not a reference, then by definition
+ // it fully consumes the value as it does not flow our allocation onward.
+ if (checker.fullyConsumes || !parent->type.isRef()) {
+ return ParentChildInteraction::FullyConsumes;
+ }
+
+ // Finally, check for mixing. If the child is the immediate fallthrough
+ // of the parent then no other values can be mixed in.
+ //
+ // TODO: Also check if we are sent via a branch (and that branch sends the
+ // single value exiting the parent).
+ // TODO: Also check for safe merges where our allocation is in all places,
+ // like two if or select arms, or branches.
+ if (Properties::getImmediateFallthrough(
+ parent, passOptions, module->features) == child) {
+ return ParentChildInteraction::Flows;
+ }
+
+ return ParentChildInteraction::Mixes;
+ }
+
+ std::unordered_set<LocalGet*>* getGetsReached(LocalSet* set) {
+ auto iter = localGraph.setInfluences.find(set);
+ if (iter != localGraph.setInfluences.end()) {
+ return &iter->second;
+ }
+ return nullptr;
+ }
+
+ BranchUtils::NameSet branchesSentByParent(Expression* child,
+ Expression* parent) {
+ BranchUtils::NameSet names;
+ BranchUtils::operateOnScopeNameUsesAndSentValues(
+ parent, [&](Name name, Expression* value) {
+ if (value == child) {
+ names.insert(name);
+ }
+ });
+ return names;
+ }
+
+  // Verify exclusivity of all the gets for a bunch of sets. That is, assuming
+  // the sets are exclusive (they all write exactly our allocation, and nothing
+  // else), we need to check that all the gets that read that value cannot read
+  // anything else (which could happen if another set writes to the same local
+  // in the relevant live range).
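+  //
+  // For example (in pseudocode, as at the top of this file), in
+  //
+  //  x = our_allocation;
+  //  if (..) {
+  //    x = something_else;
+  //  }
+  //  return x.field0;  // this read may see either value
+  //
+  // the read of x is not exclusive to the set of our allocation, so we must
+  // give up.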
+ bool getsAreExclusiveToSets(const std::unordered_set<LocalSet*>& sets) {
+ // Find all the relevant gets (which may overlap between the sets).
+ std::unordered_set<LocalGet*> gets;
+ for (auto* set : sets) {
+ if (auto* getsReached = getGetsReached(set)) {
+ for (auto* get : *getsReached) {
+ gets.insert(get);
+ }
+ }
+ }
+
+ // Check that the gets can only read from the specific known sets.
+ for (auto* get : gets) {
+ for (auto* set : localGraph.getSetses[get]) {
+ if (sets.count(set) == 0) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+ }
+};
+
+struct Heap2Local : public WalkerPass<PostWalker<Heap2Local>> {
+ bool isFunctionParallel() override { return true; }
+
+ Pass* create() override { return new Heap2Local(); }
+
+ void doWalkFunction(Function* func) {
+ // Multiple rounds of optimization may work in theory, as once we turn one
+ // allocation into locals, references written to its fields become
+ // references written to locals, which we may see do not escape. However,
+ // this does not work yet, since we do not remove the original allocation -
+ // we just "detach" it from other things and then depend on other
+ // optimizations to remove it. That means this pass must be interleaved with
+ // vacuum, in particular, to optimize such nested allocations.
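+    //
+    // For example (a sketch; the type names are illustrative), in
+    //
+    //  (struct.new_with_rtt $Outer
+    //   (struct.new_with_rtt $Inner ..)
+    //   (rtt.canon $Outer)
+    //  )
+    //
+    // the inner allocation flows into the outer one, which this pass treats as
+    // an escape. Once the outer allocation has been lowered to locals and
+    // vacuum has removed the leftover struct.new, the inner allocation is
+    // simply written to a local, and a later run of this pass could optimize
+    // it.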
+ // TODO Consider running multiple iterations here, and running vacuum in
+ // between them.
+ if (Heap2LocalOptimizer(func, getModule(), getPassOptions()).optimized) {
+ TypeUpdating::handleNonDefaultableLocals(func, *getModule());
+ }
+ }
+};
+
+} // anonymous namespace
+
+Pass* createHeap2LocalPass() { return new Heap2Local(); }
+
+} // namespace wasm