Diffstat (limited to 'src')
-rw-r--r--  src/passes/CMakeLists.txt |   1
-rw-r--r--  src/passes/Heap2Local.cpp | 701
-rw-r--r--  src/passes/pass.cpp       |   5
-rw-r--r--  src/passes/passes.h       |   1
-rw-r--r--  src/wasm-interpreter.h    |   2
5 files changed, 709 insertions, 1 deletion
diff --git a/src/passes/CMakeLists.txt b/src/passes/CMakeLists.txt index 69fbfa672..2b8790a4f 100644 --- a/src/passes/CMakeLists.txt +++ b/src/passes/CMakeLists.txt @@ -29,6 +29,7 @@ set(passes_SOURCES Flatten.cpp FuncCastEmulation.cpp GenerateDynCalls.cpp + Heap2Local.cpp I64ToI32Lowering.cpp Inlining.cpp InstrumentLocals.cpp diff --git a/src/passes/Heap2Local.cpp b/src/passes/Heap2Local.cpp new file mode 100644 index 000000000..7e8f12990 --- /dev/null +++ b/src/passes/Heap2Local.cpp @@ -0,0 +1,701 @@ +/* + * Copyright 2021 WebAssembly Community Group participants + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// +// Find heap allocations that never escape the current function, and lower the +// allocation's data into locals. That is, avoid allocating a GC object, and +// instead use one local for each of its fields. +// +// To get a sense for what this pass does, here is an example to clarify. First, +// in pseudocode: +// +// ref = new Int(42) +// do { +// ref.set(ref.get() + 1) +// } while (import(ref.get()) +// +// That is, we allocate an int on the heap and use it as a counter. +// Unnecessarily, as it could be a normal int on the stack. +// +// Wat: +// +// (module +// ;; A boxed integer: an entire struct just to hold an int. +// (type $boxed-int (struct (field (mut i32)))) +// +// (import "env" "import" (func $import (param i32) (result i32))) +// +// (func "example" +// (local $ref (ref null $boxed-int)) +// +// ;; Allocate a boxed integer of 42 and save the reference to it. +// (local.set $ref +// (struct.new_with_rtt $boxed-int +// (i32.const 42) +// (rtt.canon $boxed-int) +// ) +// ) +// +// ;; Increment the integer in a loop, looking for some condition. +// (loop $loop +// (struct.set $boxed-int 0 +// (local.get $ref) +// (i32.add +// (struct.get $boxed-int 0 +// (local.get $ref) +// ) +// (i32.const 1) +// ) +// ) +// (br_if $loop +// (call $import +// (struct.get $boxed-int 0 +// (local.get $ref) +// ) +// ) +// ) +// ) +// ) +// ) +// +// Before this pass, the optimizer could do essentially nothing with this. Even +// with this pass, running -O1 has no effect, as the pass is only used in -O2+. +// However, running --heap2local -O1 leads to this: +// +// (func $0 +// (local $0 i32) +// (local.set $0 +// (i32.const 42) +// ) +// (loop $loop +// (br_if $loop +// (call $import +// (local.tee $0 +// (i32.add +// (local.get $0) +// (i32.const 1) +// ) +// ) +// ) +// ) +// ) +// ) +// +// All the GC heap operations have been removed, and we just have a plain int +// now, allowing a bunch of other opts to run. +// +// For us to replace an allocation with locals, we need to prove two things: +// +// * It must not escape from the function. If it escapes, we must pass out a +// reference anyhow. (In theory we could do a whole-program transformation +// to replace the reference with parameters in some cases, but inlining can +// hopefully let us optimize most cases.) +// * It must be used "exclusively", without overlap. 
That is, we cannot +// handle the case where a local.get might return our allocation, but might +// also get some other value. We also cannot handle a select where one arm +// is our allocation and another is something else. If the use is exclusive +// then we have a simple guarantee of being able to replace the heap +// allocation with the locals. +// +// Non-exclusive uses are optimizable too, but they require more work and add +// overhead. For example, here is a non-exclusive use: +// +// var x; +// if (..) { +// x = new Something(); // the allocation we want to optimize +// } else { +// x = something_else; +// } +// // 'x' here is not used exclusively by our allocation +// return x.field0; +// +// To optimize x.field0, we'd need to check if it contains our allocation or +// not, perhaps marking a boolean as true when it is, then doing an if on that +// local, etc.: +// +// var x_is_our_alloc; // whether x is our allocation +// var x; // keep x around for when it is not our allocation +// var x_field0; // the value of field0 on x, when x is our allocation +// if (..) { +// x_field0 = ..default value for the type.. +// x_is_our_alloc = true; +// } else { +// x = something_else; +// x_is_our_alloc = false; +// } +// return x_is_our_alloc ? x_field0 : x.field0; +// +// (node splitting/code duplication is another possible approach). On the other +// hand, if the allocation is used exclusively in all places (the if-else above +// does not have an else any more) then we can do this: +// +// var x_field0; // the value of field0 on x +// if (..) { +// x_field0 = ..default value for the type.. +// } +// return x_field0; +// +// This optimization focuses on such cases. +// + +#include "ir/branch-utils.h" +#include "ir/find_all.h" +#include "ir/local-graph.h" +#include "ir/parents.h" +#include "ir/properties.h" +#include "ir/type-updating.h" +#include "ir/utils.h" +#include "pass.h" +#include "support/unique_deferring_queue.h" +#include "wasm-builder.h" +#include "wasm.h" + +namespace wasm { + +namespace { + +struct Heap2LocalOptimizer { + Function* func; + Module* module; + const PassOptions& passOptions; + + // To find allocations that do not escape, we must track locals so that we + // can see how allocations flow from sets to gets and so forth. + // TODO: only scan reference types + LocalGraph localGraph; + + // To find what escapes, we need to follow where values flow, both up to + // parents, and via branches. + Parents parents; + BranchUtils::BranchTargets branchTargets; + + bool optimized = false; + + Heap2LocalOptimizer(Function* func, + Module* module, + const PassOptions& passOptions) + : func(func), module(module), passOptions(passOptions), localGraph(func), + parents(func->body), branchTargets(func->body) { + // We need to track what each set influences, to see where its value can + // flow to. + localGraph.computeSetInfluences(); + + // All the allocations in the function. + // TODO: Arrays (of constant size) as well. + FindAll<StructNew> allocations(func->body); + + for (auto* allocation : allocations.list) { + // The point of this optimization is to replace heap allocations with + // locals, so we must be able to place the data in locals. 
+ if (!canHandleAsLocals(allocation->type)) { + continue; + } + + if (convertToLocals(allocation)) { + optimized = true; + } + } + } + + bool canHandleAsLocals(Type type) { + if (type == Type::unreachable) { + return false; + } + auto& fields = type.getHeapType().getStruct().fields; + for (auto field : fields) { + if (!TypeUpdating::canHandleAsLocal(field.type)) { + return false; + } + if (field.isPacked()) { + // TODO: support packed fields by adding coercions/truncations. + return false; + } + } + return true; + } + + // Handles the rewriting that we do to perform the optimization. We store the + // data that rewriting will need here, while we analyze, and then if we can do + // the optimization, we tell it to run. + // + // TODO: Doing a single rewrite walk at the end would be more efficient, but + // it would need to be more complex. + struct Rewriter : PostWalker<Rewriter> { + StructNew* allocation; + Function* func; + Builder builder; + const FieldList& fields; + + Rewriter(StructNew* allocation, Function* func, Module* module) + : allocation(allocation), func(func), builder(*module), + fields(allocation->type.getHeapType().getStruct().fields) {} + + // We must track all the local.sets that write the allocation, to verify + // exclusivity. + std::unordered_set<LocalSet*> sets; + + // All the expressions we reached during the flow analysis. That is exactly + // all the places where our allocation is used. We track these so that we + // can fix them up at the end, if the optimization ends up possible. + std::unordered_set<Expression*> reached; + + // Maps indexes in the struct to the local index that will replace them. + std::vector<Index> localIndexes; + + void applyOptimization() { + // Allocate locals to store the allocation's fields in. + for (auto field : fields) { + localIndexes.push_back(builder.addVar(func, field.type)); + } + + // Replace the things we need to using the visit* methods. + walk(func->body); + } + + void visitLocalSet(LocalSet* curr) { + if (!reached.count(curr)) { + return; + } + + // We don't need any sets of the reference to any of the locals it + // originally was written to. + // + // Note that after we remove the sets, other passes can easily remove the + // gets, and so we do not bother to do anything for them. (Also, in + // general it is not trivial to replace the gets - we'd need something of + // the same type, but the type might be a non-nullable reference type in + // the case of a parameter, and in the future maybe of some locals.) + if (curr->isTee()) { + replaceCurrent(curr->value); + } else { + replaceCurrent(builder.makeDrop(curr->value)); + } + } + + void visitStructNew(StructNew* curr) { + if (curr != allocation) { + return; + } + + // We do not remove the allocation itself here, rather we make it + // unnecessary, and then depend on other optimizations to clean up. (We + // cannot simply remove it because we need to replace it with something of + // the same non-nullable type.) + + // First, assign the initial values to the new locals. + std::vector<Expression*> contents; + + if (!allocation->isWithDefault()) { + // We must assign the initial values to temp indexes, then copy them + // over all at once. If instead we did set them as we go, then we might + // hit a problem like this: + // + // (local.set X (new_X)) + // (local.set Y (block (result ..) + // (.. (local.get X) ..) 
;; returns new_X, wrongly + // (new_Y) + // ) + // + // Note how we assign to the local X and use it during the assignment to + // the local Y - but we should still see the old value of X, not new_X. + // Temp locals X', Y' can ensure that: + // + // (local.set X' (new_X)) + // (local.set Y' (block (result ..) + // (.. (local.get X) ..) ;; returns the proper, old X + // (new_Y) + // ) + // .. + // (local.set X (local.get X')) + // (local.set Y (local.get Y')) + std::vector<Index> tempIndexes; + + for (auto field : fields) { + tempIndexes.push_back(builder.addVar(func, field.type)); + } + + // Store the initial values into the temp locals. + for (Index i = 0; i < tempIndexes.size(); i++) { + contents.push_back( + builder.makeLocalSet(tempIndexes[i], allocation->operands[i])); + } + + // Copy them to the normal ones. + for (Index i = 0; i < tempIndexes.size(); i++) { + contents.push_back(builder.makeLocalSet( + localIndexes[i], + builder.makeLocalGet(tempIndexes[i], fields[i].type))); + } + + // Read the values in the allocation (we don't need to, as the + // allocation is not used after our optimization, but we need something + // with the right type anyhow). + for (Index i = 0; i < tempIndexes.size(); i++) { + allocation->operands[i] = + builder.makeLocalGet(localIndexes[i], fields[i].type); + } + + // TODO Check if the nondefault case does not increase code size in some + // cases. A heap allocation that implicitly sets the default values + // is smaller than multiple explicit settings of locals to + // defaults. + } else { + // Set the default values, and replace the allocation with a block that + // first does that, then contains the allocation. + // Note that we must assign the defaults because we might be in a loop, + // that is, there might be a previous value. + for (Index i = 0; i < localIndexes.size(); i++) { + contents.push_back(builder.makeLocalSet( + localIndexes[i], + builder.makeConstantExpression(Literal::makeZero(fields[i].type)))); + } + } + + // Put the allocation itself at the end of the block, so the block has the + // exact same type as the allocation it replaces. + contents.push_back(allocation); + replaceCurrent(builder.makeBlock(contents)); + } + + void visitStructSet(StructSet* curr) { + if (!reached.count(curr)) { + return; + } + + // Drop the ref (leaving it to other opts to remove, when possible), and + // write the data to the local instead of the heap allocation. + replaceCurrent(builder.makeSequence( + builder.makeDrop(curr->ref), + builder.makeLocalSet(localIndexes[curr->index], curr->value))); + } + + void visitStructGet(StructGet* curr) { + if (!reached.count(curr)) { + return; + } + + replaceCurrent( + builder.makeSequence(builder.makeDrop(curr->ref), + builder.makeLocalGet(localIndexes[curr->index], + fields[curr->index].type))); + } + }; + + enum class ParentChildInteraction { + // The parent lets the child escape. E.g. the parent is a call. + Escapes, + // The parent fully consumes the child in a safe, non-escaping way, and + // after consuming it nothing remains to flow further through the parent. + // E.g. the parent is a struct.get, which reads from the allocated heap + // value and does nothing more with the reference. + FullyConsumes, + // The parent flows the child out, that is, the child is the single value + // that can flow out from the parent. E.g. the parent is a block with no + // branches and the child is the final value that is returned. 
+ Flows, + // The parent does not consume the child completely, so the child's value + // can be used through it. However the child does not flow cleanly through. + // E.g. the parent is a block with branches, and the value on them may be + // returned from the block and not only the child. This means the allocation + // is not used in an exclusive way, and we cannot optimize it. + Mixes, + }; + + // Analyze an allocation to see if we can convert it from a heap allocation to + // locals. + bool convertToLocals(StructNew* allocation) { + Rewriter rewriter(allocation, func, module); + + // A queue of flows from children to parents. When something is in the queue + // here then it assumed that it is ok for the allocation to be at the child + // (that is, we have already checked the child before placing it in the + // queue), and we need to check if it is ok to be at the parent, and to flow + // from the child to the parent. We will analyze that (see + // ParentChildInteraction, above) and continue accordingly. + using ChildAndParent = std::pair<Expression*, Expression*>; + UniqueNonrepeatingDeferredQueue<ChildAndParent> flows; + + // Start the flow from the allocation itself to its parent. + flows.push({allocation, parents.getParent(allocation)}); + + // Keep flowing while we can. + while (!flows.empty()) { + auto flow = flows.pop(); + auto* child = flow.first; + auto* parent = flow.second; + + // If we've already seen an expression, stop since we cannot optimize + // things that overlap in any way (see the notes on exclusivity, above). + // Note that we use a nonrepeating queue here, so we already do not visit + // the same thing more than once; what this check does is verify we don't + // look at something that another allocation reached, which would be in a + // different call to this function and use a different queue (any overlap + // between calls would prove non-exclusivity). + if (seen.count(child)) { + return false; + } + seen.insert(child); + + switch (getParentChildInteraction(parent, child)) { + case ParentChildInteraction::Escapes: { + // If the parent may let us escape then we are done. + return false; + } + case ParentChildInteraction::FullyConsumes: { + // If the parent consumes us without letting us escape then all is + // well (and there is nothing flowing from the parent to check). + break; + } + case ParentChildInteraction::Flows: { + // The value flows through the parent; we need to look further at the + // grandparent. + flows.push({parent, parents.getParent(parent)}); + break; + } + case ParentChildInteraction::Mixes: { + // Our allocation is not used exclusively via the parent, as other + // values are mixed with it. Give up. + return false; + } + } + + if (auto* set = parent->dynCast<LocalSet>()) { + // This is one of the sets we are written to, and so we must check for + // exclusive use of our allocation by all the gets that read the value. + // Note the set, and we will check the gets at the end once we know all + // of our sets. + rewriter.sets.insert(set); + + // We must also look at how the value flows from those gets. + if (auto* getsReached = getGetsReached(set)) { + for (auto* get : *getsReached) { + flows.push({get, parents.getParent(get)}); + } + } + } + + // If the parent may send us on a branch, we will need to look at the flow + // to the branch target(s). 
+ for (auto name : branchesSentByParent(child, parent)) { + flows.push({parent, branchTargets.getTarget(name)}); + } + + // If we got to here, then we can continue to hope that we can optimize + // this allocation. Mark the parent as reached by it, and continue. + rewriter.reached.insert(parent); + } + + // We finished the loop over the flows. Do the final checks. + if (!getsAreExclusiveToSets(rewriter.sets)) { + return false; + } + + // We can do it, hurray! + rewriter.applyOptimization(); + + return true; + } + + // All the expressions we have already looked at. + std::unordered_set<Expression*> seen; + + ParentChildInteraction getParentChildInteraction(Expression* parent, + Expression* child) { + // If there is no parent then we are the body of the function, and that + // means we escape by flowing to the caller. + if (!parent) { + return ParentChildInteraction::Escapes; + } + + struct Checker : public Visitor<Checker> { + Expression* child; + + // Assume escaping (or some other problem we cannot analyze) unless we are + // certain otherwise. + bool escapes = true; + + // Assume we do not fully consume the value unless we are certain + // otherwise. If this is set to true, then we do not need to check any + // further. If it remains false, then we will analyze the value that + // falls through later to check for mixing. + // + // Note that this does not need to be set for expressions if their type + // proves that the value does not continue onwards (e.g. if their type is + // none, or not a reference type), but for clarity some do still mark this + // field as true when it is clearly so. + bool fullyConsumes = false; + + // General operations + void visitBlock(Block* curr) { + escapes = false; + // We do not mark fullyConsumes as the value may continue through this + // and other control flow structures. + } + // Note that If is not supported here, because for our value to flow + // through it there must be an if-else, and that means there is no single + // value falling through anyhow. + void visitLoop(Loop* curr) { escapes = false; } + void visitDrop(Drop* curr) { + escapes = false; + fullyConsumes = true; + } + void visitBreak(Break* curr) { escapes = false; } + void visitSwitch(Switch* curr) { escapes = false; } + + // Local operations. Locals by themselves do not escape; the analysis + // tracks where locals are used. + void visitLocalGet(LocalGet* curr) { escapes = false; } + void visitLocalSet(LocalSet* curr) { escapes = false; } + + // Reference operations. TODO add more + void visitRefAs(RefAs* curr) { + // TODO General OptimizeInstructions integration, that is, since we know + // that our allocation is what flows into this RefAs, we can + // know the exact outcome of the operation. + if (curr->op == RefAsNonNull) { + // As it is our allocation that flows through here, we know it is not + // null (so there is no trap), and we can continue to (hopefully) + // optimize this allocation. + escapes = false; + + // Note that while we can look through this operation, we cannot get + // rid of it later, as its parent might depend on receiving a + // non-nullable type. So we will leave the RefAsNonNull as it is, + // even if we do optimize the allocation, and we depend on other + // passes to remove the RefAsNonNull. + } + } + + // GC operations. + void visitStructSet(StructSet* curr) { + // The reference does not escape (but the value is stored to memory and + // therefore might). 
+ if (curr->ref == child) { + escapes = false; + fullyConsumes = true; + } + } + void visitStructGet(StructGet* curr) { + escapes = false; + fullyConsumes = true; + } + + // TODO Array and I31 operations + } checker; + + checker.child = child; + checker.visit(parent); + + if (checker.escapes) { + return ParentChildInteraction::Escapes; + } + + // If the parent returns a type that is not a reference, then by definition + // it fully consumes the value as it does not flow our allocation onward. + if (checker.fullyConsumes || !parent->type.isRef()) { + return ParentChildInteraction::FullyConsumes; + } + + // Finally, check for mixing. If the child is the immediate fallthrough + // of the parent then no other values can be mixed in. + // + // TODO: Also check if we are sent via a branch (and that branch sends the + // single value exiting the parent). + // TODO: Also check for safe merges where our allocation is in all places, + // like two if or select arms, or branches. + if (Properties::getImmediateFallthrough( + parent, passOptions, module->features) == child) { + return ParentChildInteraction::Flows; + } + + return ParentChildInteraction::Mixes; + } + + std::unordered_set<LocalGet*>* getGetsReached(LocalSet* set) { + auto iter = localGraph.setInfluences.find(set); + if (iter != localGraph.setInfluences.end()) { + return &iter->second; + } + return nullptr; + } + + BranchUtils::NameSet branchesSentByParent(Expression* child, + Expression* parent) { + BranchUtils::NameSet names; + BranchUtils::operateOnScopeNameUsesAndSentValues( + parent, [&](Name name, Expression* value) { + if (value == child) { + names.insert(name); + } + }); + return names; + } + + // Verify exclusivity of all the gets for a bunch of sets. That is, assuming + // the sets are exclusive (they all write exactly our allocation, and nothing + // else), we need to check whether all the gets that read that value cannot + // read anything else (which would be the case if another set writes to that + // local, in the right live range). + bool getsAreExclusiveToSets(const std::unordered_set<LocalSet*>& sets) { + // Find all the relevant gets (which may overlap between the sets). + std::unordered_set<LocalGet*> gets; + for (auto* set : sets) { + if (auto* getsReached = getGetsReached(set)) { + for (auto* get : *getsReached) { + gets.insert(get); + } + } + } + + // Check that the gets can only read from the specific known sets. + for (auto* get : gets) { + for (auto* set : localGraph.getSetses[get]) { + if (sets.count(set) == 0) { + return false; + } + } + } + + return true; + } +}; + +struct Heap2Local : public WalkerPass<PostWalker<Heap2Local>> { + bool isFunctionParallel() override { return true; } + + Pass* create() override { return new Heap2Local(); } + + void doWalkFunction(Function* func) { + // Multiple rounds of optimization may work in theory, as once we turn one + // allocation into locals, references written to its fields become + // references written to locals, which we may see do not escape. However, + // this does not work yet, since we do not remove the original allocation - + // we just "detach" it from other things and then depend on other + // optimizations to remove it. That means this pass must be interleaved with + // vacuum, in particular, to optimize such nested allocations. + // TODO Consider running multiple iterations here, and running vacuum in + // between them. 
+ if (Heap2LocalOptimizer(func, getModule(), getPassOptions()).optimized) { + TypeUpdating::handleNonDefaultableLocals(func, *getModule()); + } + } +}; + +} // anonymous namespace + +Pass* createHeap2LocalPass() { return new Heap2Local(); } + +} // namespace wasm diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp index 7790c3972..0d9bd62e5 100644 --- a/src/passes/pass.cpp +++ b/src/passes/pass.cpp @@ -149,6 +149,8 @@ void PassRegistry::registerPasses() { registerPass( "generate-stack-ir", "generate Stack IR", createGenerateStackIRPass); registerPass( + "heap2local", "replace GC allocations with locals", createHeap2LocalPass); + registerPass( "inline-main", "inline __original_main into main", createInlineMainPass); registerPass("inlining", "inline functions (you probably want inlining-optimizing)", @@ -434,6 +436,9 @@ void PassRunner::addDefaultFunctionOptimizationPasses() { addIfNoDWARFIssues("reorder-locals"); // simplify-locals opens opportunities for optimizations addIfNoDWARFIssues("remove-unused-brs"); + if (options.optimizeLevel > 1 && wasm->features.hasGC()) { + addIfNoDWARFIssues("heap2local"); + } // if we are willing to work hard, also optimize copies before coalescing if (options.optimizeLevel >= 3 || options.shrinkLevel >= 2) { addIfNoDWARFIssues("merge-locals"); // very slow on e.g. sqlite diff --git a/src/passes/passes.h b/src/passes/passes.h index 79d344f1a..4cc8040a4 100644 --- a/src/passes/passes.h +++ b/src/passes/passes.h @@ -49,6 +49,7 @@ Pass* createFunctionMetricsPass(); Pass* createGenerateDynCallsPass(); Pass* createGenerateI64DynCallsPass(); Pass* createGenerateStackIRPass(); +Pass* createHeap2LocalPass(); Pass* createI64ToI32LoweringPass(); Pass* createInlineMainPass(); Pass* createInliningPass(); diff --git a/src/wasm-interpreter.h b/src/wasm-interpreter.h index 4cb26d74a..b38272acd 100644 --- a/src/wasm-interpreter.h +++ b/src/wasm-interpreter.h @@ -1443,7 +1443,7 @@ public: // We must have a module in order to perform the cast, to get the type. If // we do not have one, or if the function is not present (which may happen // if we are optimizing a function before the entire module is built), - // then this is not something we cannot precompute. + // then this is something we cannot precompute. auto* func = module ? module->getFunctionOrNull(cast.originalRef.getFunc()) : nullptr; |
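
For anyone trying this commit out, here is a minimal sketch of driving the new pass programmatically from inside the Binaryen tree. It uses the existing PassRunner API and the "heap2local" name registered in pass.cpp above; the helper function and the follow-up "vacuum" step are illustrative, not part of this change (the pass itself only detaches allocations and, as its comments note, relies on later passes to actually remove the unused struct.new).

    #include "pass.h"
    #include "wasm.h"

    // Run Heap2Local on every function in a module, then run vacuum to
    // clean up the allocations that the pass left behind but detached.
    void runHeap2Local(wasm::Module& module) {
      wasm::PassRunner runner(&module);
      runner.add("heap2local"); // name registered in PassRegistry (pass.cpp)
      runner.add("vacuum");     // remove the now-unused allocations
      runner.run();
    }

From the command line the same pass is available as wasm-opt --heap2local (for example together with -O1, as the file's header comment shows), and per the pass.cpp hunk above it is added automatically to the default pipeline at -O2 and higher when the GC feature is enabled.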