diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/passes/Monomorphize.cpp | 288 |
1 files changed, 260 insertions, 28 deletions
diff --git a/src/passes/Monomorphize.cpp b/src/passes/Monomorphize.cpp index 8c08db55b..1fd81b9d1 100644 --- a/src/passes/Monomorphize.cpp +++ b/src/passes/Monomorphize.cpp @@ -85,13 +85,18 @@ // compute the LUB of a bunch of calls to a target and then investigate // that one case and use it in all those callers. // TODO: Not just direct calls? But updating vtables is complex. +// TODO: Should we look at no-inline flags? We do move code between functions, +// but it isn't normal inlining. // #include "ir/cost.h" +#include "ir/effects.h" #include "ir/find_all.h" +#include "ir/iteration.h" #include "ir/manipulation.h" #include "ir/module-utils.h" #include "ir/names.h" +#include "ir/properties.h" #include "ir/return-utils.h" #include "ir/type-updating.h" #include "ir/utils.h" @@ -212,33 +217,204 @@ struct CallContext { // remaining values by updating |newOperands| (for example, if all the values // sent are constants, then |newOperands| will end up empty, as we have // nothing left to send). + // + // The approach implemented here tries to move as much code into the call + // context as possible. That may not always be helpful, say in situations like + // this: + // + // (call $foo + // (i32.add + // (local.get $x) + // (local.get $y) + // ) + // ) + // + // If we move the i32.add into $foo then it will still be adding two unknown + // values (which will be parameters rather than locals). Moving the add might + // just increase code size if so. However, there are many other situations + // where the more code, the better: + // + // (call $foo + // (i32.eqz + // (local.get $x) + // ) + // ) + // + // While the value remains unknown, after moving the i32.eqz into the target + // function we may be able to use the fact that it has at most 1 bit set. + // Even larger benefits can happen in WasmGC: + // + // (call $foo + // (struct.new $T + // (local.get $x) + // (local.get $y) + // ) + // ) + // + // If the struct never escapes then we may be able to remove the allocation + // after monomorphization, even if we know nothing about the values in its + // fields. + // + // TODO: Explore other options that are more careful about how much code is + // moved. void buildFromCall(CallInfo& info, std::vector<Expression*>& newOperands, - Module& wasm) { + Module& wasm, + const PassOptions& options) { Builder builder(wasm); + // First, find the things we can move into the context and the things we + // cannot. Some things simply cannot be moved out of the calling function, + // such as a local.set, but also we need to handle effect interactions among + // the operands, because each time we move code into the context we are + // pushing it into the called function, which changes the order of + // operations, for example: + // + // (call $foo + // (first + // (a) + // ) + // (second + // (b) + // ) + // ) + // + // (func $foo (param $first) (param $second) + // ) + // + // If we move |first| and |a| into the context then we get this: + // + // (call $foo + // ;; |first| and |a| were removed from here. + // (second + // (b) + // ) + // ) + // + // (func $foo (param $second) + // ;; |first| is now a local, and we assign it inside the called func. + // (local $first) + // (local.set $first + // (first + // (a) + // ) + // ) + // ) + // + // After this code motion we execute |second| and |b| *before* the call, and + // |first| and |a| after, so we cannot do this transformation if the order + // of operations between them matters. + // + // The key property here is that all things that are moved into the context + // (moved into the monomorphized function) remain ordered with respect to + // each other, but must be moved past all non-moving things after them. For + // example, say we want to move B and D in this list (of expressions in + // execution order): + // + // A, B, C, D, E + // + // After moving B and D we end up with this: + // + // A, C, E and executing later in the monomorphized function: B, D + // + // Then we must be able to move B past C and E, and D past E. It is simplest + // to compute this in reverse order, starting from E and going back, and + // then each time we want to move something we can check if it can cross + // over all the non-moving effects we've seen so far. To compute this, first + // list out the post-order of the expressions, and then we'll iterate in + // reverse. + struct Lister + : public PostWalker<Lister, UnifiedExpressionVisitor<Lister>> { + std::vector<Expression*> list; + void visitExpression(Expression* curr) { list.push_back(curr); } + } lister; + // As a quick estimate, we need space for at least the operands. + lister.list.reserve(operands.size()); + + for (auto* operand : info.call->operands) { + lister.walk(operand); + } + + // Go in reverse post-order as explained earlier, noting what cannot be + // moved into the context, and while accumulating the effects that are not + // moving. + std::unordered_set<Expression*> immovable; + EffectAnalyzer nonMovingEffects(options, wasm); + for (auto i = int64_t(lister.list.size()) - 1; i >= 0; i--) { + auto* curr = lister.list[i]; + + // This may have been marked as immovable because of the parent. We do + // that because if a parent is immovable then we can't move the children + // into the context (if we did, they would execute after the parent, but + // it needs their values). + bool currImmovable = immovable.count(curr) > 0; + if (!currImmovable) { + // This might be movable or immovable. Check both effect interactions + // (as described before, we want to move this past immovable code) and + // reasons intrinsic to the expression itself that might prevent moving. + ShallowEffectAnalyzer currEffects(options, wasm, curr); + if (currEffects.invalidates(nonMovingEffects) || + !canBeMovedIntoContext(curr, currEffects)) { + immovable.insert(curr); + currImmovable = true; + } + } + + if (currImmovable) { + // Regardless of whether this was marked immovable because of the + // parent, or because we just found it cannot be moved, accumulate the + // effects, and also mark its immediate children (so that we do the same + // when we get to them). + nonMovingEffects.visit(curr); + for (auto* child : ChildIterator(curr)) { + immovable.insert(child); + } + } + } + + // We now know which code can be moved and which cannot, so we can do the + // final processing of the call operands. We do this as a copy operation, + // copying as much as possible into the call context. Code that cannot be + // moved ends up as values sent to the monomorphized function. + // + // The copy operation works in pre-order, which allows us to override + // entire children as needed: + // + // (call $foo + // (problem + // (a) + // ) + // (later) + // ) + // + // We visit |problem| first, and if there is a problem that prevents us + // moving it into the context then we override the copy and then it and + // its child |a| remain in the caller (and |a| is never visited in the + // copy). for (auto* operand : info.call->operands) { - // Process the operand. This is a copy operation, as we are trying to move - // (copy) code from the callsite into the called function. When we find we - // can copy then we do so, and when we cannot that value remains as a - // value sent from the call. operands.push_back(ExpressionManipulator::flexibleCopy( operand, wasm, [&](Expression* child) -> Expression* { - if (canBeMovedIntoContext(child)) { - // This can be moved, great: let the copy happen. + if (!child) { + // This is an optional child that is not present. Let the copy of + // the nullptr happen. + return nullptr; + } + + if (!immovable.count(child)) { + // This can be moved; let the copy happen. return nullptr; } - // This cannot be moved, so we stop here: this is a value that is sent - // into the monomorphized function. It is a new operand in the call, - // and in the context operands it is a local.get, that reads that - // value. + // This cannot be moved. Do not copy it into the call context. In the + // example above, |problem| remains as an operand on the call (so we + // add it to |newOperands|), and in the call context all we have is a + // local.get that reads that sent value. auto paramIndex = newOperands.size(); newOperands.push_back(child); // TODO: If one operand is a tee and another a get, we could actually // reuse the local, effectively showing the monomorphized - // function that the values are the same. (But then the checks - // later down to is<LocalGet> would need to check index too.) + // function that the values are the same. EquivalentSets may + // help here. return builder.makeLocalGet(paramIndex, child->type); })); } @@ -247,12 +423,49 @@ struct CallContext { } // Checks whether an expression can be moved into the context. - bool canBeMovedIntoContext(Expression* curr) { - // Constant numbers, funcs, strings, etc. can all be copied, so it is ok to - // add them to the context. - // TODO: Allow global.get as well, and anything else that is purely - // copyable. - return Properties::isSingleConstantExpression(curr); + bool canBeMovedIntoContext(Expression* curr, + const ShallowEffectAnalyzer& effects) { + // Pretty much everything can be moved into the context if we can copy it + // between functions, such as constants, globals, etc. The things we cannot + // copy are now checked for. + if (effects.branchesOut || effects.hasExternalBreakTargets()) { + // This branches or returns. We can't move control flow between functions. + return false; + } + if (effects.accessesLocal()) { + // Reads/writes to local state cannot be moved around. + return false; + } + if (effects.calls) { + // We can in principle move calls, but for simplicity we avoid such + // situations (which might involve recursion etc.). + return false; + } + if (Properties::isControlFlowStructure(curr)) { + // We can in principle move entire control flow structures with their + // children, but for simplicity stop when we see one rather than look + // inside to see if we could transfer all its contents. (We would also + // need to be careful when handling If arms, etc.) + return false; + } + for (auto* child : ChildIterator(curr)) { + if (child->type.isTuple()) { + // Consider this: + // + // (call $target + // (tuple.extract 2 1 + // (local.get $tuple) + // ) + // ) + // + // We cannot move the tuple.extract into the context, because then the + // call would have a tuple param. While it is possible to split up the + // tuple, or to check if we can also move the children with the parent, + // for simplicity just ignore this rare situation. + return false; + } + } + return true; } // Check if a context is trivial relative to a call, that is, the context @@ -389,7 +602,7 @@ struct Monomorphize : public Pass { // if we use that context. CallContext context; std::vector<Expression*> newOperands; - context.buildFromCall(info, newOperands, wasm); + context.buildFromCall(info, newOperands, wasm, getPassOptions()); // See if we've already evaluated this function + call context. If so, then // we've memoized the result. @@ -447,8 +660,22 @@ struct Monomorphize : public Pass { doOpts(func); doOpts(monoFunc.get()); + // The cost before monomorphization is the old body + the context + // operands. The operands will be *removed* from the calling code if we + // optimize, and moved into the monomorphized function, so the proper + // comparison is the context + the old body, versus the new body (which + // includes the reverse-inlined call context). auto costBefore = CostAnalyzer(func->body).cost; + for (auto* operand : context.operands) { + // Note that a slight oddity is that we have *not* optimized the + // operands before. We optimize func before and after, but the operands + // are in the calling function, which we are not modifying here. In + // theory that might lead to false positives, if the call's operands are + // very unoptimized. + costBefore += CostAnalyzer(operand).cost; + } auto costAfter = CostAnalyzer(monoFunc->body).cost; + // TODO: We should probably only accept improvements above some minimum, // to avoid optimizing cases where we duplicate a huge function but // only optimize a tiny part of it compared to the original. @@ -486,14 +713,19 @@ struct Monomorphize : public Pass { // Copy the function as the base for the new one. auto newFunc = ModuleUtils::copyFunctionWithoutAdd(func, wasm, newName); - // Generate the new signature, and apply it to the new function. + // A local.get is a value that arrives in a parameter. Anything else is + // something that we are reverse-inlining into the function, so we don't + // need a param for it. Note that we might have multiple gets nested here, + // if we are copying part of the original parameter but not all children, so + // we scan each operand for all such local.gets. + // + // Use this information to generate the new signature, and apply it to the + // new function. std::vector<Type> newParams; for (auto* operand : context.operands) { - // A local.get is a value that arrives in a parameter. Anything else is - // something that we are reverse-inlining into the function, so we don't - // need a param for it. - if (operand->is<LocalGet>()) { - newParams.push_back(operand->type); + FindAll<LocalGet> gets(operand); + for (auto* get : gets.list) { + newParams.push_back(get->type); } } // If we were dropped then we are pulling the drop into the monomorphized @@ -501,7 +733,7 @@ struct Monomorphize : public Pass { auto newResults = context.dropped ? Type::none : func->getResults(); newFunc->type = Signature(Type(newParams), newResults); - // We must update local indexes: the new function has a potentially + // We must update local indexes: the new function has a potentially // different number of parameters, and parameters are at the very bottom of // the local index space. We are also replacing old params with vars. To // track this, map each old index to the new one. @@ -559,7 +791,7 @@ struct Monomorphize : public Pass { // (local.get $param) ;; copied old body // ) // - // We need to add such an local.set in the prelude of the function for each + // We need to add such a local.set in the prelude of the function for each // operand in the context. std::vector<Expression*> pre; for (Index i = 0; i < context.operands.size(); i++) { |