From 141f7cad3aa516db3828e619b31fe681e46a151b Mon Sep 17 00:00:00 2001 From: Alon Zakai Date: Wed, 10 Jan 2024 15:02:05 -0800 Subject: Precompute into select arms (#6212) E.g. (i32.add (select (i32.const 100) (i32.const 200) (..condition..) ) (i32.const 50) ) ;; => (select (i32.const 150) (i32.const 250) (..condition..) ) We cannot fully precompute the select, but we can "partially precompute" it, by precomputing its arms using the parent. This may require looking several steps up the parent chain, which is an awkward operation in our simple walkers, so to do it we capture stacks of parents and operate directly on them. This is a little slower than a normal walk, so only do it when we see a promising select, and only in -O2 and above (this makes the pass 7% or so slower; not a large cost, but best to avoid it in -O1). --- src/passes/Precompute.cpp | 330 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 318 insertions(+), 12 deletions(-) (limited to 'src') diff --git a/src/passes/Precompute.cpp b/src/passes/Precompute.cpp index 4178342a2..5baf2331f 100644 --- a/src/passes/Precompute.cpp +++ b/src/passes/Precompute.cpp @@ -27,16 +27,19 @@ // looked at. // -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "ir/effects.h" +#include "ir/iteration.h" +#include "ir/literal-utils.h" +#include "ir/local-graph.h" +#include "ir/manipulation.h" +#include "ir/properties.h" +#include "ir/utils.h" +#include "pass.h" +#include "support/insert_ordered.h" +#include "support/unique_deferring_queue.h" +#include "wasm-builder.h" +#include "wasm-interpreter.h" +#include "wasm.h" namespace wasm { @@ -210,9 +213,16 @@ struct Precompute GetValues getValues; HeapValues heapValues; + bool canPartiallyPrecompute; + void doWalkFunction(Function* func) { + // Perform partial precomputing only when the optimization level is non- + // trivial, as it is slower and less likely to help. + canPartiallyPrecompute = getPassOptions().optimizeLevel >= 2; + // Walk the function and precompute things. super::doWalkFunction(func); + partiallyPrecompute(func); if (!propagate) { return; } @@ -226,11 +236,13 @@ struct Precompute // another walk to apply them and perhaps other optimizations that are // unlocked. super::doWalkFunction(func); + // We could also try to partially precompute again, but that is a somewhat + // heavy operation, so we only do it the first time, and leave such things + // for later runs of this pass and for --converge. } // Note that in principle even more cycles could find further work here, in // very rare cases. To avoid constructing a LocalGraph again just for that - // unlikely chance, we leave such things for later runs of this pass and for - // --converge. + // unlikely chance, we leave such things for later. } template void reuseConstantNode(T* curr, Flow flow) { @@ -281,6 +293,9 @@ struct Precompute } if (flow.breaking()) { if (flow.breakTo == NONCONSTANT_FLOW) { + // This cannot be turned into a constant, but perhaps we can partially + // precompute it. + considerPartiallyPrecomputing(curr); return; } if (flow.breakTo == RETURN_FLOW) { @@ -319,6 +334,273 @@ struct Precompute } } + // If we failed to precompute a constant, perhaps we can still precompute part + // of an expression. Specifically, consider this case: + // + // (A + // (select + // (B) + // (C) + // (condition) + // ) + // ) + // + // Perhaps we can compute A(B) and A(C). If so, we can emit a better select: + // + // (select + // (constant result of A(B)) + // (constant result of A(C)) + // (condition) + // ) + // + // Note that in general for code size we want to move operations *out* of + // selects and ifs (OptimizeInstructions does that), but here we are + // computing two constants which replace three expressions, so it is + // worthwhile. + // + // To do such partial precomputing, in the main pass we note selects that look + // promising. If we find any then we do a second pass later just for that (as + // doing so requires walking up the stack in a manner that we want to avoid in + // the main pass for overhead reasons; see below). + // + // Note that selects are all we really need here: Other passes would turn an + // if into a select if the arms are simple enough, and only in those cases + // (simple arms) do we have a chance at partially precomputing. For example, + // if an arm is a constant then we can, but if it is a call then we can't.) + // However, there are cases like an if with arms with side effects that end in + // precomputable things, that are missed atm TODO + std::unordered_set partiallyPrecomputable; + + void considerPartiallyPrecomputing(Expression* curr) { + if (!canPartiallyPrecompute) { + return; + } + + if (auto* select = curr->dynCast