/*
 * Copyright 2016 WebAssembly Community Group participants
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

//
// Computes code at compile time where possible, replacing it with the
// computed constant.
//
// The "propagate" variant of this pass also propagates constants across
// sets and gets, which implements standard constant propagation.
//
// Possible nondeterminism: WebAssembly NaN signs are nondeterministic,
// and this pass may optimize e.g. a float 0 / 0 into +nan while a VM may
// emit -nan, which can be a noticeable difference if the bits are
// looked at.
//

#include "ir/iteration.h"
#include "ir/literal-utils.h"
#include "ir/local-graph.h"
#include "ir/manipulation.h"
#include "ir/properties.h"
#include "ir/utils.h"
#include "pass.h"
#include "support/insert_ordered.h"
#include "support/small_vector.h"
#include "wasm-builder.h"
#include "wasm-interpreter.h"
#include "wasm.h"

namespace wasm {

// A map of gets to their constant values. If a get does not have a constant
// value then it does not appear in the map (that avoids allocating for the
// majority of gets).
using GetValues = std::unordered_map<LocalGet*, Literals>;

// A map of values on the heap. This maps the expressions that create the
// heap data (struct.new, array.new, etc.) to the data they are created with.
// Each such expression gets its own GCData created for it. This allows
// computing identity between locals referring to the same GCData, by seeing
// if they point to the same thing.
//
// Note that a source expression may create different data each time it is
// reached in a loop,
//
// (loop
//  (if ..
//   (local.set $x
//    (struct.new ..
//   )
//  )
//  ..compare $x to something..
// )
//
// Just like in SSA form, this is not a problem because the loop entry must
// have a merge, if a value entering the loop might be noticed. In SSA form
// that means a phi is created, and identity is set there. In our
// representation, the merge will cause a local.get of $x to have more
// possible input values than that struct.new, which means we will not infer
// a value for it, and not attempt to say anything about comparisons of $x.
using HeapValues = std::unordered_map<Expression*, std::shared_ptr<GCData>>;

// Precomputes an expression. Errors if we hit anything that can't be
// precomputed. Inherits most of its functionality from
// ConstantExpressionRunner, which it shares with the C-API, but adds handling
// of GetValues computed during the precompute pass.
class PrecomputingExpressionRunner
  : public ConstantExpressionRunner<PrecomputingExpressionRunner> {

  using Super = ConstantExpressionRunner<PrecomputingExpressionRunner>;

  // Concrete values of gets computed during the pass, which the runner does
  // not know about since it only records values of sets it visits.
  const GetValues& getValues;

  HeapValues& heapValues;

  // Limit evaluation depth for two reasons: first, it is highly unlikely
  // that we can do anything useful to precompute a hugely nested expression
  // (we should succeed at smaller parts of it first). Second, a low limit is
  // helpful to avoid platform differences in native stack sizes.
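  //
  // For example (an illustrative case, not from any particular input), an
  // expression like
  //
  //  (i32.add (i32.add (i32.add (..) (i32.const 1)) (i32.const 1)) ..)
  //
  // nested thousands of levels deep would recurse once per level in this
  // runner, so without a limit a deep enough nesting could overflow the
  // native stack.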
  static const Index MAX_DEPTH = 50;

  // Limit loop iterations since loops might be infinite. Since we are going
  // to replace the expression and must preserve side effects, we limit this
  // to the very first iteration because a side effect would be necessary to
  // achieve more than one iteration before becoming concrete.
  static const Index MAX_LOOP_ITERATIONS = 1;

public:
  PrecomputingExpressionRunner(Module* module,
                               const GetValues& getValues,
                               HeapValues& heapValues,
                               bool replaceExpression)
    : ConstantExpressionRunner<PrecomputingExpressionRunner>(
        module,
        replaceExpression ? FlagValues::PRESERVE_SIDEEFFECTS
                          : FlagValues::DEFAULT,
        MAX_DEPTH,
        MAX_LOOP_ITERATIONS),
      getValues(getValues), heapValues(heapValues) {}

  Flow visitLocalGet(LocalGet* curr) {
    auto iter = getValues.find(curr);
    if (iter != getValues.end()) {
      auto values = iter->second;
      assert(values.isConcrete());
      return Flow(values);
    }
    return ConstantExpressionRunner<
      PrecomputingExpressionRunner>::visitLocalGet(curr);
  }

  // TODO: Use immutability for values
  Flow visitStructNew(StructNew* curr) {
    auto flow = Super::visitStructNew(curr);
    if (flow.breaking()) {
      return flow;
    }
    return getHeapCreationFlow(flow, curr);
  }
  Flow visitStructSet(StructSet* curr) { return Flow(NONCONSTANT_FLOW); }
  Flow visitStructGet(StructGet* curr) {
    if (curr->ref->type == Type::unreachable || curr->ref->type.isNull()) {
      return Flow(NONCONSTANT_FLOW);
    }
    switch (curr->order) {
      case MemoryOrder::Unordered:
        // This can always be precomputed.
        break;
      case MemoryOrder::SeqCst:
        // This can never be precomputed away because it synchronizes with
        // other threads.
        return Flow(NONCONSTANT_FLOW);
      case MemoryOrder::AcqRel:
        // This synchronizes only with writes to the same data, so it can
        // still be precomputed if the data is not shared with other threads.
        if (curr->ref->type.getHeapType().isShared()) {
          return Flow(NONCONSTANT_FLOW);
        }
        break;
    }
    // If this field is immutable then we may be able to precompute this, as
    // if we also created the data in this function (or it was created in an
    // immutable global) then we know the value in the field. If it is
    // immutable, call the super method which will do the rest here. That
    // includes checking for the data being properly created, as if it was
    // not then we will not have a constant value for it, which means the
    // local.get of that value will stop us.
    auto& field =
      curr->ref->type.getHeapType().getStruct().fields[curr->index];
    if (field.mutable_ == Mutable) {
      return Flow(NONCONSTANT_FLOW);
    }
    return Super::visitStructGet(curr);
  }
  Flow visitArrayNew(ArrayNew* curr) {
    auto flow = Super::visitArrayNew(curr);
    if (flow.breaking()) {
      return flow;
    }
    return getHeapCreationFlow(flow, curr);
  }
  Flow visitArrayNewFixed(ArrayNewFixed* curr) {
    auto flow = Super::visitArrayNewFixed(curr);
    if (flow.breaking()) {
      return flow;
    }
    return getHeapCreationFlow(flow, curr);
  }
  Flow visitArraySet(ArraySet* curr) { return Flow(NONCONSTANT_FLOW); }
  Flow visitArrayGet(ArrayGet* curr) {
    if (curr->ref->type != Type::unreachable && !curr->ref->type.isNull()) {
      // See above with struct.get
      auto element = curr->ref->type.getHeapType().getArray().element;
      if (element.mutable_ == Immutable) {
        return Super::visitArrayGet(curr);
      }
    }
    // Otherwise, we've failed to precompute.
    return Flow(NONCONSTANT_FLOW);
  }
  // ArrayLen is not disallowed here as it is an immutable property.
  Flow visitArrayCopy(ArrayCopy* curr) { return Flow(NONCONSTANT_FLOW); }

  // Generates heap info for a heap-allocating expression.
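  //
  // Canonicalizing the created data matters for identity. For example (an
  // illustrative sketch), in
  //
  //  (local.set $x (struct.new $T))
  //  (ref.eq (local.get $x) (local.get $x))
  //
  // each evaluation of that struct.new must yield a literal sharing the same
  // GCData allocation, as otherwise comparisons of $x to itself could not be
  // precomputed.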
  template<typename T> Flow getHeapCreationFlow(Flow flow, T* curr) {
    // We must return a literal that refers to the canonical location for
    // this source expression, so that each time we compute a specific
    // struct.new we get the same identity.
    std::shared_ptr<GCData>& canonical = heapValues[curr];
    std::shared_ptr<GCData> newGCData = flow.getSingleValue().getGCData();
    if (!canonical) {
      canonical = std::make_shared<GCData>(*newGCData);
    } else {
      *canonical = *newGCData;
    }
    return Literal(canonical, curr->type.getHeapType());
  }

  Flow visitStringNew(StringNew* curr) {
    if (curr->op != StringNewWTF16Array) {
      // TODO: handle other string ops. For now we focus on JS-like strings.
      return Flow(NONCONSTANT_FLOW);
    }
    // string.new_wtf16_array is effectively an Array read operation, so
    // just like ArrayGet above we must check for immutability.
    auto refType = curr->ref->type;
    if (refType.isRef()) {
      auto heapType = refType.getHeapType();
      if (heapType.isArray()) {
        if (heapType.getArray().element.mutable_ == Immutable) {
          return Super::visitStringNew(curr);
        }
      }
    }
    // Otherwise, this is mutable or unreachable or otherwise uninteresting.
    return Flow(NONCONSTANT_FLOW);
  }

  Flow visitStringEncode(StringEncode* curr) {
    // string.encode_wtf16_array is effectively an Array write operation, so
    // just like ArraySet and ArrayCopy above we must mark it as disallowed
    // (due to side effects). (And we do not support operations other than
    // string.encode_wtf16_array anyhow.)
    return Flow(NONCONSTANT_FLOW);
  }
};

struct Precompute
  : public WalkerPass<
      PostWalker<Precompute, UnifiedExpressionVisitor<Precompute>>> {
  using Super =
    WalkerPass<PostWalker<Precompute, UnifiedExpressionVisitor<Precompute>>>;

  bool isFunctionParallel() override { return true; }

  std::unique_ptr<Pass> create() override {
    return std::make_unique<Precompute>(propagate);
  }

  bool propagate = false;

  Precompute(bool propagate) : propagate(propagate) {}

  GetValues getValues;

  HeapValues heapValues;

  bool canPartiallyPrecompute;

  void doWalkFunction(Function* func) {
    // Perform partial precomputing only when the optimization level is
    // non-trivial, as it is slower and less likely to help.
    canPartiallyPrecompute = getPassOptions().optimizeLevel >= 2;

    // Walk the function and precompute things.
    Super::doWalkFunction(func);
    partiallyPrecompute(func);
    if (!propagate) {
      return;
    }
    // When propagating, we can utilize the graph of local operations to
    // precompute the values from a local.set to a local.get. This populates
    // getValues which is then used by a subsequent walk that applies those
    // values.
    bool propagated = propagateLocals(func);
    if (propagated) {
      // We found constants to propagate and entered them in getValues. Do
      // another walk to apply them and perhaps other optimizations that are
      // unlocked.
      Super::doWalkFunction(func);
      // We could also try to partially precompute again, but that is a
      // somewhat heavy operation, so we only do it the first time, and
      // leave such things for later runs of this pass and for --converge.
    }
    // Note that in principle even more cycles could find further work here,
    // in very rare cases. To avoid constructing a LocalGraph again just for
    // that unlikely chance, we leave such things for later.
  }
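  // To illustrate the propagation that getValues enables (an illustrative
  // sketch, not from any particular input): given
  //
  //  (local.set $x (i32.const 10))
  //  (i32.add (local.get $x) (local.get $x))
  //
  // propagateLocals() maps both gets to the literal 10, and the second walk
  // then precomputes the add into (i32.const 20).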
  template<typename T> void reuseConstantNode(T* curr, Flow flow) {
    if (flow.values.isConcrete()) {
      // reuse a const / ref.null / ref.func node if there is one
      if (curr->value && flow.values.size() == 1) {
        Literal singleValue = flow.getSingleValue();
        if (singleValue.type.isNumber()) {
          if (auto* c = curr->value->template dynCast<Const>()) {
            c->value = singleValue;
            c->finalize();
            curr->finalize();
            return;
          }
        } else if (singleValue.isNull()) {
          if (auto* n = curr->value->template dynCast<RefNull>()) {
            n->finalize(singleValue.type);
            curr->finalize();
            return;
          }
        } else if (singleValue.type.isRef() &&
                   singleValue.type.getHeapType().isSignature()) {
          if (auto* r = curr->value->template dynCast<RefFunc>()) {
            r->func = singleValue.getFunc();
            auto heapType = getModule()->getFunction(r->func)->type;
            r->finalize(Type(heapType, NonNullable));
            curr->finalize();
            return;
          }
        }
      }
      curr->value = flow.getConstExpression(*getModule());
    } else {
      curr->value = nullptr;
    }
    curr->finalize();
  }

  void visitExpression(Expression* curr) {
    // TODO: if local.get, only replace with a constant if we don't care
    // about size...?
    if (Properties::isConstantExpression(curr) || curr->is<Nop>()) {
      return;
    }
    // try to evaluate this into a const
    Flow flow = precomputeExpression(curr);
    if (!canEmitConstantFor(flow.values)) {
      return;
    }
    if (flow.breaking()) {
      if (flow.breakTo == NONCONSTANT_FLOW) {
        // This cannot be turned into a constant, but perhaps we can
        // partially precompute it.
        considerPartiallyPrecomputing(curr);
        return;
      }
      if (flow.breakTo == RETURN_FLOW) {
        // this expression causes a return. if it's already a return, reuse
        // the node
        if (auto* ret = curr->dynCast<Return>()) {
          reuseConstantNode(ret, flow);
        } else {
          Builder builder(*getModule());
          replaceCurrent(builder.makeReturn(
            flow.values.isConcrete() ? flow.getConstExpression(*getModule())
                                     : nullptr));
        }
        return;
      }
      // this expression causes a break, emit it directly. if it's already a
      // br, reuse the node.
      if (auto* br = curr->dynCast<Break>()) {
        br->name = flow.breakTo;
        br->condition = nullptr;
        reuseConstantNode(br, flow);
      } else {
        Builder builder(*getModule());
        replaceCurrent(builder.makeBreak(
          flow.breakTo,
          flow.values.isConcrete() ? flow.getConstExpression(*getModule())
                                   : nullptr));
      }
      return;
    }
    // this was precomputed
    if (flow.values.isConcrete()) {
      replaceCurrent(flow.getConstExpression(*getModule()));
    } else {
      ExpressionManipulator::nop(curr);
    }
  }

  void visitBlock(Block* curr) {
    // When block precomputation fails, it can lead to quadratic slowness
    // due to the "tower of blocks" pattern used to implement switches:
    //
    //  (block
    //   (block
    //    ...
    //     (block
    //      (br_table ..
    //
    // If we try to precompute each block here, and fail on each, then we
    // end up doing quadratic work. This is also wasted work as once a
    // nested block fails to precompute there is not really a chance to
    // succeed on the parent. If we do *not* fail to precompute, however,
    // then we do want to precompute such nested blocks, e.g.:
    //
    //  (block $out
    //   (block
    //    (br $out)
    //   )
    //  )
    //
    // Here we *can* precompute the inner block, so when we get to the outer
    // one we see this:
    //
    //  (block $out
    //   (br $out)
    //  )
    //
    // And that precomputes to nothing. Therefore when we see a child of the
    // block that is another block (it failed to precompute to something
    // simpler) then we leave early here.
    //
    // Note that in theory we could still precompute here if wasm had
    // instructions that allow such things, e.g.:
    //
    //  (block $out
    //   (block
    //    (cause side effect1)
    //    (cause side effect2)
    //   )
    //   (undo those side effects exactly)
    //  )
    //
    // We are forced to invent a side effect that we can precisely undo
    // (unlike, say, locals - a local.set would persist outside of the
    // block, and even if we did another set to the original value, this
    // pass doesn't track values that way). Only with that can we make the
    // inner block un-precomputable (because there are side effects) but the
    // outer one is (because those effects are undone). Note that it is
    // critical that we have two things in the block, so that we can't
    // precompute it to one of them (which is what we did to the br in the
    // previous example). Note also that this is still optimizable using
    // other passes, as merge-blocks will fold the two blocks together.
    if (!curr->list.empty() && curr->list[0]->is<Block>()) {
      // The first child is a block, that is, it could not be simplified, so
      // this looks like the "tower of blocks" pattern. Avoid quadratic time
      // here as explained above. (We could also look at other children of
      // the block, but the only real-world pattern identified so far is on
      // the first child, so keep things simple here.)
      return;
    }

    // Otherwise, precompute normally like all other expressions.
    visitExpression(curr);
  }

  // If we failed to precompute a constant, perhaps we can still precompute
  // part of an expression. Specifically, consider this case:
  //
  //  (A
  //   (select
  //    (B)
  //    (C)
  //    (condition)
  //   )
  //  )
  //
  // Perhaps we can compute A(B) and A(C). If so, we can emit a better
  // select:
  //
  //  (select
  //   (constant result of A(B))
  //   (constant result of A(C))
  //   (condition)
  //  )
  //
  // Note that in general for code size we want to move operations *out* of
  // selects and ifs (OptimizeInstructions does that), but here we are
  // computing two constants which replace three expressions, so it is
  // worthwhile.
  //
  // To do such partial precomputing, in the main pass we note selects that
  // look promising. If we find any then we do a second pass later just for
  // that (as doing so requires walking up the stack in a manner that we
  // want to avoid in the main pass for overhead reasons; see below).
  //
  // Note that selects are all we really need here: other passes would turn
  // an if into a select if the arms are simple enough, and only in those
  // cases (simple arms) do we have a chance at partially precomputing. For
  // example, if an arm is a constant then we can, but if it is a call then
  // we can't. However, there are cases like an if with arms with side
  // effects that end in precomputable things, that are missed atm TODO
  std::unordered_set<Select*> partiallyPrecomputable;

  void considerPartiallyPrecomputing(Expression* curr) {
    if (!canPartiallyPrecompute) {
      return;
    }
    if (auto* select = curr->dynCast<Select>()) {