summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/ir/CMakeLists.txt1
-rw-r--r--src/ir/hashed.h17
-rw-r--r--src/ir/properties.cpp42
-rw-r--r--src/ir/properties.h45
-rw-r--r--src/passes/LocalCSE.cpp660
-rw-r--r--src/passes/OptimizeInstructions.cpp27
-rw-r--r--src/passes/Precompute.cpp5
-rw-r--r--src/passes/pass.cpp6
-rw-r--r--src/tools/wasm-reduce.cpp3
9 files changed, 577 insertions, 229 deletions
diff --git a/src/ir/CMakeLists.txt b/src/ir/CMakeLists.txt
index f98f7b7aa..c17a48ac0 100644
--- a/src/ir/CMakeLists.txt
+++ b/src/ir/CMakeLists.txt
@@ -3,6 +3,7 @@ set(ir_SOURCES
ExpressionAnalyzer.cpp
ExpressionManipulator.cpp
names.cpp
+ properties.cpp
LocalGraph.cpp
ReFinalize.cpp
stack-utils.cpp
diff --git a/src/ir/hashed.h b/src/ir/hashed.h
index 4a6e1647e..99488dcfb 100644
--- a/src/ir/hashed.h
+++ b/src/ir/hashed.h
@@ -23,23 +23,6 @@
namespace wasm {
-// An expression with a cached hash value
-struct HashedExpression {
- Expression* expr;
- size_t digest;
-
- HashedExpression(Expression* expr) : expr(expr) {
- if (expr) {
- digest = ExpressionAnalyzer::hash(expr);
- } else {
- digest = hash(0);
- }
- }
-
- HashedExpression(const HashedExpression& other)
- : expr(other.expr), digest(other.digest) {}
-};
-
// A pass that hashes all functions
struct FunctionHasher : public WalkerPass<PostWalker<FunctionHasher>> {
diff --git a/src/ir/properties.cpp b/src/ir/properties.cpp
new file mode 100644
index 000000000..889ece57b
--- /dev/null
+++ b/src/ir/properties.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2021 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ir/properties.h"
+#include "wasm-traversal.h"
+
+namespace wasm {
+
+namespace Properties {
+
+bool isIntrinsicallyNondeterministic(Expression* curr, FeatureSet features) {
+ // Practically no wasm instructions are intrinsically nondeterministic.
+ // Exceptions occur only in GC atm.
+ if (!features.hasGC()) {
+ return false;
+ }
+
+ struct Scanner : public PostWalker<Scanner> {
+ bool deterministic = true;
+ void visitStructNew(StructNew* curr) { deterministic = false; }
+ void visitArrayNew(ArrayNew* curr) { deterministic = false; }
+ } scanner;
+ scanner.walk(curr);
+ return !scanner.deterministic;
+}
+
+} // namespace Properties
+
+} // namespace wasm
diff --git a/src/ir/properties.h b/src/ir/properties.h
index d477cb11c..4909dfbc8 100644
--- a/src/ir/properties.h
+++ b/src/ir/properties.h
@@ -335,6 +335,51 @@ inline bool canEmitSelectWithArms(Expression* ifTrue, Expression* ifFalse) {
return ifTrue->type.isSingle() && ifFalse->type.isSingle();
}
+// An intrinsically-nondeterministic expression is one that can return different
+// results for the same inputs, and that difference is *not* explained by
+// other expressions that interact with this one. Hence the cause of
+// nondeterminism can be said to be "intrinsic" - it is internal and inherent in
+// the expression.
+//
+// To see the issue more concretely, consider these:
+//
+// x = load(100);
+// ..
+// y = load(100);
+//
+// versus
+//
+// x = struct.new();
+// ..
+// y = struct.new();
+//
+// Are x and y identical in both cases? For loads, we can look at the code
+// in ".." to see: if there are no possible stores to memory, then the
+// result is identical (and we have EffectAnalyzer for that). For the GC
+// allocations, though, it doesn't matter what is in "..": there is nothing
+// in the wasm that we can check to find out if the results are the same or
+// not. (In fact, in this case they are always not the same.) So the
+// nondeterminism is "intrinsic."
+//
+// Thus, loads are nondeterministic but not intrinsically so, while GC
+// allocations are actual examples of intrinsically nondeterministic
+// instructions. If wasm were to add "get current time" or "get a random number"
+// instructions then those would also be intrinsically nondeterministic.
+//
+// * Note that NaN nondeterminism is ignored here. Technically that allows e.g.
+// an f32.add to be nondeterministic, but it is a valid wasm implementation
+// to have deterministic NaN behavior, and we optimize under that assumption.
+// So NaN nondeterminism does not cause anything to be intrinsically
+// nondeterministic.
+// * Note that calls are ignored here. In theory this concept could be defined
+// either way for them (that is, we could potentially define them as either
+// intrinsically nondeterministic, or not, and each could make sense in its
+// own way). It is simpler to ignore them here, which means we only consider
+// the behavior of the expression provided here (including its chldren), and
+// not external code.
+//
+bool isIntrinsicallyNondeterministic(Expression* curr, FeatureSet features);
+
} // namespace Properties
} // namespace wasm
diff --git a/src/passes/LocalCSE.cpp b/src/passes/LocalCSE.cpp
index 7e6d9cb41..b55da976c 100644
--- a/src/passes/LocalCSE.cpp
+++ b/src/passes/LocalCSE.cpp
@@ -17,32 +17,112 @@
//
// Local CSE
//
-// This requires --flatten to be run before in order to be effective,
-// and preserves flatness. The reason flatness is required is that
-// this pass assumes everything is stored in a local, and all it does
-// is alter local.sets to do local.gets of an existing value when
-// possible, replacing a recomputing of that value. That design means that
-// if there are block and if return values, nested expressions not stored
-// to a local, etc., then it can't operate on them (and will just not
-// do anything for them).
+// This finds common sub-expressions and saves them to a local to avoid
+// recomputing them. It runs in each basic block separately, and uses a simple
+// algorithm, where we track "requests" to reuse a value. That is, if we see
+// an add operation appear twice, and the inputs must be identical in both
+// cases, then the second one requests to reuse the computed value from the
+// first. The first one to appear is the "original" expression that will remain
+// in the code; we will save it's value to a local, and get it from that local
+// later:
//
-// In each linear area of execution,
-// * track each relevant (big enough) expression
-// * if already seen, write to a local if not already, and reuse
-// * invalidate the list as we see effects
+// (i32.add (A) (B))
+// (i32.add (A) (B))
//
-// TODO: global, inter-block gvn etc.
+// =>
+//
+// (local.tee $temp (i32.add (A) (B)))
+// (local.get $temp)
+//
+// The algorithm used here is as follows:
+//
+// * Scan: Hash each expression and see if it repeats later.
+// If it does:
+// * Note that the original appearance is requested to be reused
+// an additional time.
+// * Link the first expression as the original appearance of the
+// later one.
+// * Scan the children of the repeat and undo their requests to be
+// replaced (as if we will reuse the parent, we don't need to do
+// anything for the children, see below).
+//
+// * Check: Check if effects prevent some of the requests from being done. For
+// example, if the value contains a load from memory, we cannot
+// optimize around a store, as the value before the store might be
+// different (see below).
+//
+// * Apply: Go through the basic block again, this time adding a tee on an
+// expression whose value we want to use later, and replacing the
+// uses with gets.
+//
+// For example, here is what the algorithm would do on
+//
+// (i32.eqz (A))
+// ..
+// (i32.eqz (A))
+//
+// Assuming A does not have side effects that interfere, this will happen:
+//
+// 1. Scan A and add it to the hash map of things we have seen.
+// 2. Scan the eqz, and do the same for it.
+// 3. Scan the second A. Increment the first A's requests counter, and mark the
+// second A as intended to be replaced by the original A.
+// 4. Scan the second eqz, and do similar things for it: increment the requests
+// for the original eqz, and point to the original from the repeat.
+// * Then also scan its children, in this case A, and decrement the first
+// A's reuse counter, and unmark the second A's note that it is intended
+// to be replaced.
+// 5. After that, the second eqz requests to be replaced by the first, and
+// there is no request on A.
+// 6. Run through the block again, adding a tee and replacing the second eqz,
+// resulting in:
+//
+// (local.tee $temp
+// (i32.eqz
+// (A)
+// )
+// )
+// (local.get $temp)
+//
+// Note how the scanning of children avoids us adding a local for A: when we
+// reuse the parent, we don't need to also try to reuse the child.
+//
+// Effects must be considered carefully by the Checker phase. E.g.:
+//
+// x = load(a);
+// store(..)
+// y = load(a);
+//
+// Even though the load looks identical, the store means we may load a
+// different value, so we will invalidate the request to optimize here.
+//
+// This pass only finds entire expression trees, and not parts of them, so we
+// will not optimize this:
+//
+// (A (B (C (D1))))
+// (A (B (C (D2))))
+//
+// The innermost child is different, so the trees are not identical. However,
+// running flatten before running this pass would allow those to be optimized as
+// well (we would also need to simplify locals somewhat to allow the locals to
+// be identified as identical, see pass.cpp).
+//
+// TODO: Global, inter-block gvn etc. However, note that atm the cost of our
+// adding new locals here is low because their lifetimes are all within a
+// single basic block. A global optimization here could add long-lived
+// locals with register allocation costs in the entire function.
//
#include <algorithm>
#include <memory>
-#include "ir/flat.h"
#include <ir/cost.h>
#include <ir/effects.h>
-#include <ir/equivalent_sets.h>
-#include <ir/hashed.h>
+#include <ir/iteration.h>
#include <ir/linear-execution.h>
+#include <ir/properties.h>
+#include <ir/type-updating.h>
+#include <ir/utils.h>
#include <pass.h>
#include <wasm-builder.h>
#include <wasm-traversal.h>
@@ -50,224 +130,420 @@
namespace wasm {
-struct LocalCSE : public WalkerPass<LinearExecutionWalker<LocalCSE>> {
- bool isFunctionParallel() override { return true; }
+namespace {
- // CSE adds and reuses locals.
- // FIXME DWARF updating does not handle local changes yet.
- bool invalidatesDWARF() override { return true; }
+// An expression with a cached hash value.
+struct HashedExpression {
+ Expression* expr;
+ size_t digest;
- Pass* create() override { return new LocalCSE(); }
+ HashedExpression(Expression* expr, size_t digest)
+ : expr(expr), digest(digest) {}
- struct Usable {
- HashedExpression hashed;
- Type localType;
- Usable(HashedExpression hashed, Type localType)
- : hashed(hashed), localType(localType) {}
- };
+ HashedExpression(const HashedExpression& other)
+ : expr(other.expr), digest(other.digest) {}
+};
- struct UsableHasher {
- size_t operator()(const Usable value) const {
- auto digest = value.hashed.digest;
- wasm::rehash(digest, value.localType.getID());
- return digest;
- }
- };
+struct HEHasher {
+ size_t operator()(const HashedExpression hashed) const {
+ return hashed.digest;
+ }
+};
- struct UsableComparer {
- bool operator()(const Usable a, const Usable b) const {
- if (a.hashed.digest != b.hashed.digest || a.localType != b.localType) {
- return false;
- }
- return ExpressionAnalyzer::equal(a.hashed.expr, b.hashed.expr);
+// A full equality check for HashedExpressions. The hash is used as a speedup,
+// but if it matches we still verify the contents are identical.
+struct HEComparer {
+ bool operator()(const HashedExpression a, const HashedExpression b) const {
+ if (a.digest != b.digest) {
+ return false;
}
- };
-
- // information for an expression we can reuse
- struct UsableInfo {
- Expression* value; // the value we can reuse
- Index index; // the local we are assigned to, local.get that to reuse us
- EffectAnalyzer effects;
-
- UsableInfo(Expression* value,
- Index index,
- PassOptions& passOptions,
- FeatureSet features)
- : value(value), index(index), effects(passOptions, features, value) {}
- };
-
- // a list of usables in a linear execution trace
- class Usables
- : public std::
- unordered_map<Usable, UsableInfo, UsableHasher, UsableComparer> {};
-
- // locals in current linear execution trace, which we try to sink
- Usables usables;
-
- // We track locals containing the same value - the value is what matters, not
- // the index.
- EquivalentSets equivalences;
-
- bool anotherPass;
+ return ExpressionAnalyzer::equal(a.expr, b.expr);
+ }
+};
- void doWalkFunction(Function* func) {
- Flat::verifyFlatness(func);
- anotherPass = true;
- // we may need multiple rounds
- while (anotherPass) {
- anotherPass = false;
- clear();
- super::doWalkFunction(func);
- }
+// Maps hashed expressions to the list of expressions that match. That is, all
+// expressions that are equivalent (same hash, and also compare equal) will
+// be in a vector for the corresponding entry in this map.
+using HashedExprs = std::unordered_map<HashedExpression,
+ SmallVector<Expression*, 1>,
+ HEHasher,
+ HEComparer>;
+
+// Request information about an expression: whether it requests to be replaced,
+// or it is an original expression whose value can be used to replace others
+// later.
+struct RequestInfo {
+ // The number of other expressions that would like to reuse this value.
+ Index requests = 0;
+
+ // If this is a repeat value, this points to the original. (And we have
+ // incremented the original's |requests|.)
+ Expression* original = nullptr;
+
+ void validate() const {
+ // An expression cannot both be requested and make requests. If we see A
+ // appear three times, both repeats must request from the very first.
+ assert(!(requests && original));
+
+ // When we encounter a requestInfo (after its initial creation) then it
+ // must either request or be requested - otherwise, it should not exist,
+ // as we remove unneeded things from our tracking.
+ assert(requests || original);
}
+};
- static void doNoteNonLinear(LocalCSE* self, Expression** currp) {
- self->clear();
+// A map of expressions to their request info.
+struct RequestInfoMap : public std::unordered_map<Expression*, RequestInfo> {
+ void dump(std::ostream& o) {
+ for (auto& kv : *this) {
+ auto* curr = kv.first;
+ auto& info = kv.second;
+ o << *curr << " has " << info.requests << " reqs, orig: " << info.original
+ << '\n';
+ }
}
+};
- void clear() {
- usables.clear();
- equivalences.clear();
+struct Scanner
+ : public LinearExecutionWalker<Scanner, UnifiedExpressionVisitor<Scanner>> {
+ PassOptions options;
+
+ // Request info for all expressions ever seen.
+ RequestInfoMap& requestInfos;
+
+ Scanner(PassOptions options, RequestInfoMap& requestInfos)
+ : options(options), requestInfos(requestInfos) {}
+
+ // Currently active hashed expressions in the current basic block. If we see
+ // an active expression before us that is identical to us, then it becomes our
+ // original expression that we request from.
+ HashedExprs activeExprs;
+
+ // Hash values of all active expressions. We store these so that we do not end
+ // up recomputing hashes of children in an N^2 manner.
+ std::unordered_map<Expression*, size_t> activeHashes;
+
+ static void doNoteNonLinear(Scanner* self, Expression** currp) {
+ // We are starting a new basic block. Forget all the currently-hashed
+ // expressions, as we no longer want to make connections to anything from
+ // another block.
+ self->activeExprs.clear();
+ self->activeHashes.clear();
+ // Note that we do not clear requestInfos - that is information we will use
+ // later in the Applier class. That is, we've cleared all the active
+ // information, leaving the things we need later.
}
- // Checks invalidations due to a set of effects. Also optionally receive
- // an expression that was just post-visited, and that also needs to be
- // taken into account.
- void checkInvalidations(EffectAnalyzer& effects, Expression* curr = nullptr) {
- // TODO: this is O(bad)
- std::vector<Usable> invalidated;
- for (auto& sinkable : usables) {
- // Check invalidations of the values we may want to use.
- if (effects.invalidates(sinkable.second.effects)) {
- invalidated.push_back(sinkable.first);
+ void visitExpression(Expression* curr) {
+ // Compute the hash, using the pre-computed hashes of the children, which
+ // are saved. This allows us to hash everything in linear time.
+ //
+ // Note that we must compute the hash first, as we need it even for things
+ // that are not isRelevant() (if they are the children of a relevant thing).
+ auto hash = ExpressionAnalyzer::shallowHash(curr);
+ for (auto* child : ChildIterator(curr)) {
+ auto iter = activeHashes.find(child);
+ if (iter == activeHashes.end()) {
+ // The child was in another block, so this expression cannot be
+ // optimized.
+ return;
}
+ hash_combine(hash, iter->second);
+ }
+ activeHashes[curr] = hash;
+
+ // Check if this is something relevant for optimization.
+ if (!isRelevant(curr)) {
+ return;
}
- if (curr) {
- // If we are a set, we have more to check: each of the usable
- // values was from a set, and we did not consider the set in
- // the loop above - just the values. So here we must check that
- // sets do not interfere. (Note that due to flattening we
- // have no risk of tees etc.)
- if (auto* set = curr->dynCast<LocalSet>()) {
- for (auto& sinkable : usables) {
- // Check if the index is the same. Make sure to ignore
- // our own value, which we may have just added!
- if (sinkable.second.index == set->index &&
- sinkable.second.value != set->value) {
- invalidated.push_back(sinkable.first);
- }
+
+ auto& vec = activeExprs[HashedExpression(curr, hash)];
+ vec.push_back(curr);
+ if (vec.size() > 1) {
+ // This is a repeat expression. Add a request for it.
+ auto& info = requestInfos[curr];
+ auto* original = vec[0];
+ info.original = original;
+
+ // Mark the request on the original. Note that this may create the
+ // requestInfo for it, if it is the first request (this avoids us creating
+ // requests eagerly).
+ requestInfos[original].requests++;
+
+ // Remove any requests from the expression's children, as we will replace
+ // the entire thing (see explanation earlier). Note that we just need to
+ // go over our direct children, as grandchildren etc. have already been
+ // processed. That is, if we have
+ //
+ // (A (B (C))
+ // (A (B (C))
+ //
+ // Then when we see the second B we will mark the second C as no longer
+ // requesting replacement. And then when we see the second A, all it needs
+ // to update is the second B.
+ for (auto* child : ChildIterator(curr)) {
+ if (!requestInfos.count(child)) {
+ // The child never had a request. While it repeated (since the parent
+ // repeats), it was not relevant for the optimization so we never
+ // created a requestInfo for it.
+ continue;
+ }
+
+ // Remove the child.
+ auto& childInfo = requestInfos[child];
+ auto* childOriginal = childInfo.original;
+ requestInfos.erase(child);
+
+ // Update the child's original, potentially erasing it too if no
+ // requests remain.
+ assert(childOriginal);
+ auto& childOriginalRequests = requestInfos[childOriginal].requests;
+ assert(childOriginalRequests > 0);
+ childOriginalRequests--;
+ if (childOriginalRequests == 0) {
+ requestInfos.erase(childOriginal);
}
}
}
- for (auto index : invalidated) {
- usables.erase(index);
- }
}
- std::vector<Expression*> expressionStack;
+ // Only some values are relevant to be optimized.
+ bool isRelevant(Expression* curr) {
+ // * Ignore anything that is not a concrete type, as we are looking for
+ // computed values to reuse, and so none and unreachable are irrelevant.
+ // * Ignore local.get and set, as those are the things we optimize to.
+ // * Ignore constants so that we don't undo the effects of constant
+ // propagation.
+ // * Ignore things we cannot put in a local, as then we can't do this
+ // optimization at all.
+ //
+ // More things matter here, like having side effects or not, but computing
+ // them is not cheap, so leave them for later, after we know if there
+ // actually are any requests for reuse of this value (which is rare).
+ if (!curr->type.isConcrete() || curr->is<LocalGet>() ||
+ curr->is<LocalSet>() || Properties::isConstantExpression(curr) ||
+ !TypeUpdating::canHandleAsLocal(curr->type)) {
+ return false;
+ }
- static void visitPre(LocalCSE* self, Expression** currp) {
- // pre operations
- Expression* curr = *currp;
+ // If the size is at least 3, then if we have two of them we have 6,
+ // and so adding one set+one get and removing one of the items itself
+ // is not detrimental, and may be beneficial.
+ // TODO: investigate size 2
+ if (options.shrinkLevel > 0 && Measurer::measure(curr) >= 3) {
+ return true;
+ }
- EffectAnalyzer effects(self->getPassOptions(), self->getModule()->features);
- if (effects.checkPre(curr)) {
- self->checkInvalidations(effects);
+ // If we focus on speed, any reduction in cost is beneficial, as the
+ // cost of a get is essentially free.
+ if (options.shrinkLevel == 0 && CostAnalyzer(curr).cost > 0) {
+ return true;
}
- self->expressionStack.push_back(curr);
+ return false;
}
+};
- static void visitPost(LocalCSE* self, Expression** currp) {
- auto* curr = *currp;
+// Check for invalidations due to effects. We do this after scanning as effect
+// computation is not cheap, and there are usually not many identical fragments
+// of code.
+//
+// This updates the RequestInfos of things it sees are invalidated, which will
+// make Applier ignore them.
+struct Checker
+ : public LinearExecutionWalker<Checker, UnifiedExpressionVisitor<Checker>> {
+ PassOptions options;
+ RequestInfoMap& requestInfos;
+
+ Checker(PassOptions options, RequestInfoMap& requestInfos)
+ : options(options), requestInfos(requestInfos) {}
+
+ struct ActiveOriginalInfo {
+ // How many of the requests remain to be seen during our walk. When this
+ // reaches 0, we know that the original is no longer requested from later in
+ // the block.
+ Index requestsLeft;
+
+ // The effects in the expression.
+ EffectAnalyzer effects;
+ };
- // main operations
- self->handle(curr);
+ // The currently relevant original expressions, that is, the ones that may be
+ // optimized in the current basic block.
+ std::unordered_map<Expression*, ActiveOriginalInfo> activeOriginals;
+
+ void visitExpression(Expression* curr) {
+ // This is the first time we encounter this expression.
+ assert(!activeOriginals.count(curr));
+
+ // Given the current expression, see what it invalidates of the currently-
+ // hashed expressions, if there are any.
+ if (!activeOriginals.empty()) {
+ EffectAnalyzer effects(options, getModule()->features);
+ // We only need to visit this node itself, as we have already visited its
+ // children by the time we get here.
+ effects.visit(curr);
+
+ std::vector<Expression*> invalidated;
+ for (auto& kv : activeOriginals) {
+ auto* original = kv.first;
+ auto& originalInfo = kv.second;
+ if (effects.invalidates(originalInfo.effects)) {
+ invalidated.push_back(original);
+ }
+ }
- // post operations
+ for (auto* original : invalidated) {
+ // Remove all requests after this expression, as we cannot optimize to
+ // them.
+ requestInfos[original].requests -=
+ activeOriginals.at(original).requestsLeft;
+
+ // If no requests remain at all (that is, there were no requests we
+ // could provide before we ran into this invalidation) then we do not
+ // need this original at all.
+ if (requestInfos[original].requests == 0) {
+ requestInfos.erase(original);
+ }
- EffectAnalyzer effects(self->getPassOptions(), self->getModule()->features);
- if (effects.checkPost(curr)) {
- self->checkInvalidations(effects, curr);
+ activeOriginals.erase(original);
+ }
}
- self->expressionStack.pop_back();
- }
+ auto iter = requestInfos.find(curr);
+ if (iter == requestInfos.end()) {
+ return;
+ }
+ auto& info = iter->second;
+ info.validate();
+
+ if (info.requests > 0) {
+ // This is an original. Compute its side effects, as we cannot optimize
+ // away repeated apperances if it has any.
+ EffectAnalyzer effects(options, getModule()->features, curr);
+
+ // We also cannot optimize away something that is intrinsically
+ // nondeterministic: even if it has no side effects, if it may return a
+ // different result each time, then we cannot optimize away repeats.
+ if (effects.hasSideEffects() ||
+ Properties::isIntrinsicallyNondeterministic(curr,
+ getModule()->features)) {
+ requestInfos.erase(curr);
+ } else {
+ activeOriginals.emplace(
+ curr, ActiveOriginalInfo{info.requests, std::move(effects)});
+ }
+ } else if (info.original) {
+ // The original may have already been invalidated. If so, remove our info
+ // as well.
+ auto originalIter = activeOriginals.find(info.original);
+ if (originalIter == activeOriginals.end()) {
+ requestInfos.erase(iter);
+ return;
+ }
- // override scan to add a pre and a post check task to all nodes
- static void scan(LocalCSE* self, Expression** currp) {
- self->pushTask(visitPost, currp);
+ // After visiting this expression, we have one less request for its
+ // original, and perhaps none are left.
+ auto& originalInfo = originalIter->second;
+ if (originalInfo.requestsLeft == 1) {
+ activeOriginals.erase(info.original);
+ } else {
+ originalInfo.requestsLeft--;
+ }
+ }
+ }
- super::scan(self, currp);
+ static void doNoteNonLinear(Checker* self, Expression** currp) {
+ // Between basic blocks there can be no active originals.
+ assert(self->activeOriginals.empty());
+ }
- self->pushTask(visitPre, currp);
+ void visitFunction(Function* curr) {
+ // At the end of the function there can be no active originals.
+ assert(activeOriginals.empty());
}
+};
- void handle(Expression* curr) {
- if (auto* set = curr->dynCast<LocalSet>()) {
- // Calculate equivalences
- auto* func = getFunction();
- equivalences.reset(set->index);
- if (auto* get = set->value->dynCast<LocalGet>()) {
- if (func->getLocalType(set->index) == func->getLocalType(get->index)) {
- equivalences.add(set->index, get->index);
- }
- }
- // consider the value
- auto* value = set->value;
- if (isRelevant(value)) {
- Usable usable(value, func->getLocalType(set->index));
- auto iter = usables.find(usable);
- if (iter != usables.end()) {
- // already exists in the table, this is good to reuse
- auto& info = iter->second;
- Type localType = func->getLocalType(info.index);
- set->value =
- Builder(*getModule()).makeLocalGet(info.index, localType);
- anotherPass = true;
- } else {
- // not in table, add this, maybe we can help others later
- usables.emplace(std::make_pair(
- usable,
- UsableInfo(
- value, set->index, getPassOptions(), getModule()->features)));
- }
- }
- } else if (auto* get = curr->dynCast<LocalGet>()) {
- if (auto* set = equivalences.getEquivalents(get->index)) {
- // Canonicalize to the lowest index. This lets hashing and comparisons
- // "just work".
- get->index = *std::min_element(set->begin(), set->end());
+// Applies the optimization now that we know which requests are valid.
+struct Applier
+ : public LinearExecutionWalker<Applier, UnifiedExpressionVisitor<Applier>> {
+ RequestInfoMap requestInfos;
+
+ Applier(RequestInfoMap& requestInfos) : requestInfos(requestInfos) {}
+
+ // Maps the original expressions that we save to locals to the local indexes
+ // for them.
+ std::unordered_map<Expression*, Index> originalLocalMap;
+
+ void visitExpression(Expression* curr) {
+ auto iter = requestInfos.find(curr);
+ if (iter == requestInfos.end()) {
+ return;
+ }
+
+ const auto& info = iter->second;
+ info.validate();
+
+ if (info.requests) {
+ // We have requests for this value. Add a local and tee the value to
+ // there.
+ Index local = originalLocalMap[curr] =
+ Builder::addVar(getFunction(), curr->type);
+ replaceCurrent(
+ Builder(*getModule()).makeLocalTee(local, curr, curr->type));
+ } else if (info.original) {
+ auto& originalInfo = requestInfos.at(info.original);
+ if (originalInfo.requests) {
+ // This is a valid request of an original value. Get the value from the
+ // local.
+ assert(originalLocalMap.count(info.original));
+ replaceCurrent(
+ Builder(*getModule())
+ .makeLocalGet(originalLocalMap[info.original], curr->type));
+ originalInfo.requests--;
}
}
}
- // A relevant value is a non-trivial one, something we may want to reuse
- // and are able to.
- bool isRelevant(Expression* value) {
- if (value->is<LocalGet>()) {
- return false; // trivial, this is what we optimize to!
- }
- if (!value->type.isConcrete()) {
- return false; // don't bother with unreachable etc.
- }
- if (EffectAnalyzer(getPassOptions(), getModule()->features, value)
- .hasSideEffects()) {
- return false; // we can't combine things with side effects
- }
- auto& options = getPassRunner()->options;
- // If the size is at least 3, then if we have two of them we have 6,
- // and so adding one set+two gets and removing one of the items itself
- // is not detrimental, and may be beneficial.
- if (options.shrinkLevel > 0 && Measurer::measure(value) >= 3) {
- return true;
+ static void doNoteNonLinear(Applier* self, Expression** currp) {
+ // Clear the state between blocks.
+ self->originalLocalMap.clear();
+ }
+};
+
+} // anonymous namespace
+
+struct LocalCSE : public WalkerPass<PostWalker<LocalCSE>> {
+ bool isFunctionParallel() override { return true; }
+
+ // FIXME DWARF updating does not handle local changes yet.
+ bool invalidatesDWARF() override { return true; }
+
+ Pass* create() override { return new LocalCSE(); }
+
+ void doWalkFunction(Function* func) {
+ auto options = getPassOptions();
+
+ RequestInfoMap requestInfos;
+
+ Scanner scanner(options, requestInfos);
+ scanner.walkFunctionInModule(func, getModule());
+ if (requestInfos.empty()) {
+ // We did not find any repeated expressions at all.
+ return;
}
- // If we focus on speed, any reduction in cost is beneficial, as the
- // cost of a get is essentially free.
- if (options.shrinkLevel == 0 && CostAnalyzer(value).cost > 0) {
- return true;
+
+ Checker checker(options, requestInfos);
+ checker.walkFunctionInModule(func, getModule());
+ if (requestInfos.empty()) {
+ // No repeated expressions remain after checking for effects.
+ return;
}
- return false;
+
+ Applier applier(requestInfos);
+ applier.walkFunctionInModule(func, getModule());
+
+ TypeUpdating::handleNonDefaultableLocals(func, *getModule());
}
};
diff --git a/src/passes/OptimizeInstructions.cpp b/src/passes/OptimizeInstructions.cpp
index 18fbf6d83..43d0d912f 100644
--- a/src/passes/OptimizeInstructions.cpp
+++ b/src/passes/OptimizeInstructions.cpp
@@ -1470,30 +1470,21 @@ private:
// First, check for side effects. If there are any, then we can't even
// assume things like local.get's of the same index being identical.
auto passOptions = getPassOptions();
- if (EffectAnalyzer(passOptions, getModule()->features, left)
- .hasSideEffects() ||
- EffectAnalyzer(passOptions, getModule()->features, right)
- .hasSideEffects()) {
+ auto features = getModule()->features;
+ if (EffectAnalyzer(passOptions, features, left).hasSideEffects() ||
+ EffectAnalyzer(passOptions, features, right).hasSideEffects()) {
return false;
}
// Ignore extraneous things and compare them structurally.
- left = Properties::getFallthrough(left, passOptions, getModule()->features);
- right =
- Properties::getFallthrough(right, passOptions, getModule()->features);
+ left = Properties::getFallthrough(left, passOptions, features);
+ right = Properties::getFallthrough(right, passOptions, features);
if (!ExpressionAnalyzer::equal(left, right)) {
return false;
}
- // Reference equality also needs to check for allocation, which is *not* a
- // side effect, but which results in different references:
- // repeatedly calling (struct.new $foo)'s output will return different
- // results (while i32.const etc. of course does not).
- // Note that allocations may have appeared in the original inputs to this
- // function, and skipped when we focused on what falls through; since there
- // are no side effects, any allocations there cannot reach the fallthrough.
- if (left->type.isRef()) {
- if (FindAll<StructNew>(left).has() || FindAll<ArrayNew>(left).has()) {
- return false;
- }
+ // To be equal, they must also be known to return the same result
+ // deterministically.
+ if (Properties::isIntrinsicallyNondeterministic(left, features)) {
+ return false;
}
return true;
}
diff --git a/src/passes/Precompute.cpp b/src/passes/Precompute.cpp
index 99741ebe3..82504bff5 100644
--- a/src/passes/Precompute.cpp
+++ b/src/passes/Precompute.cpp
@@ -307,7 +307,10 @@ private:
// value*. A case where that can happen is GC data (each struct.new
// creates a new, unique struct, even if the data is equal), and so
// PrecomputingExpressionRunner will return a nonconstant flow for all
- // GC heap operations.
+ // GC heap operations. (We could also have used
+ // Properties::isIntrinsicallyNondeterministic here, but that would be
+ // less efficient to re-scan the entire expression.)
+ //
// (Other side effects are fine; if an expression does a call and we
// somehow know the entire expression precomputes to a 42, then we can
// propagate that 42 along to the users, regardless of whatever the call
diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp
index fa706dd10..595b431ca 100644
--- a/src/passes/pass.cpp
+++ b/src/passes/pass.cpp
@@ -423,7 +423,13 @@ void PassRunner::addDefaultFunctionOptimizationPasses() {
// that depend on flat IR
if (options.optimizeLevel >= 4) {
addIfNoDWARFIssues("flatten");
+ // LocalCSE is particularly useful after flatten (see comment in the pass
+ // itself), but we must simplify locals a little first (as flatten adds many
+ // new and redundant ones, which make things seem different if we do not
+ // run some amount of simplify-locals first).
+ addIfNoDWARFIssues("simplify-locals-notee-nostructure");
addIfNoDWARFIssues("local-cse");
+ // TODO: add rereloop etc. here
}
addIfNoDWARFIssues("dce");
addIfNoDWARFIssues("remove-unused-names");
diff --git a/src/tools/wasm-reduce.cpp b/src/tools/wasm-reduce.cpp
index 878233c84..6a442aeba 100644
--- a/src/tools/wasm-reduce.cpp
+++ b/src/tools/wasm-reduce.cpp
@@ -257,13 +257,14 @@ struct Reducer
"-O4",
"--flatten -Os",
"--flatten -O3",
- "--flatten --local-cse -Os",
+ "--flatten --simplify-locals-notee-nostructure --local-cse -Os",
"--coalesce-locals --vacuum",
"--dce",
"--duplicate-function-elimination",
"--inlining",
"--inlining-optimizing",
"--optimize-level=3 --inlining-optimizing",
+ "--local-cse",
"--memory-packing",
"--remove-unused-names --merge-blocks --vacuum",
"--optimize-instructions",