author     Alon Zakai <alonzakai@gmail.com>          2019-03-01 10:28:07 -0800
committer  GitHub <noreply@github.com>               2019-03-01 10:28:07 -0800
commit     689fe405a3417fbfd59456035add6f6f53149f35 (patch)
tree       d6f1dcaf0cbb85eb3ae830f68a46c9a6627d1562 /src
parent     f59c3033e678ced61bc8c78e8ac9fbee31ef0210 (diff)
Consistently optimize small added constants into load/store offsets (#1924)
See #1919 - we did not do this consistently before.
This adds a lowMemoryUnused option to PassOptions. It can be passed on the command line with --low-memory-unused. If enabled, we run the new optimize-added-constants pass, which does the real work here, replacing older code in post-emscripten.
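For embedders driving Binaryen programmatically, the same thing can be done through the pass runner - a minimal sketch, assuming an existing wasm::Module named module and a PassOptions named options (both names are illustrative):

PassRunner runner(&module, options);
runner.options.lowMemoryUnused = true;             // assert that the low 1K of memory is unused
runner.add("optimize-added-constants-propagate");  // the new pass, in -propagate mode
runner.run();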
Aside from running at the proper time (unlike the old pass, see #1919), this also has a -propagate mode, which can perform transformations like this one:
y = x + 10
[..]
load(y)
[..]
load(y)
=>
y = x + 10
[..]
load(x, offset=10)
[..]
load(x, offset=10)
That is, it can propagate such offsets to the loads/stores. This pattern is common in big interpreter loops, where the pointers are offsets into a big struct of state.
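For intuition, this is roughly the kind of C++ source that produces the pattern (a hypothetical sketch; the struct and field names are illustrative):

struct State { int pc; int sp; int acc; int regs[64]; };

int step(State* s) {
  // Each field access is "base pointer + constant", i.e. y = x + K in the
  // example above; after the pass each read becomes load(x, offset=K).
  return s->acc + s->regs[3];
}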
The pass does this propagation by using a new feature of LocalGraph, which can verify which locals are in SSA form. Binaryen IR is intentionally not SSA (it is a later IR), but if a local has only a single set for all of its gets, that local is effectively in SSA form and can be optimized. The tricky part is that all locals are implicitly initialized to zero, so there are at minimum two sets. But if we verify that the real set dominates all the gets, then the zero initialization cannot reach them, and we are safe.
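Concretely, a pass can query this with the new LocalGraph API - a minimal sketch, where func and load stand for an existing Function* and Load*:

LocalGraph graph(func);     // link every get_local to the sets that reach it
graph.computeSSAIndexes();  // record which local indexes have a single real set
if (auto* get = load->ptr->dynCast<GetLocal>()) {
  if (graph.isSSA(get->index)) {
    // The one real set dominates this get, so the zero-init cannot reach it,
    // and an offset computed at that set can safely be folded into the load.
  }
}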
This PR also makes safe-heap aware of lowMemoryUnused. If it is set, we check not just for an access of address 0, but for the whole range 0-1023.
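The check safe-heap emits is logically equivalent to the following - a sketch of the condition only, not the exact wasm it generates (memorySize stands for the current memory size in bytes):

bool accessIsInvalid(uint32_t addr, uint32_t bytes, uint64_t memorySize, bool lowMemoryUnused) {
  // With lowMemoryUnused, the whole first 1K (PassOptions::LowMemoryBound) is
  // off-limits; otherwise only a null (address 0) access is flagged.
  bool badLow = lowMemoryUnused ? (addr < 1024) : (addr == 0);
  bool badHigh = uint64_t(addr) + bytes > memorySize;  // out-of-bounds access
  return badLow || badHigh;
}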
This makes zlib 5% faster, with either the wasm backend or asm2wasm, and 0.5% smaller. It also helps sqlite (1.5% faster) and lua (1% faster).
Diffstat (limited to 'src')
-rw-r--r--  src/asm2wasm.h                        |   3
-rw-r--r--  src/ir/LocalGraph.cpp                 |  33
-rw-r--r--  src/ir/local-graph.h                  |  33
-rw-r--r--  src/pass.h                            |  28
-rw-r--r--  src/passes/CMakeLists.txt             |   1
-rw-r--r--  src/passes/CodePushing.cpp            |   6
-rw-r--r--  src/passes/OptimizeAddedConstants.cpp | 239
-rw-r--r--  src/passes/PickLoadSigns.cpp          |  38
-rw-r--r--  src/passes/PostEmscripten.cpp         |  60
-rw-r--r--  src/passes/Precompute.cpp             |   7
-rw-r--r--  src/passes/SafeHeap.cpp               |   9
-rw-r--r--  src/passes/pass.cpp                   |   9
-rw-r--r--  src/passes/passes.h                   |   2
-rw-r--r--  src/tools/optimization-options.h      |   5
-rw-r--r--  src/wasm.h                            |   6
-rw-r--r--  src/wasm/wasm-s-parser.cpp            |   4
16 files changed, 382 insertions, 101 deletions
diff --git a/src/asm2wasm.h b/src/asm2wasm.h
index c193791f8..9a68542a5 100644
--- a/src/asm2wasm.h
+++ b/src/asm2wasm.h
@@ -957,6 +957,7 @@ void Asm2WasmBuilder::processAsm(Ref ast) {
   }
   optimizingBuilder = make_unique<OptimizingIncrementalModuleBuilder>(&wasm, numFunctions, passOptions, [&](PassRunner& passRunner) {
     // addPrePasses
+    passRunner.options.lowMemoryUnused = true;
     if (debug) {
       passRunner.setDebug(true);
       passRunner.setValidateGlobally(false);
@@ -1189,6 +1190,7 @@ void Asm2WasmBuilder::processAsm(Ref ast) {
   // functions). Optimize those now. Typically there are very few, just do it
   // sequentially.
   PassRunner passRunner(&wasm, passOptions);
+  passRunner.options.lowMemoryUnused = true;
   passRunner.addDefaultFunctionOptimizationPasses();
   for (auto& pair : trappingFunctions.getFunctions()) {
     auto* func = pair.second;
@@ -1447,6 +1449,7 @@ void Asm2WasmBuilder::processAsm(Ref ast) {
   };
 
   PassRunner passRunner(&wasm, passOptions);
+  passRunner.options.lowMemoryUnused = true;
   passRunner.setFeatures(passOptions.features);
   if (debug) {
     passRunner.setDebug(true);
diff --git a/src/ir/LocalGraph.cpp b/src/ir/LocalGraph.cpp
index 6a99ed44e..1cf883c19 100644
--- a/src/ir/LocalGraph.cpp
+++ b/src/ir/LocalGraph.cpp
@@ -242,5 +242,38 @@ void LocalGraph::computeInfluences() {
   }
 }
 
+void LocalGraph::computeSSAIndexes() {
+  std::unordered_map<Index, std::set<SetLocal*>> indexSets;
+  for (auto& pair : getSetses) {
+    auto* get = pair.first;
+    auto& sets = pair.second;
+    for (auto* set : sets) {
+      indexSets[get->index].insert(set);
+    }
+  }
+  for (auto& pair : locations) {
+    auto* curr = pair.first;
+    if (auto* set = curr->dynCast<SetLocal>()) {
+      auto& sets = indexSets[set->index];
+      if (sets.size() == 1 && *sets.begin() != curr) {
+        // While it has just one set, it is not the right one (us),
+        // so mark it invalid.
+        sets.clear();
+      }
+    }
+  }
+  for (auto& pair : indexSets) {
+    auto index = pair.first;
+    auto& sets = pair.second;
+    if (sets.size() == 1) {
+      SSAIndexes.insert(index);
+    }
+  }
+}
+
+bool LocalGraph::isSSA(Index x) {
+  return SSAIndexes.count(x);
+}
+
 } // namespace wasm
diff --git a/src/ir/local-graph.h b/src/ir/local-graph.h
index 725be0536..fd6a496c0 100644
--- a/src/ir/local-graph.h
+++ b/src/ir/local-graph.h
@@ -45,13 +45,40 @@ struct LocalGraph {
   // value (0 for a var, the received value for a param)
   Locations locations; // where each get and set is (for easy replacing)
 
-  // optional computation: compute the influence graphs between sets and gets
-  // (useful for algorithms that propagate changes)
+  // Optional: compute the influence graphs between sets and gets
+  // (useful for algorithms that propagate changes).
+
+  void computeInfluences();
+
   std::unordered_map<GetLocal*, std::unordered_set<SetLocal*>> getInfluences; // for each get, the sets whose values are influenced by that get
   std::unordered_map<SetLocal*, std::unordered_set<GetLocal*>> setInfluences; // for each set, the gets whose values are influenced by that set
 
-  void computeInfluences();
+  // Optional: Compute the local indexes that are SSA, in the sense of
+  //  * a single set for all the gets for that local index
+  //  * the set dominates all the gets (logically implied by the former property)
+  //  * no other set (aside from the zero-init)
+  // The third property is not exactly standard SSA, but is useful since we are not in
+  // SSA form in our IR. To see why it matters, consider these:
+  //
+  // x = 0 // zero init
+  // [..]
+  // x = 10
+  // y = x + 20
+  // x = 30 // !!!
+  // f(y)
+  //
+  // The !!! line violates that property - it is another set for x, and it may interfere
+  // say with replacing f(y) with f(x + 20). Instead, if we know the only other possible set for x
+  // is the zero init, then things like the !!! line cannot exist, and it is valid to replace
+  // f(y) with f(x + 20).
+  // (This could be simpler, but in wasm the zero init always exists.)
+
+  void computeSSAIndexes();
+
+  bool isSSA(Index x);
+
+private:
+  std::set<Index> SSAIndexes;
 };
 
 } // namespace wasm
diff --git a/src/pass.h b/src/pass.h
index 78d1f594e..9ef159d7d 100644
--- a/src/pass.h
+++ b/src/pass.h
@@ -56,14 +56,26 @@ private:
 };
 
 struct PassOptions {
-  bool debug = false; // run passes in debug mode, doing extra validation and timing checks
-  bool validate = true; // whether to run the validator to check for errors
-  bool validateGlobally = false; // when validating validate globally and not just locally
-  int optimizeLevel = 0; // 0, 1, 2 correspond to -O0, -O1, -O2, etc.
-  int shrinkLevel = 0; // 0, 1, 2 correspond to -O0, -Os, -Oz
-  bool ignoreImplicitTraps = false; // optimize assuming things like div by 0, bad load/store, will not trap
-  bool debugInfo = false; // whether to try to preserve debug info through, which are special calls
-  FeatureSet features = FeatureSet::All; // Which wasm features to accept, and be allowed to use
+  // Run passes in debug mode, doing extra validation and timing checks.
+  bool debug = false;
+  // Whether to run the validator to check for errors.
+  bool validate = true;
+  // When validating validate globally and not just locally
+  bool validateGlobally = false;
+  // 0, 1, 2 correspond to -O0, -O1, -O2, etc.
+  int optimizeLevel = 0;
+  // 0, 1, 2 correspond to -O0, -Os, -Oz
+  int shrinkLevel = 0;
+  // Optimize assuming things like div by 0, bad load/store, will not trap.
+  bool ignoreImplicitTraps = false;
+  // Optimize assuming that the low 1K of memory is not valid memory for the application
+  // to use. In that case, we can optimize load/store offsets in many cases.
+  bool lowMemoryUnused = false;
+  enum { LowMemoryBound = 1024 };
+  // Whether to try to preserve debug info through, which are special calls.
+  bool debugInfo = false;
+  // Which wasm features to accept, and be allowed to use.
+  FeatureSet features = FeatureSet::All;
 
   void setDefaultOptimizationOptions() {
     // -Os is our default
diff --git a/src/passes/CMakeLists.txt b/src/passes/CMakeLists.txt
index 0ad9dbf05..2848946f8 100644
--- a/src/passes/CMakeLists.txt
+++ b/src/passes/CMakeLists.txt
@@ -31,6 +31,7 @@ SET(passes_SOURCES
   MinifyImportsAndExports.cpp
   NameList.cpp
   NoExitRuntime.cpp
+  OptimizeAddedConstants.cpp
   OptimizeInstructions.cpp
   PickLoadSigns.cpp
   PostEmscripten.cpp
diff --git a/src/passes/CodePushing.cpp b/src/passes/CodePushing.cpp
index 931df140d..52aab08ad 100644
--- a/src/passes/CodePushing.cpp
+++ b/src/passes/CodePushing.cpp
@@ -77,8 +77,8 @@ struct LocalAnalyzer : public PostWalker<LocalAnalyzer> {
   }
 };
 
-// Implement core optimization logic in a struct, used and then discarded entirely
-// for each block
+// Implements core optimization logic. Used and then discarded entirely
+// for each block.
 class Pusher {
   ExpressionList& list;
   LocalAnalyzer& analyzer;
@@ -92,7 +92,7 @@ public:
     // continuing forward.
     Index relevant = list.size() - 1; // we never need to push past a final element, as
                                       // we couldn't be used after it.
-    Index nothing = -1;
+    const Index nothing = -1;
     Index i = 0;
     Index firstPushable = nothing;
     while (i < relevant) {
diff --git a/src/passes/OptimizeAddedConstants.cpp b/src/passes/OptimizeAddedConstants.cpp
new file mode 100644
index 000000000..119d61637
--- /dev/null
+++ b/src/passes/OptimizeAddedConstants.cpp
@@ -0,0 +1,239 @@
+/*
+ * Copyright 2019 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//
+// Optimize added constants into load/store offsets. This requires the assumption
+// that low memory is unused, so that we can replace an add (which might wrap)
+// with a load/store offset (which does not).
+//
+// The propagate option also propagates offsets across set/get local pairs.
+//
+// Optimizing constants into load/store offsets is almost always
+// beneficial for speed, as VMs can optimize these operations better.
+// If a LocalGraph is provided, this can also propagate values along get/set
+// pairs. In such a case, we may increase code size slightly or reduce
+// compressibility (e.g., replace (load (get $x)) with (load offset=Z (get $y)),
+// where Z is big enough to not fit in a single byte), but this is good for
+// speed, and may lead to code size reductions elsewhere by using fewer locals.
+//
+
+#include <wasm.h>
+#include <pass.h>
+#include <wasm-builder.h>
+#include <ir/local-graph.h>
+
+namespace wasm {
+
+template<typename T>
+class MemoryAccessOptimizer {
+public:
+  MemoryAccessOptimizer(T* curr, Module* module, LocalGraph* localGraph) : curr(curr), module(module), localGraph(localGraph) {
+    // The pointer itself may be a constant, if e.g. it was precomputed or
+    // a get that we propagated.
+    if (curr->ptr->template is<Const>()) {
+      optimizeConstantPointer();
+      return;
+    }
+    if (auto* add = curr->ptr->template dynCast<Binary>()) {
+      if (add->op == AddInt32) {
+        // Look for a constant on both sides.
+        if (tryToOptimizeConstant(add->right, add->left) ||
+            tryToOptimizeConstant(add->left, add->right)) {
+          return;
+        }
+      }
+    }
+    if (localGraph) {
+      // A final important case is a propagated add:
+      //
+      //  x = y + 10
+      //  ..
+      //  load(x)
+      // =>
+      //  x = y + 10
+      //  ..
+      //  load(y, offset=10)
+      //
+      // This is only valid if y does not change in the middle!
+      if (auto* get = curr->ptr->template dynCast<GetLocal>()) {
+        auto& sets = localGraph->getSetses[get];
+        if (sets.size() == 1) {
+          auto* set = *sets.begin();
+          // May be a zero-init (in which case, we can ignore it).
+          if (set) {
+            auto* value = set->value;
+            if (auto* add = value->template dynCast<Binary>()) {
+              if (add->op == AddInt32) {
+                // We can optimize on either side, but only if both we find
+                // a constant *and* the other side cannot change in the middle.
+                // TODO If it could change, we may add a new local to capture the
+                //      old value.
+                if (tryToOptimizePropagatedAdd(add->right, add->left, get) ||
+                    tryToOptimizePropagatedAdd(add->left, add->right, get)) {
+                  return;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+private:
+  T* curr;
+  Module* module;
+  LocalGraph* localGraph;
+
+  void optimizeConstantPointer() {
+    // The constant and an offset are interchangeable:
+    //   (load (const X))  <=>  (load offset=X (const 0))
+    // It may not matter if we do this or not - it's the same size,
+    // and in both cases the compiler can see it's a constant location.
+    // For code clarity and compressibility, we prefer to put the
+    // entire address in the constant.
+    if (curr->offset) {
+      // Note that the offset may already be larger than low memory - the
+      // code may know that is valid, even if we can't. Only handle the
+      // obviously valid case where an overflow can't occur.
+      auto* c = curr->ptr->template cast<Const>();
+      uint32_t base = c->value.geti32();
+      uint32_t offset = curr->offset;
+      if (uint64_t(base) + uint64_t(offset) < (uint64_t(1) << 32)) {
+        c->value = c->value.add(Literal(uint32_t(curr->offset)));
+        curr->offset = 0;
+      }
+    }
+  }
+
+  struct Result {
+    bool succeeded;
+    Address total;
+    Result() : succeeded(false) {}
+    Result(Address total) : succeeded(true), total(total) {}
+  };
+
+  // See if we can optimize an offset from an expression. If we report
+  // success, the returned offset can be added as a replacement for the
+  // expression here.
+  bool tryToOptimizeConstant(Expression* oneSide, Expression* otherSide) {
+    if (auto* c = oneSide->template dynCast<Const>()) {
+      auto result = canOptimizeConstant(c->value);
+      if (result.succeeded) {
+        curr->offset = result.total;
+        curr->ptr = otherSide;
+        if (curr->ptr->template is<Const>()) {
+          optimizeConstantPointer();
+        }
+        return true;
+      }
+    }
+    return false;
+  }
+
+  bool tryToOptimizePropagatedAdd(Expression* oneSide, Expression* otherSide, GetLocal* ptr) {
+    if (auto* c = oneSide->template dynCast<Const>()) {
+      auto result = canOptimizeConstant(c->value);
+      if (result.succeeded) {
+        // Looks good, but we need to make sure the other side cannot change:
+        //
+        //  x = y + 10
+        //  y = y + 1
+        //  load(x)
+        //
+        // This example should not be optimized into
+        //
+        //  load(x, offset=10)
+        //
+        // If the other side is a get, we may be able to prove that we can just use that same
+        // local, if both it and the pointer are in SSA form. In that case,
+        //
+        //  y = ..      // single assignment that dominates all uses
+        //  x = y + 10  // single assignment that dominates all uses
+        //  [..]
+        //  load(x) => load(y, offset=10)
+        //
+        // This is valid since dominance is transitive, so y's definition dominates the load,
+        // and it is ok to replace x with y + 10 there.
+        // TODO otherwise, create a new local
+        if (auto* get = otherSide->template dynCast<GetLocal>()) {
+          if (localGraph->isSSA(get->index) && localGraph->isSSA(ptr->index)) {
+            curr->offset = result.total;
+            curr->ptr = Builder(*module).makeGetLocal(get->index, get->type);
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  }
+
+  // Sees if we can optimize a particular constant.
+  Result canOptimizeConstant(Literal literal) {
+    auto value = literal.geti32();
+    // Avoid uninteresting corner cases with peculiar offsets.
+    if (value >= 0 && value < PassOptions::LowMemoryBound) {
+      // The total offset must not allow reaching reasonable memory
+      // by overflowing.
+      auto total = curr->offset + value;
+      if (total < PassOptions::LowMemoryBound) {
+        return Result(total);
+      }
+    }
+    return Result();
+  }
+};
+
+struct OptimizeAddedConstants : public WalkerPass<PostWalker<OptimizeAddedConstants, UnifiedExpressionVisitor<OptimizeAddedConstants>>> {
+  bool isFunctionParallel() override { return true; }
+
+  bool propagate;
+
+  OptimizeAddedConstants(bool propagate) : propagate(propagate) {}
+
+  Pass* create() override { return new OptimizeAddedConstants(propagate); }
+
+  std::unique_ptr<LocalGraph> localGraph;
+
+  void visitLoad(Load* curr) {
+    MemoryAccessOptimizer<Load>(curr, getModule(), localGraph.get());
+  }
+
+  void visitStore(Store* curr) {
+    MemoryAccessOptimizer<Store>(curr, getModule(), localGraph.get());
+  }
+
+  void doWalkFunction(Function* func) {
+    // This pass is only valid under the assumption of unused low memory.
+    assert(getPassOptions().lowMemoryUnused);
+    if (propagate) {
+      localGraph = make_unique<LocalGraph>(func);
+      localGraph->computeSSAIndexes();
+    }
+    super::doWalkFunction(func);
+  }
+};
+
+Pass *createOptimizeAddedConstantsPass() {
+  return new OptimizeAddedConstants(false);
+}
+
+Pass *createOptimizeAddedConstantsPropagatePass() {
+  return new OptimizeAddedConstants(true);
+}
+
+} // namespace wasm
diff --git a/src/passes/PickLoadSigns.cpp b/src/passes/PickLoadSigns.cpp
index 827346839..fce50b4bb 100644
--- a/src/passes/PickLoadSigns.cpp
+++ b/src/passes/PickLoadSigns.cpp
@@ -46,22 +46,8 @@ struct PickLoadSigns : public WalkerPass<ExpressionStackWalker<PickLoadSigns>> {
     usages.resize(func->getNumLocals());
     // walk
     ExpressionStackWalker<PickLoadSigns>::doWalkFunction(func);
-    // optimize based on the info we saw
-    for (auto& pair : loads) {
-      auto* load = pair.first;
-      auto index = pair.second;
-      auto& usage = usages[index];
-      // if we can't optimize, give up
-      if (usage.totalUsages == 0 || // no usages, so no idea
-          usage.signedUsages + usage.unsignedUsages != usage.totalUsages || // non-sign/unsigned usages, so cannot change
-          (usage.signedUsages != 0 && usage.signedBits != load->bytes * 8) || // sign usages exist but the wrong size
-          (usage.unsignedUsages != 0 && usage.unsignedBits != load->bytes * 8)) { // unsigned usages exist but the wrong size
-        continue;
-      }
-      // we can pick the optimal one. our hope is to remove 2 items per
-      // signed use (two shifts), so we factor that in
-      load->signed_ = usage.signedUsages * 2 >= usage.unsignedUsages;
-    }
+    // optimize
+    optimize();
   }
 
   void visitGetLocal(GetLocal* curr) {
@@ -102,6 +88,26 @@ struct PickLoadSigns : public WalkerPass<ExpressionStackWalker<PickLoadSigns>> {
       loads[load] = curr->index;
     }
   }
+
+  void optimize() {
+    // optimize based on the info we saw
+    for (auto& pair : loads) {
+      auto* load = pair.first;
+      auto index = pair.second;
+      auto& usage = usages[index];
+      // if we can't optimize, give up
+      if (usage.totalUsages == 0 || // no usages, so no idea
+          usage.signedUsages + usage.unsignedUsages != usage.totalUsages || // non-sign/unsigned usages, so cannot change
+          (usage.signedUsages != 0 && usage.signedBits != load->bytes * 8) || // sign usages exist but the wrong size
+          (usage.unsignedUsages != 0 && usage.unsignedBits != load->bytes * 8)) { // unsigned usages exist but the wrong size
+        continue;
+      }
+      // we can pick the optimal one. our hope is to remove 2 items per
+      // signed use (two shifts), so we factor that in
+      load->signed_ = usage.signedUsages * 2 >= usage.unsignedUsages;
+    }
+  }
+
 };
 
 Pass *createPickLoadSignsPass() {
diff --git a/src/passes/PostEmscripten.cpp b/src/passes/PostEmscripten.cpp
index 72c0d8808..7e2bacf25 100644
--- a/src/passes/PostEmscripten.cpp
+++ b/src/passes/PostEmscripten.cpp
@@ -32,66 +32,6 @@ struct PostEmscripten : public WalkerPass<PostWalker<PostEmscripten>> {
 
   Pass* create() override { return new PostEmscripten; }
 
-  // When we have a Load from a local value (typically a GetLocal) plus a constant offset,
-  // we may be able to fold it in.
-  // The semantics of the Add are to wrap, while wasm offset semantics purposefully do
-  // not wrap. So this is not always safe to do. For example, a load may depend on
-  // wrapping via
-  //      (2^32 - 10) + 100   =>   wrap and load from address 90
-  // Without wrapping, we get something too large, and an error. *However*, for
-  // asm2wasm output coming from Emscripten, we allocate the lowest 1024 for mapped
-  // globals. Mapped globals are simple types (i32, float or double), always
-  // accessed directly by a single constant. Therefore if we see (..) + K where
-  // K is less then 1024, then if it wraps, it wraps into [0, 1024) which is at best
-  // a mapped global, but it can't be because they are accessed directly (at worst,
-  // it's 0 or an unused section of memory that was reserved for mapped globlas).
-  // Thus it is ok to optimize such small constants into Load offsets.
-
-  #define SAFE_MAX 1024
-
-  void optimizeMemoryAccess(Expression*& ptr, Address& offset) {
-    while (1) {
-      auto* add = ptr->dynCast<Binary>();
-      if (!add) break;
-      if (add->op != AddInt32) break;
-      auto* left = add->left->dynCast<Const>();
-      auto* right = add->right->dynCast<Const>();
-      // note: in optimized code, we shouldn't see an add of two constants, so don't worry about that much
-      // (precompute would optimize that)
-      if (left) {
-        auto value = left->value.geti32();
-        if (value >= 0 && value < SAFE_MAX) {
-          offset = offset + value;
-          ptr = add->right;
-          continue;
-        }
-      }
-      if (right) {
-        auto value = right->value.geti32();
-        if (value >= 0 && value < SAFE_MAX) {
-          offset = offset + value;
-          ptr = add->left;
-          continue;
-        }
-      }
-      break;
-    }
-    // finally ptr may be a const, but it isn't worth folding that in (we still have a const); in fact,
-    // it's better to do the opposite for gzip purposes as well as for readability.
-    auto* last = ptr->dynCast<Const>();
-    if (last) {
-      last->value = Literal(int32_t(last->value.geti32() + offset));
-      offset = 0;
-    }
-  }
-
-  void visitLoad(Load* curr) {
-    optimizeMemoryAccess(curr->ptr, curr->offset);
-  }
-  void visitStore(Store* curr) {
-    optimizeMemoryAccess(curr->ptr, curr->offset);
-  }
-
   void visitCall(Call* curr) {
     // special asm.js imports can be optimized
     auto* func = getModule()->getFunction(curr->target);
diff --git a/src/passes/Precompute.cpp b/src/passes/Precompute.cpp
index c23babda4..5e9d9c9ea 100644
--- a/src/passes/Precompute.cpp
+++ b/src/passes/Precompute.cpp
@@ -15,7 +15,11 @@
  */
 
 //
-// Computes code at compile time where possible.
+// Computes code at compile time where possible, replacing it with the
+// computed constant.
+//
+// The "propagate" variant of this pass also propagates constants across
+// sets and gets, which implements a standard constant propagation.
 //
 // Possible nondeterminism: WebAssembly NaN signs are nondeterministic,
 // and this pass may optimize e.g. a float 0 / 0 into +nan while a VM may
@@ -267,6 +271,7 @@ private:
     // compute all dependencies
     LocalGraph localGraph(func);
     localGraph.computeInfluences();
+    localGraph.computeSSAIndexes();
     // prepare the work list. we add things here that might change to a constant
    // initially, that means everything
     std::unordered_set<Expression*> work;
diff --git a/src/passes/SafeHeap.cpp b/src/passes/SafeHeap.cpp
index f170041e1..3cc1021ed 100644
--- a/src/passes/SafeHeap.cpp
+++ b/src/passes/SafeHeap.cpp
@@ -100,7 +100,10 @@ struct AccessInstrumenter : public WalkerPass<PostWalker<AccessInstrumenter>> {
 };
 
 struct SafeHeap : public Pass {
+  PassOptions options;
+
   void run(PassRunner* runner, Module* module) override {
+    options = runner->options;
     // add imports
     addImports(module);
     // instrument loads and stores
@@ -314,13 +317,15 @@ struct SafeHeap : public Pass {
   }
 
   Expression* makeBoundsCheck(Type type, Builder& builder, Index local, Index bytes) {
+    auto upperOp = options.lowMemoryUnused ? LtUInt32 : EqInt32;
+    auto upperBound = options.lowMemoryUnused ? PassOptions::LowMemoryBound : 0;
     return builder.makeIf(
       builder.makeBinary(
         OrInt32,
         builder.makeBinary(
-          EqInt32,
+          upperOp,
           builder.makeGetLocal(local, i32),
-          builder.makeConst(Literal(int32_t(0)))
+          builder.makeConst(Literal(int32_t(upperBound)))
         ),
         builder.makeBinary(
          GtUInt32,
diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp
index 58a5f2f0a..3712bf19e 100644
--- a/src/passes/pass.cpp
+++ b/src/passes/pass.cpp
@@ -100,6 +100,8 @@ void PassRegistry::registerPasses() {
   registerPass("minify-imports-and-exports", "minifies both import and export names, and emits a mapping to the minified ones", createMinifyImportsAndExportsPass);
   registerPass("nm", "name list", createNameListPass);
   registerPass("no-exit-runtime", "removes calls to atexit(), which is valid if the C runtime will never be exited", createNoExitRuntimePass);
+  registerPass("optimize-added-constants", "optimizes added constants into load/store offsets", createOptimizeAddedConstantsPass);
+  registerPass("optimize-added-constants-propagate", "optimizes added constants into load/store offsets, propagating them across locals too", createOptimizeAddedConstantsPropagatePass);
   registerPass("optimize-instructions", "optimizes instruction combinations", createOptimizeInstructionsPass);
   registerPass("optimize-stack-ir", "optimize Stack IR", createOptimizeStackIRPass);
   registerPass("pick-load-signs", "pick load signs based on their uses", createPickLoadSignsPass);
@@ -169,6 +171,13 @@ void PassRunner::addDefaultFunctionOptimizationPasses() {
   } else {
     add("precompute");
   }
+  if (options.lowMemoryUnused) {
+    if (options.optimizeLevel >= 3 || options.shrinkLevel >= 1) {
+      add("optimize-added-constants-propagate");
+    } else {
+      add("optimize-added-constants");
+    }
+  }
   if (options.optimizeLevel >= 2 || options.shrinkLevel >= 2) {
     add("code-pushing");
   }
diff --git a/src/passes/passes.h b/src/passes/passes.h
index 15493f313..ab11721fb 100644
--- a/src/passes/passes.h
+++ b/src/passes/passes.h
@@ -57,6 +57,8 @@ Pass* createMinifyImportsAndExportsPass();
 Pass* createMetricsPass();
 Pass* createNameListPass();
 Pass* createNoExitRuntimePass();
+Pass* createOptimizeAddedConstantsPass();
+Pass* createOptimizeAddedConstantsPropagatePass();
 Pass* createOptimizeInstructionsPass();
 Pass* createOptimizeStackIRPass();
 Pass* createPickLoadSignsPass();
diff --git a/src/tools/optimization-options.h b/src/tools/optimization-options.h
index 53fd60e4d..4b8bfc139 100644
--- a/src/tools/optimization-options.h
+++ b/src/tools/optimization-options.h
@@ -96,6 +96,11 @@ struct OptimizationOptions : public ToolOptions {
          Options::Arguments::Zero,
          [this](Options*, const std::string&) {
            passOptions.ignoreImplicitTraps = true;
+         })
+    .add("--low-memory-unused", "-lmu", "Optimize under the helpful assumption that the low 1K of memory is not used by the application",
+         Options::Arguments::Zero,
+         [this](Options*, const std::string&) {
+           passOptions.lowMemoryUnused = true;
          });
     // add passes in registry
     for (const auto& p : PassRegistry::get()->getRegisteredNames()) {
diff --git a/src/wasm.h b/src/wasm.h
index a1761a430..a16dab478 100644
--- a/src/wasm.h
+++ b/src/wasm.h
@@ -98,12 +98,6 @@ struct Address {
   Address& operator++() { ++addr; return *this; }
 };
 
-// An offset into memory
-typedef int32_t Offset;
-
-// Types
-
-
 // Operators
 
 enum UnaryOp {
diff --git a/src/wasm/wasm-s-parser.cpp b/src/wasm/wasm-s-parser.cpp
index d7b3824a9..7fd4679b1 100644
--- a/src/wasm/wasm-s-parser.cpp
+++ b/src/wasm/wasm-s-parser.cpp
@@ -966,10 +966,10 @@ static size_t parseMemAttributes(Element& s, Address* offset, Address* align, Ad
     throw ParseException("bad memory attribute immediate", s.line, s.col);
   }
   if (str[0] == 'a') {
-    if (value > std::numeric_limits<uint32_t>::max()) throw ParseException("bad align");
+    if (value > std::numeric_limits<uint32_t>::max()) throw ParseException("bad align", s.line, s.col);
     *align = value;
   } else if (str[0] == 'o') {
-    if (value > std::numeric_limits<uint32_t>::max()) throw ParseException("bad offset");
+    if (value > std::numeric_limits<uint32_t>::max()) throw ParseException("bad offset", s.line, s.col);
     *offset = value;
   } else throw ParseException("bad memory attribute");
   i++;