Consistently optimize small added constants into load/store offsets (#1924)

See #1919 - we did not do this consistently before. This adds a lowMemoryUnused option to PassOptions. It can be passed on the commandline with --low-memory-unused. If enabled, we run the new optimize-added-constants pass, which does the real work here, replacing older code in post-emscripten. Aside from running at the proper time (unlike the old pass, see #1919), this also has a -propagate mode, which can do stuff like this: y = x + 10 [..] load(y) [..] load(y) => y = x + 10 [..] load(x, offset=10) [..] load(x, offset=10) That is, it can propagate such offsets to the loads/stores. This pattern is common in big interpreter loops, where the pointers are offsets into a big struct of state. The pass does this propagation by using a new feature of LocalGraph, which can verify which locals are in SSA mode. Binaryen IR is not SSA (intentionally, since it's a later IR), but if a local only has a single set for all gets, that means that local is in such a state, and can be optimized. The tricky thing is that all locals are initialized to zero, so there are at minimum two sets. But if we verify that the real set dominates all the gets, then the zero initialization cannot reach them, and we are safe. This PR also makes safe-heap aware of lowMemoryUnused. If so, we check for not just an access of 0, but the range 0-1023. This makes zlib 5% faster, with either the wasm backend or asm2wasm. It also makes it 0.5% smaller. Also helps sqlite (1.5% faster) and lua (1% faster)
author: Alon Zakai <alonzakai@gmail.com> 2019-03-01 10:28:07 -0800
committer: GitHub <noreply@github.com> 2019-03-01 10:28:07 -0800
commit: 689fe405a3417fbfd59456035add6f6f53149f35 (patch)
tree: d6f1dcaf0cbb85eb3ae830f68a46c9a6627d1562 /src/passes
parent: f59c3033e678ced61bc8c78e8ac9fbee31ef0210 (diff)
download: binaryen-689fe405a3417fbfd59456035add6f6f53149f35.tar.gz
binaryen-689fe405a3417fbfd59456035add6f6f53149f35.tar.bz2
binaryen-689fe405a3417fbfd59456035add6f6f53149f35.zip
9 files changed, 289 insertions, 82 deletions
diff --git a/src/passes/CMakeLists.txt b/src/passes/CMakeLists.txt
index 0ad9dbf05..2848946f8 100644
--- a/src/passes/CMakeLists.txt
+++ b/src/passes/CMakeLists.txt
@@ -31,6 +31,7 @@ SET(passes_SOURCES
   MinifyImportsAndExports.cpp
   NameList.cpp
   NoExitRuntime.cpp
+  OptimizeAddedConstants.cpp
   OptimizeInstructions.cpp
   PickLoadSigns.cpp
   PostEmscripten.cpp
diff --git a/src/passes/CodePushing.cpp b/src/passes/CodePushing.cpp
index 931df140d..52aab08ad 100644
--- a/src/passes/CodePushing.cpp
+++ b/src/passes/CodePushing.cpp
@@ -77,8 +77,8 @@ struct LocalAnalyzer : public PostWalker<LocalAnalyzer> {
   }
 };
 
-// Implement core optimization logic in a struct, used and then discarded entirely
-// for each block
+// Implements core optimization logic. Used and then discarded entirely
+// for each block.
 class Pusher {
   ExpressionList& list;
   LocalAnalyzer& analyzer;
@@ -92,7 +92,7 @@ public:
     // continuing forward.
     Index relevant = list.size() - 1; // we never need to push past a final element, as
                                       // we couldn't be used after it.
-    Index nothing = -1;
+    const Index nothing = -1;
     Index i = 0;
     Index firstPushable = nothing;
     while (i < relevant) {
diff --git a/src/passes/OptimizeAddedConstants.cpp b/src/passes/OptimizeAddedConstants.cpp
new file mode 100644
index 000000000..119d61637
--- /dev/null
+++ b/src/passes/OptimizeAddedConstants.cpp
@@ -0,0 +1,239 @@
+/*
+ * Copyright 2019 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//
+// Optimize added constants into load/store offsets. This requires the assumption
+// that low memory is unused, so that we can replace an add (which might wrap)
+// with a load/store offset (which does not).
+//
+// The propagate option also propagates offsets across set/get local pairs.
+//
+// Optimizing constants into load/store offsets is almost always
+// beneficial for speed, as VMs can optimize these operations better.
+// If a LocalGraph is provided, this can also propagate values along get/set
+// pairs. In such a case, we may increase code size slightly or reduce
+// compressibility (e.g., replace (load (get $x)) with (load offset=Z (get $y)),
+// where Z is big enough to not fit in a single byte), but this is good for
+// speed, and may lead to code size reductions elsewhere by using fewer locals.
+//
+
+#include <wasm.h>
+#include <pass.h>
+#include <wasm-builder.h>
+#include <ir/local-graph.h>
+
+namespace wasm {
+
+template<typename T>
+class MemoryAccessOptimizer {
+public:
+  MemoryAccessOptimizer(T* curr, Module* module, LocalGraph* localGraph) : curr(curr), module(module), localGraph(localGraph) {
+    // The pointer itself may be a constant, if e.g. it was precomputed or
+    // a get that we propagated.
+    if (curr->ptr->template is<Const>()) {
+      optimizeConstantPointer();
+      return;
+    }
+    if (auto* add = curr->ptr->template dynCast<Binary>()) {
+      if (add->op == AddInt32) {
+        // Look for a constant on both sides.
+        if (tryToOptimizeConstant(add->right, add->left) ||
+            tryToOptimizeConstant(add->left, add->right)) {
+          return;
+        }
+      }
+    }
+    if (localGraph) {
+      // A final important case is a propagated add:
+      //
+      //  x = y + 10
+      //  ..
+      //  load(x)
+      // =>
+      //  x = y + 10
+      //  ..
+      //  load(y, offset=10)
+      //
+      // This is only valid if y does not change in the middle!
+      if (auto* get = curr->ptr->template dynCast<GetLocal>()) {
+        auto& sets = localGraph->getSetses[get];
+        if (sets.size() == 1) {
+          auto* set = *sets.begin();
+          // May be a zero-init (in which case, we can ignore it).
+          if (set) {
+            auto* value = set->value;
+            if (auto* add = value->template dynCast<Binary>()) {
+              if (add->op == AddInt32) {
+                // We can optimize on either side, but only if both we find
+                // a constant *and* the other side cannot change in the middle.
+                // TODO If it could change, we may add a new local to capture the
+                //      old value.
+                if (tryToOptimizePropagatedAdd(add->right, add->left, get) ||
+                   tryToOptimizePropagatedAdd(add->left, add->right, get)) {
+                  return;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+private:
+  T* curr;
+  Module* module;
+  LocalGraph* localGraph;
+
+  void optimizeConstantPointer() {
+    // The constant and an offset are interchangeable:
+    //   (load (const X))  <=>  (load offset=X (const 0))
+    // It may not matter if we do this or not - it's the same size,
+    // and in both cases the compiler can see it's a constant location.
+    // For code clarity and compressibility, we prefer to put the
+    // entire address in the constant.
+    if (curr->offset) {
+      // Note that the offset may already be larger than low memory - the
+      // code may know that is valid, even if we can't. Only handle the
+      // obviously valid case where an overflow can't occur.
+      auto* c = curr->ptr->template cast<Const>();
+      uint32_t base = c->value.geti32();
+      uint32_t offset = curr->offset;
+      if (uint64_t(base) + uint64_t(offset) < (uint64_t(1) << 32)) {
+        c->value = c->value.add(Literal(uint32_t(curr->offset)));
+        curr->offset = 0;
+      }
+    }
+  }
+
+  struct Result {
+    bool succeeded;
+    Address total;
+    Result() : succeeded(false) {}
+    Result(Address total) : succeeded(true), total(total) {}
+  };
+
+  // See if we can optimize an offset from an expression. If we report
+  // success, the returned offset can be added as a replacement for the
+  // expression here.
+  bool tryToOptimizeConstant(Expression* oneSide, Expression* otherSide) {
+    if (auto* c = oneSide->template dynCast<Const>()) {
+      auto result = canOptimizeConstant(c->value);
+      if (result.succeeded) {
+        curr->offset = result.total;
+        curr->ptr = otherSide;
+        if (curr->ptr->template is<Const>()) {
+          optimizeConstantPointer();
+        }
+        return true;
+      }
+    }
+    return false;
+  }
+
+  bool tryToOptimizePropagatedAdd(Expression* oneSide, Expression* otherSide, GetLocal* ptr) {
+    if (auto* c = oneSide->template dynCast<Const>()) {
+      auto result = canOptimizeConstant(c->value);
+      if (result.succeeded) {
+        // Looks good, but we need to make sure the other side cannot change:
+        //
+        //  x = y + 10
+        //  y = y + 1
+        //  load(x)
+        //
+        // This example should not be optimized into
+        //
+        //  load(x, offset=10)
+        //
+        // If the other side is a get, we may be able to prove that we can just use that same
+        // local, if both it and the pointer are in SSA form. In that case,
+        //
+        //  y = .. // single assignment that dominates all uses
+        //  x = y + 10 // single assignment that dominates all uses
+        //  [..]
+        //  load(x) => load(y, offset=10)
+        //
+        // This is valid since dominance is transitive, so y's definition dominates the load,
+        // and it is ok to replace x with y + 10 there.
+        // TODO otherwise, create a new local
+        if (auto* get = otherSide->template dynCast<GetLocal>()) {
+          if (localGraph->isSSA(get->index) && localGraph->isSSA(ptr->index)) {
+            curr->offset = result.total;
+            curr->ptr = Builder(*module).makeGetLocal(get->index, get->type);
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  }
+
+  // Sees if we can optimize a particular constant.
+  Result canOptimizeConstant(Literal literal) {
+    auto value = literal.geti32();
+    // Avoid uninteresting corner cases with peculiar offsets.
+    if (value >= 0 && value < PassOptions::LowMemoryBound) {
+      // The total offset must not allow reaching reasonable memory
+      // by overflowing.
+      auto total = curr->offset + value;
+      if (total < PassOptions::LowMemoryBound) {
+        return Result(total);
+      }
+    }
+    return Result();
+  }
+};
+
+struct OptimizeAddedConstants : public WalkerPass<PostWalker<OptimizeAddedConstants, UnifiedExpressionVisitor<OptimizeAddedConstants>>> {
+  bool isFunctionParallel() override { return true; }
+
+  bool propagate;
+
+  OptimizeAddedConstants(bool propagate) : propagate(propagate) {}
+
+  Pass* create() override { return new OptimizeAddedConstants(propagate); }
+
+  std::unique_ptr<LocalGraph> localGraph;
+
+  void visitLoad(Load* curr) {
+    MemoryAccessOptimizer<Load>(curr, getModule(), localGraph.get());
+  }
+
+  void visitStore(Store* curr) {
+    MemoryAccessOptimizer<Store>(curr, getModule(), localGraph.get());
+  }
+
+  void doWalkFunction(Function* func) {
+    // This pass is only valid under the assumption of unused low memory.
+    assert(getPassOptions().lowMemoryUnused);
+    if (propagate) {
+      localGraph = make_unique<LocalGraph>(func);
+      localGraph->computeSSAIndexes();
+    }
+    super::doWalkFunction(func);
+  }
+};
+
+Pass *createOptimizeAddedConstantsPass() {
+  return new OptimizeAddedConstants(false);
+}
+
+Pass *createOptimizeAddedConstantsPropagatePass() {
+  return new OptimizeAddedConstants(true);
+}
+
+} // namespace wasm
+
diff --git a/src/passes/PickLoadSigns.cpp b/src/passes/PickLoadSigns.cpp
index 827346839..fce50b4bb 100644
--- a/src/passes/PickLoadSigns.cpp
+++ b/src/passes/PickLoadSigns.cpp
@@ -46,22 +46,8 @@ struct PickLoadSigns : public WalkerPass<ExpressionStackWalker<PickLoadSigns>> {
     usages.resize(func->getNumLocals());
     // walk
     ExpressionStackWalker<PickLoadSigns>::doWalkFunction(func);
-    // optimize based on the info we saw
-    for (auto& pair : loads) {
-      auto* load = pair.first;
-      auto index = pair.second;
-      auto& usage = usages[index];
-      // if we can't optimize, give up
-      if (usage.totalUsages == 0 || // no usages, so no idea
-          usage.signedUsages + usage.unsignedUsages != usage.totalUsages || // non-sign/unsigned usages, so cannot change
-          (usage.signedUsages   != 0 && usage.signedBits   != load->bytes * 8) || // sign usages exist but the wrong size
-          (usage.unsignedUsages != 0 && usage.unsignedBits != load->bytes * 8)) { // unsigned usages exist but the wrong size
-        continue;
-      }
-      // we can pick the optimal one. our hope is to remove 2 items per
-      // signed use (two shifts), so we factor that in
-      load->signed_ = usage.signedUsages * 2 >= usage.unsignedUsages;
-    }
+    // optimize
+    optimize();
   }
 
   void visitGetLocal(GetLocal* curr) {
@@ -102,6 +88,26 @@ struct PickLoadSigns : public WalkerPass<ExpressionStackWalker<PickLoadSigns>> {
       loads[load] = curr->index;
     }
   }
+
+  void optimize() {
+    // optimize based on the info we saw
+    for (auto& pair : loads) {
+      auto* load = pair.first;
+      auto index = pair.second;
+      auto& usage = usages[index];
+      // if we can't optimize, give up
+      if (usage.totalUsages == 0 || // no usages, so no idea
+          usage.signedUsages + usage.unsignedUsages != usage.totalUsages || // non-sign/unsigned usages, so cannot change
+          (usage.signedUsages   != 0 && usage.signedBits   != load->bytes * 8) || // sign usages exist but the wrong size
+          (usage.unsignedUsages != 0 && usage.unsignedBits != load->bytes * 8)) { // unsigned usages exist but the wrong size
+        continue;
+      }
+      // we can pick the optimal one. our hope is to remove 2 items per
+      // signed use (two shifts), so we factor that in
+      load->signed_ = usage.signedUsages * 2 >= usage.unsignedUsages;
+    }
+  }
+
 };
 
 Pass *createPickLoadSignsPass() {
diff --git a/src/passes/PostEmscripten.cpp b/src/passes/PostEmscripten.cpp
index 72c0d8808..7e2bacf25 100644
--- a/src/passes/PostEmscripten.cpp
+++ b/src/passes/PostEmscripten.cpp
@@ -32,66 +32,6 @@ struct PostEmscripten : public WalkerPass<PostWalker<PostEmscripten>> {
 
   Pass* create() override { return new PostEmscripten; }
 
-  // When we have a Load from a local value (typically a GetLocal) plus a constant offset,
-  // we may be able to fold it in.
-  // The semantics of the Add are to wrap, while wasm offset semantics purposefully do
-  // not wrap. So this is not always safe to do. For example, a load may depend on
-  // wrapping via
-  //      (2^32 - 10) + 100   =>  wrap and load from address 90
-  // Without wrapping, we get something too large, and an error. *However*, for
-  // asm2wasm output coming from Emscripten, we allocate the lowest 1024 for mapped
-  // globals. Mapped globals are simple types (i32, float or double), always
-  // accessed directly by a single constant. Therefore if we see (..) + K where
-  // K is less then 1024, then if it wraps, it wraps into [0, 1024) which is at best
-  // a mapped global, but it can't be because they are accessed directly (at worst,
-  // it's 0 or an unused section of memory that was reserved for mapped globlas).
-  // Thus it is ok to optimize such small constants into Load offsets.
-
-  #define SAFE_MAX 1024
-
-  void optimizeMemoryAccess(Expression*& ptr, Address& offset) {
-    while (1) {
-      auto* add = ptr->dynCast<Binary>();
-      if (!add) break;
-      if (add->op != AddInt32) break;
-      auto* left = add->left->dynCast<Const>();
-      auto* right = add->right->dynCast<Const>();
-      // note: in optimized code, we shouldn't see an add of two constants, so don't worry about that much
-      // (precompute would optimize that)
-      if (left) {
-        auto value = left->value.geti32();
-        if (value >= 0 && value < SAFE_MAX) {
-          offset = offset + value;
-          ptr = add->right;
-          continue;
-        }
-      }
-      if (right) {
-        auto value = right->value.geti32();
-        if (value >= 0 && value < SAFE_MAX) {
-          offset = offset + value;
-          ptr = add->left;
-          continue;
-        }
-      }
-      break;
-    }
-    // finally ptr may be a const, but it isn't worth folding that in (we still have a const); in fact,
-    // it's better to do the opposite for gzip purposes as well as for readability.
-    auto* last = ptr->dynCast<Const>();
-    if (last) {
-      last->value = Literal(int32_t(last->value.geti32() + offset));
-      offset = 0;
-    }
-  }
-
-  void visitLoad(Load* curr) {
-    optimizeMemoryAccess(curr->ptr, curr->offset);
-  }
-  void visitStore(Store* curr) {
-    optimizeMemoryAccess(curr->ptr, curr->offset);
-  }
-
   void visitCall(Call* curr) {
     // special asm.js imports can be optimized
     auto* func = getModule()->getFunction(curr->target);
diff --git a/src/passes/Precompute.cpp b/src/passes/Precompute.cpp
index c23babda4..5e9d9c9ea 100644
--- a/src/passes/Precompute.cpp
+++ b/src/passes/Precompute.cpp
@@ -15,7 +15,11 @@
  */
 
 //
-// Computes code at compile time where possible.
+// Computes code at compile time where possible, replacing it with the
+// computed constant.
+//
+// The "propagate" variant of this pass also propagates constants across
+// sets and gets, which implements a standard constant propagation.
 //
 // Possible nondeterminism: WebAssembly NaN signs are nondeterministic,
 // and this pass may optimize e.g. a float 0 / 0 into +nan while a VM may
@@ -267,6 +271,7 @@ private:
     // compute all dependencies
     LocalGraph localGraph(func);
     localGraph.computeInfluences();
+    localGraph.computeSSAIndexes();
     // prepare the work list. we add things here that might change to a constant
     // initially, that means everything
     std::unordered_set<Expression*> work;
diff --git a/src/passes/SafeHeap.cpp b/src/passes/SafeHeap.cpp
index f170041e1..3cc1021ed 100644
--- a/src/passes/SafeHeap.cpp
+++ b/src/passes/SafeHeap.cpp
@@ -100,7 +100,10 @@ struct AccessInstrumenter : public WalkerPass<PostWalker<AccessInstrumenter>> {
 };
 
 struct SafeHeap : public Pass {
+  PassOptions options;
+
   void run(PassRunner* runner, Module* module) override {
+    options = runner->options;
     // add imports
     addImports(module);
     // instrument loads and stores
@@ -314,13 +317,15 @@ struct SafeHeap : public Pass {
   }
 
   Expression* makeBoundsCheck(Type type, Builder& builder, Index local, Index bytes) {
+    auto upperOp = options.lowMemoryUnused ? LtUInt32 : EqInt32;
+    auto upperBound = options.lowMemoryUnused ? PassOptions::LowMemoryBound : 0;
     return builder.makeIf(
       builder.makeBinary(
         OrInt32,
         builder.makeBinary(
-          EqInt32,
+          upperOp,
           builder.makeGetLocal(local, i32),
-          builder.makeConst(Literal(int32_t(0)))
+          builder.makeConst(Literal(int32_t(upperBound)))
         ),
         builder.makeBinary(
           GtUInt32,
diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp
index 58a5f2f0a..3712bf19e 100644
--- a/src/passes/pass.cpp
+++ b/src/passes/pass.cpp
@@ -100,6 +100,8 @@ void PassRegistry::registerPasses() {
   registerPass("minify-imports-and-exports", "minifies both import and export names, and emits a mapping to the minified ones", createMinifyImportsAndExportsPass);
   registerPass("nm", "name list", createNameListPass);
   registerPass("no-exit-runtime", "removes calls to atexit(), which is valid if the C runtime will never be exited", createNoExitRuntimePass);
+  registerPass("optimize-added-constants", "optimizes added constants into load/store offsets", createOptimizeAddedConstantsPass);
+  registerPass("optimize-added-constants-propagate", "optimizes added constants into load/store offsets, propagating them across locals too", createOptimizeAddedConstantsPropagatePass);
   registerPass("optimize-instructions", "optimizes instruction combinations", createOptimizeInstructionsPass);
   registerPass("optimize-stack-ir", "optimize Stack IR", createOptimizeStackIRPass);
   registerPass("pick-load-signs", "pick load signs based on their uses", createPickLoadSignsPass);
@@ -169,6 +171,13 @@ void PassRunner::addDefaultFunctionOptimizationPasses() {
   } else {
     add("precompute");
   }
+  if (options.lowMemoryUnused) {
+    if (options.optimizeLevel >= 3 || options.shrinkLevel >= 1) {
+      add("optimize-added-constants-propagate");
+    } else {
+      add("optimize-added-constants");
+    }
+  }
   if (options.optimizeLevel >= 2 || options.shrinkLevel >= 2) {
     add("code-pushing");
   }
diff --git a/src/passes/passes.h b/src/passes/passes.h
index 15493f313..ab11721fb 100644
--- a/src/passes/passes.h
+++ b/src/passes/passes.h
@@ -57,6 +57,8 @@ Pass* createMinifyImportsAndExportsPass();
 Pass* createMetricsPass();
 Pass* createNameListPass();
 Pass* createNoExitRuntimePass();
+Pass* createOptimizeAddedConstantsPass();
+Pass* createOptimizeAddedConstantsPropagatePass();
 Pass* createOptimizeInstructionsPass();
 Pass* createOptimizeStackIRPass();
 Pass* createPickLoadSignsPass();
author	Alon Zakai <alonzakai@gmail.com>	2019-03-01 10:28:07 -0800
committer	GitHub <noreply@github.com>	2019-03-01 10:28:07 -0800
commit	689fe405a3417fbfd59456035add6f6f53149f35 (patch)
tree	d6f1dcaf0cbb85eb3ae830f68a46c9a6627d1562 /src/passes
parent	f59c3033e678ced61bc8c78e8ac9fbee31ef0210 (diff)
download	binaryen-689fe405a3417fbfd59456035add6f6f53149f35.tar.gz binaryen-689fe405a3417fbfd59456035add6f6f53149f35.tar.bz2 binaryen-689fe405a3417fbfd59456035add6f6f53149f35.zip