8 files changed, 269 insertions, 30 deletions
diff --git a/src/ir/LocalGraph.cpp b/src/ir/LocalGraph.cpp
index cee187c6d..53c6a52e2 100644
--- a/src/ir/LocalGraph.cpp
+++ b/src/ir/LocalGraph.cpp
@@ -127,11 +127,8 @@ void LocalGraph::visitLoop(Loop* curr) {
         // the get trivially has fewer sets, so it overrode the loop entry sets
         return;
       }
-      std::vector<SetLocal*> intersection;
-      std::set_intersection(beforeSets.begin(), beforeSets.end(),
-                            getSets.begin(), getSets.end(),
-                            std::back_inserter(intersection));
-      if (intersection.size() < beforeSets.size()) {
+      if (!std::includes(getSets.begin(), getSets.end(),
+                         beforeSets.begin(), beforeSets.end())) {
         // the get has not the same sets as in the loop entry
         return;
       }
diff --git a/src/ir/local-graph.h b/src/ir/local-graph.h
index 4c4c1ee0a..ea27fa1fb 100644
--- a/src/ir/local-graph.h
+++ b/src/ir/local-graph.h
@@ -28,8 +28,7 @@ namespace wasm {
 // on this).
 //
 // TODO: the algorithm here is pretty simple, but also pretty slow,
-//       we should optimize it. e.g. we rely on set_interaction
-//       here, and worse we only use it to compute the size...
+//       we should optimize it.
 struct LocalGraph : public PostWalker<LocalGraph> {
   // main API
 
diff --git a/src/passes/CMakeLists.txt b/src/passes/CMakeLists.txt
index 168af2761..e9a157499 100644
--- a/src/passes/CMakeLists.txt
+++ b/src/passes/CMakeLists.txt
@@ -17,6 +17,7 @@ SET(passes_SOURCES
   InstrumentMemory.cpp
   MemoryPacking.cpp
   MergeBlocks.cpp
+  MergeLocals.cpp
   Metrics.cpp
   NameList.cpp
   OptimizeInstructions.cpp
diff --git a/src/passes/MergeLocals.cpp b/src/passes/MergeLocals.cpp
new file mode 100644
index 000000000..31be495c1
--- /dev/null
+++ b/src/passes/MergeLocals.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2016 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//
+// Merges locals when it is beneficial to do so.
+//
+// An obvious case is when locals are copied. In that case, two locals have the
+// same value in a range, and we can pick which of the two to use. For
+// example, in
+//
+//  (if (result i32)
+//   (tee_local $x
+//    (get_local $y)
+//   )
+//   (i32.const 100)
+//   (get_local $x)
+//  )
+// 
+// If that assignment of $y is never used again, everything is fine. But if
+// if is, then the live range of $y does not end in that get, and will
+// necessarily overlap with that of $x - making them appear to interfere
+// with each other in coalesce-locals, even though the value is identical.
+//
+// To fix that, we replace uses of $y with uses of $x. This extends $x's
+// live range and shrinks $y's live range. This tradeoff is not always good,
+// but $x and $y definitely overlap already, so trying to shrink the overlap
+// makes sense - if we remove the overlap entirely, we may be able to let
+// $x and $y be coalesced later.
+//
+// If we can remove only some of $y's uses, then we are definitely not
+// removing the overlap, and they do conflict. In that case, it's not clear
+// if this is beneficial or not, and we don't do it for now
+// TODO: investigate more
+//
+
+#include <wasm.h>
+#include <pass.h>
+#include <wasm-builder.h>
+#include <ir/local-graph.h>
+
+namespace wasm {
+
+struct MergeLocals : public WalkerPass<PostWalker<MergeLocals, UnifiedExpressionVisitor<MergeLocals>>> {
+  bool isFunctionParallel() override { return true; }
+
+  Pass* create() override { return new MergeLocals(); }
+
+  void doWalkFunction(Function* func) {
+    // first, instrument the graph by modifying each copy
+    //   (set_local $x
+    //    (get_local $y)
+    //   )
+    // to
+    //   (set_local $x
+    //    (tee_local $y
+    //     (get_local $y)
+    //    )
+    //   )
+    // That is, we add a trivial assign of $y. This ensures we
+    // have a new assignment of $y at the location of the copy,
+    // which makes it easy for us to see if the value if $y
+    // is still used after that point
+    super::doWalkFunction(func);
+
+    // optimize the copies, merging when we can, and removing
+    // the trivial assigns we added temporarily
+    optimizeCopies();
+  }
+
+  std::vector<SetLocal*> copies;
+
+  void visitSetLocal(SetLocal* curr) {
+    if (auto* get = curr->value->dynCast<GetLocal>()) {
+      if (get->index != curr->index) {
+        Builder builder(*getModule());
+        auto* trivial = builder.makeTeeLocal(get->index, get);
+        curr->value = trivial;
+        copies.push_back(curr);
+      }
+    }
+  }
+
+  void optimizeCopies() {
+    if (copies.empty()) return;
+    // compute all dependencies
+    LocalGraph preGraph(getFunction(), getModule());
+    preGraph.computeInfluences();
+    // optimize each copy
+    std::unordered_map<SetLocal*, SetLocal*> optimizedToCopy, optimizedToTrivial;
+    for (auto* copy : copies) {
+      auto* trivial = copy->value->cast<SetLocal>();
+      bool canOptimizeToCopy = false;
+      auto& trivialInfluences = preGraph.setInfluences[trivial];
+      if (!trivialInfluences.empty()) {
+        canOptimizeToCopy = true;
+        for (auto* influencedGet : trivialInfluences) {
+          // this get uses the trivial write, so it uses the value in the copy.
+          // however, it may depend on other writes too, if there is a merge/phi,
+          // and in that case we can't do anything
+          assert(influencedGet->index == trivial->index);
+          if (preGraph.getSetses[influencedGet].size() == 1) {
+            // this is ok
+            assert(*preGraph.getSetses[influencedGet].begin() == trivial);
+          } else {
+            canOptimizeToCopy = false;
+            break;
+          }
+        }
+      }
+      if (canOptimizeToCopy) {
+        // worth it for this copy, do it
+        for (auto* influencedGet : trivialInfluences) {
+          influencedGet->index = copy->index;
+        }
+        optimizedToCopy[copy] = trivial;
+      } else {
+        // alternatively, we can try to remove the conflict in the opposite way: given
+        //   (set_local $x
+        //    (get_local $y)
+        //   )
+        // we can look for uses of $x that could instead be uses of $y. this extends
+        // $y's live range, but if it removes the conflict between $x and $y, it may be
+        // worth it
+        if (!trivialInfluences.empty()) { // if the trivial set we added has influences, it means $y lives on
+          auto& copyInfluences = preGraph.setInfluences[copy];
+          if (!copyInfluences.empty()) {
+            bool canOptimizeToTrivial = true;
+            for (auto* influencedGet : copyInfluences) {
+              // as above, avoid merges/phis
+              assert(influencedGet->index == copy->index);
+              if (preGraph.getSetses[influencedGet].size() == 1) {
+                // this is ok
+                assert(*preGraph.getSetses[influencedGet].begin() == copy);
+              } else {
+                canOptimizeToTrivial = false;
+                break;
+              }
+            }
+            if (canOptimizeToTrivial) {
+              // worth it for this copy, do it
+              for (auto* influencedGet : copyInfluences) {
+                influencedGet->index = trivial->index;
+              }
+              optimizedToTrivial[copy] = trivial;
+              // note that we don't
+            }
+          }
+        }
+      }
+    }
+    if (!optimizedToCopy.empty() || !optimizedToTrivial.empty()) {
+      // finally, we need to verify that the changes work properly, that is,
+      // they use the value from the right place (and are not affected by
+      // another set of the index we changed to).
+      // if one does not work, we need to undo all its siblings (don't extend
+      // the live range unless we are definitely removing a conflict, same
+      // logic as before).
+      LocalGraph postGraph(getFunction(), getModule());
+      postGraph.computeInfluences();
+      for (auto& pair : optimizedToCopy) {
+        auto* copy = pair.first;
+        auto* trivial = pair.second;
+        auto& trivialInfluences = preGraph.setInfluences[trivial];
+        for (auto* influencedGet : trivialInfluences) {
+          // verify the set
+          auto& sets = postGraph.getSetses[influencedGet];
+          if (sets.size() != 1 || *sets.begin() != copy) {
+            // not good, undo all the changes for this copy
+            for (auto* undo : trivialInfluences) {
+              undo->index = trivial->index;
+            }
+            break;
+          }
+        }
+      }
+      for (auto& pair : optimizedToTrivial) {
+        auto* copy = pair.first;
+        auto* trivial = pair.second;
+        auto& copyInfluences = preGraph.setInfluences[copy];
+        for (auto* influencedGet : copyInfluences) {
+          // verify the set
+          auto& sets = postGraph.getSetses[influencedGet];
+          if (sets.size() != 1 || *sets.begin() != trivial) {
+            // not good, undo all the changes for this copy
+            for (auto* undo : copyInfluences) {
+              undo->index = copy->index;
+            }
+            break;
+          }
+        }
+        // if this change was ok, we can probably remove the copy itself,
+        // but we leave that for other passes
+      }
+    }
+    // remove the trivial sets
+    for (auto* copy : copies) {
+      copy->value = copy->value->cast<SetLocal>()->value;
+    }
+  }
+};
+
+Pass *createMergeLocalsPass() {
+  return new MergeLocals();
+}
+
+} // namespace wasm
+
diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp
index f2463691a..89467899c 100644
--- a/src/passes/pass.cpp
+++ b/src/passes/pass.cpp
@@ -83,6 +83,7 @@ void PassRegistry::registerPasses() {
   registerPass("instrument-memory", "instrument the build with code to intercept all loads and stores", createInstrumentMemoryPass);
   registerPass("memory-packing", "packs memory into separate segments, skipping zeros", createMemoryPackingPass);
   registerPass("merge-blocks", "merges blocks to their parents", createMergeBlocksPass);
+  registerPass("merge-locals", "merges locals when beneficial", createMergeLocalsPass);
   registerPass("metrics", "reports metrics", createMetricsPass);
   registerPass("nm", "name list", createNameListPass);
   registerPass("optimize-instructions", "optimizes instruction combinations", createOptimizeInstructionsPass);
@@ -140,6 +141,10 @@ void PassRunner::addDefaultFunctionOptimizationPasses() {
   add("vacuum"); // previous pass creates garbage
   add("reorder-locals");
   add("remove-unused-brs"); // simplify-locals opens opportunities for optimizations
+  // if we are willing to work hard, also optimize copies before coalescing
+  if (options.optimizeLevel >= 3 || options.shrinkLevel >= 2) {
+    add("merge-locals"); // very slow on e.g. sqlite
+  }
   add("coalesce-locals");
   add("simplify-locals");
   add("vacuum"); // previous pass creates garbage
diff --git a/src/passes/passes.h b/src/passes/passes.h
index 957ca2d68..5cefdea17 100644
--- a/src/passes/passes.h
+++ b/src/passes/passes.h
@@ -42,6 +42,7 @@ Pass* createInstrumentLocalsPass();
 Pass* createInstrumentMemoryPass();
 Pass* createMemoryPackingPass();
 Pass* createMergeBlocksPass();
+Pass* createMergeLocalsPass();
 Pass* createMinifiedPrinterPass();
 Pass* createMetricsPass();
 Pass* createNameListPass();
diff --git a/src/tools/fuzzing.h b/src/tools/fuzzing.h
index be12b561a..baaaf1b23 100644
--- a/src/tools/fuzzing.h
+++ b/src/tools/fuzzing.h
@@ -480,6 +480,14 @@ private:
   }
 
   Expression* _makeConcrete(WasmType type) {
+    auto choice = upTo(100);
+    if (choice < 10) return makeConst(type);
+    if (choice < 30) return makeSetLocal(type);
+    if (choice < 50) return makeGetLocal(type);
+    if (choice < 60) return makeBlock(type);
+    if (choice < 70) return makeIf(type);
+    if (choice < 80) return makeLoop(type);
+    if (choice < 90) return makeBreak(type);
     switch (upTo(15)) {
       case 0: return makeBlock(type);
       case 1: return makeIf(type);
@@ -501,6 +509,12 @@ private:
   }
 
   Expression* _makenone() {
+    auto choice = upTo(100);
+    if (choice < 50) return makeSetLocal(none);
+    if (choice < 60) return makeBlock(none);
+    if (choice < 70) return makeIf(none);
+    if (choice < 80) return makeLoop(none);
+    if (choice < 90) return makeBreak(none);
     switch (upTo(11)) {
       case 0: return makeBlock(none);
       case 1: return makeIf(none);
diff --git a/src/tools/wasm-opt.cpp b/src/tools/wasm-opt.cpp
index 8fe55aef1..374c1b5bd 100644
--- a/src/tools/wasm-opt.cpp
+++ b/src/tools/wasm-opt.cpp
@@ -182,36 +182,38 @@ int main(int argc, const char* argv[]) {
     std::cout << "[extra-fuzz-command first output:]\n" << firstOutput << '\n';
   }
 
+  Module* curr = &wasm;
+  Module other;
+
+  if (fuzzExec && fuzzBinary) {
+    BufferWithRandomAccess buffer(false);
+    // write the binary
+    WasmBinaryWriter writer(&wasm, buffer, false);
+    writer.write();
+    // read the binary
+    auto input = buffer.getAsChars();
+    WasmBinaryBuilder parser(other, input, false);
+    parser.read();
+    bool valid = WasmValidator().validate(other, features);
+    if (!valid) {
+      WasmPrinter::printModule(&other);
+    }
+    assert(valid);
+    curr = &other;
+  }
+
   if (options.runningPasses()) {
     if (options.debug) std::cerr << "running passes...\n";
-    options.runPasses(wasm);
-    bool valid = WasmValidator().validate(wasm, features);
+    options.runPasses(*curr);
+    bool valid = WasmValidator().validate(*curr, features);
     if (!valid) {
-      WasmPrinter::printModule(&wasm);
+      WasmPrinter::printModule(&*curr);
     }
     assert(valid);
   }
 
   if (fuzzExec) {
-    auto* compare = &wasm;
-    Module second;
-    if (fuzzBinary) {
-      compare = &second;
-      BufferWithRandomAccess buffer(false);
-      // write the binary
-      WasmBinaryWriter writer(&wasm, buffer, false);
-      writer.write();
-      // read the binary
-      auto input = buffer.getAsChars();
-      WasmBinaryBuilder parser(second, input, false);
-      parser.read();
-      bool valid = WasmValidator().validate(second, features);
-      if (!valid) {
-        WasmPrinter::printModule(&second);
-      }
-      assert(valid);
-    }
-    results.check(*compare);
+    results.check(*curr);
   }
 
   if (options.extra.count("output") > 0) {
@@ -220,7 +222,7 @@ int main(int argc, const char* argv[]) {
     writer.setDebug(options.debug);
     writer.setBinary(emitBinary);
     writer.setDebugInfo(debugInfo);
-    writer.write(wasm, options.extra["output"]);
+    writer.write(*curr, options.extra["output"]);
 
     if (extraFuzzCommand.size() > 0) {
       auto secondOutput = runCommand(extraFuzzCommand);