author     Alon Zakai <alonzakai@gmail.com>   2018-01-17 13:11:14 -0800
committer  GitHub <noreply@github.com>        2018-01-17 13:11:14 -0800
commit     0a9ddae715b2cff23a86cf14bbf6a4b870511395 (patch)
tree       ac15dd6da937509b22b6cc5b97f21fa508f48570 /src
parent     01b23987405d8d7b2f13e40ef906169163ac2a5f (diff)
Global optimization fixes (#1360)
* run dfe (duplicate-function-elimination) at the very end, as it may be more effective after inlining
* optimize reorder-functions
* do a final dfe in asm2wasm after all other opts
* make inlining deterministic: std::atomic<T> values are not zero-initialized (see the sketch after this list)
* do global post opts at the end of asm2wasm, and don't also do them in the module builder
* fix function type removal
* don't inline+optimize when preserving debug info
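For context on the determinism fix: before C++20, a default-initialized std::atomic<T> holds an indeterminate value, so counters in a plain struct member must be initialized explicitly. A minimal standalone sketch (not from this commit) of the problem and the fix, mirroring what the Inlining.cpp hunk below does for FunctionInfo:

```cpp
#include <atomic>
#include <cstdio>

struct Info {
  std::atomic<int> calls; // default-initialized: value is indeterminate pre-C++20
  Info() { calls = 0; }   // the fix: start from a known value, as FunctionInfo now does
};

int main() {
  Info info;
  info.calls++;
  std::printf("%d\n", info.calls.load()); // reliably prints 1
  return 0;
}
```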
Diffstat (limited to 'src')

 src/asm2wasm.h                            | 35
 src/passes/Inlining.cpp                   | 11
 src/passes/RemoveUnusedModuleElements.cpp |  1
 src/passes/ReorderFunctions.cpp           | 56
 src/passes/pass.cpp                       |  9
 src/wasm-module-building.h                | 13
 src/wasm.h                                |  1
 src/wasm/wasm.cpp                         | 12

8 files changed, 96 insertions, 42 deletions
diff --git a/src/asm2wasm.h b/src/asm2wasm.h
index b5f701dac..aa8394d3a 100644
--- a/src/asm2wasm.h
+++ b/src/asm2wasm.h
@@ -966,21 +966,6 @@ void Asm2WasmBuilder::processAsm(Ref ast) {
       }
       // optimize relooper label variable usage at the wasm level, where it is easy
       passRunner.add("relooper-jump-threading");
-    }, [&]() {
-      // beforeGlobalOptimizations
-      // if we added any helper functions (like non-trapping i32-div, etc.), then those
-      // have not been optimized (the optimizing builder has just been fed the asm.js
-      // functions). Optimize those now. Typically there are very few, just do it
-      // sequentially.
-      PassRunner passRunner(&wasm, passOptions);
-      passRunner.addDefaultFunctionOptimizationPasses();
-      for (auto& pair : trappingFunctions.getFunctions()) {
-        auto* func = pair.second;
-        passRunner.runOnFunction(func);
-      }
-      for (auto* func : extraSupportFunctions) {
-        passRunner.runOnFunction(func);
-      }
     }, debug, false /* do not validate globally yet */);
   }
@@ -1190,6 +1175,19 @@ void Asm2WasmBuilder::processAsm(Ref ast) {
   if (runOptimizationPasses) {
     optimizingBuilder->finish();
+    // if we added any helper functions (like non-trapping i32-div, etc.), then those
+    // have not been optimized (the optimizing builder has just been fed the asm.js
+    // functions). Optimize those now. Typically there are very few, just do it
+    // sequentially.
+    PassRunner passRunner(&wasm, passOptions);
+    passRunner.addDefaultFunctionOptimizationPasses();
+    for (auto& pair : trappingFunctions.getFunctions()) {
+      auto* func = pair.second;
+      passRunner.runOnFunction(func);
+    }
+    for (auto* func : extraSupportFunctions) {
+      passRunner.runOnFunction(func);
+    }
   }
 
   wasm.debugInfoFileNames = std::move(preprocessor.debugInfoFileNames);
@@ -1382,7 +1380,7 @@ void Asm2WasmBuilder::processAsm(Ref ast) {
     }
   };
-  PassRunner passRunner(&wasm);
+  PassRunner passRunner(&wasm, passOptions);
   passRunner.setFeatures(passOptions.features);
   if (debug) {
     passRunner.setDebug(true);
@@ -1411,6 +1409,11 @@ void Asm2WasmBuilder::processAsm(Ref ast) {
     passRunner.add<ApplyDebugInfo>();
     passRunner.add("vacuum"); // FIXME maybe just remove the nops that were debuginfo nodes, if not optimizing?
   }
+  if (runOptimizationPasses) {
+    // do final global optimizations after all function work is done
+    // (e.g. duplicate funcs may appear thanks to that work)
+    passRunner.addDefaultGlobalOptimizationPostPasses();
+  }
   passRunner.run();
 
   // remove the debug info intrinsic
diff --git a/src/passes/Inlining.cpp b/src/passes/Inlining.cpp
index b3e111e5a..79785811c 100644
--- a/src/passes/Inlining.cpp
+++ b/src/passes/Inlining.cpp
@@ -50,8 +50,15 @@ static const int FLEXIBLE_SIZE_LIMIT = 20;
 struct FunctionInfo {
   std::atomic<Index> calls;
   Index size;
-  bool lightweight = true;
-  bool usedGlobally = false; // in a table or export
+  std::atomic<bool> lightweight;
+  bool usedGlobally; // in a table or export
+
+  FunctionInfo() {
+    calls = 0;
+    size = 0;
+    lightweight = true;
+    usedGlobally = false;
+  }
 
   bool worthInlining(PassOptions& options, bool allowMultipleInliningsPerFunction) {
     // if it's big, it's just not worth doing (TODO: investigate more)
diff --git a/src/passes/RemoveUnusedModuleElements.cpp b/src/passes/RemoveUnusedModuleElements.cpp
index 13caaccc9..6cbb001b1 100644
--- a/src/passes/RemoveUnusedModuleElements.cpp
+++ b/src/passes/RemoveUnusedModuleElements.cpp
@@ -268,6 +268,7 @@ struct RemoveUnusedModuleElements : public Pass {
     module->functionTypes.erase(std::remove_if(module->functionTypes.begin(), module->functionTypes.end(),
       [&needed](std::unique_ptr<FunctionType>& type) {
         return needed.count(type.get()) == 0;
       }), module->functionTypes.end());
+    module->updateMaps();
   }
 };
diff --git a/src/passes/ReorderFunctions.cpp b/src/passes/ReorderFunctions.cpp
index c468fd9d3..5312ee90a 100644
--- a/src/passes/ReorderFunctions.cpp
+++ b/src/passes/ReorderFunctions.cpp
@@ -19,6 +19,13 @@
 // binaries because fewer bytes are needed to encode references to frequently
 // used functions.
 //
+// This may incur a tradeoff, though, as while it reduces binary size, it may
+// increase gzip size. This might be because the new order has the functions in
+// a less beneficial position for compression, that is, mutually-compressible
+// functions are no longer together (when they were before, in the original order,
+// there was some natural tendency one way or the other). TODO: investigate
+// similarity ordering here.
+//
 
 #include <memory>
@@ -28,10 +35,41 @@ namespace wasm {
 
-struct ReorderFunctions : public WalkerPass<PostWalker<ReorderFunctions>> {
-  std::map<Name, uint32_t> counts;
+typedef std::unordered_map<Name, std::atomic<Index>> NameCountMap;
+
+struct CallCountScanner : public WalkerPass<PostWalker<CallCountScanner>> {
+  bool isFunctionParallel() override { return true; }
+
+  CallCountScanner(NameCountMap* counts) : counts(counts) {}
+
+  CallCountScanner* create() override {
+    return new CallCountScanner(counts);
+  }
+
+  void visitCall(Call* curr) {
+    assert(counts->count(curr->target) > 0); // can't add a new element in parallel
+    (*counts)[curr->target]++;
+  }
 
-  void visitModule(Module *module) {
+private:
+  NameCountMap* counts;
+};
+
+struct ReorderFunctions : public Pass {
+  void run(PassRunner* runner, Module* module) override {
+    NameCountMap counts;
+    // fill in info, as we operate on it in parallel (each function to its own entry)
+    for (auto& func : module->functions) {
+      counts[func->name];
+    }
+    // find counts on function calls
+    {
+      PassRunner runner(module);
+      runner.setIsNested(true);
+      runner.add<CallCountScanner>(&counts);
+      runner.run();
+    }
+    // find counts on global usages
     if (module->start.is()) {
       counts[module->start]++;
     }
@@ -43,19 +81,15 @@ struct ReorderFunctions : public WalkerPass<PostWalker<ReorderFunctions>> {
         counts[curr]++;
       }
     }
-    std::sort(module->functions.begin(), module->functions.end(), [this](
+    // sort
+    std::sort(module->functions.begin(), module->functions.end(), [&counts](
       const std::unique_ptr<Function>& a,
       const std::unique_ptr<Function>& b) -> bool {
-      if (this->counts[a->name] == this->counts[b->name]) {
+      if (counts[a->name] == counts[b->name]) {
         return strcmp(a->name.str, b->name.str) > 0;
       }
-      return this->counts[a->name] > this->counts[b->name];
+      return counts[a->name] > counts[b->name];
     });
-    counts.clear();
-  }
-
-  void visitCall(Call *curr) {
-    counts[curr->target]++;
   }
 };
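The ReorderFunctions rewrite leans on one idiom worth calling out: every key of the shared map is created up front, so the parallel scanners only increment existing atomic counters and never change the map's structure concurrently. A minimal sketch of that pattern, using plain std::thread in place of Binaryen's pass workers:

```cpp
#include <atomic>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>

int main() {
  std::unordered_map<std::string, std::atomic<int>> counts;
  // seed all keys before going parallel; no insert/rehash can occur afterwards
  for (const char* name : {"foo", "bar"}) counts[name];
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; i++) {
    workers.emplace_back([&] {
      counts.at("foo")++; // concurrent increments of an existing entry are safe
    });
  }
  for (auto& t : workers) t.join();
  return counts.at("foo") == 4 ? 0 : 1; // exits 0: all four increments landed
}
```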
diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp
index 88734e85a..11575b1df 100644
--- a/src/passes/pass.cpp
+++ b/src/passes/pass.cpp
@@ -180,11 +180,14 @@ void PassRunner::addDefaultGlobalOptimizationPrePasses() {
 }
 
 void PassRunner::addDefaultGlobalOptimizationPostPasses() {
-  add("duplicate-function-elimination"); // optimizations show more functions as duplicate
-  add("remove-unused-module-elements");
-  if (options.optimizeLevel >= 2 || options.shrinkLevel >= 2) {
+  // inline when working hard, and when not preserving debug info
+  // (inlining+optimizing can remove the annotations)
+  if ((options.optimizeLevel >= 2 || options.shrinkLevel >= 2) &&
+      !options.debugInfo) {
     add("inlining-optimizing");
   }
+  add("duplicate-function-elimination"); // optimizations show more functions as duplicate
+  add("remove-unused-module-elements");
   add("memory-packing");
 }
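The reordering here matters: inlining-optimizing now runs first, so duplicate-function-elimination afterwards can catch functions that only become identical once inlining has happened. A small usage sketch (the wrapper function is hypothetical; PassRunner and PassOptions are real, as used in the diffs above):

```cpp
#include "pass.h"
#include "wasm.h"

using namespace wasm;

// Hypothetical driver: run the global post passes on a finished module.
// With options.debugInfo set, inlining-optimizing is skipped even at -O2,
// since inlining+optimizing can strip the debug annotations.
void runPostPasses(Module& module, PassOptions& options) {
  PassRunner runner(&module, options);
  runner.addDefaultGlobalOptimizationPostPasses();
  runner.run();
}
```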
diff --git a/src/wasm-module-building.h b/src/wasm-module-building.h
index 866019328..990ea5191 100644
--- a/src/wasm-module-building.h
+++ b/src/wasm-module-building.h
@@ -35,9 +35,9 @@ static std::mutex debug;
 // Helps build wasm modules efficiently. If you build a module by
 // adding function by function, and you want to optimize them, this class
 // starts optimizing using worker threads *while you are still adding*.
-// It runs function optimization passes at that time, and then at the end
-// it runs global module-level optimization passes. The result is a fully
-// optimized module, optimized while being generated.
+// It runs function optimization passes at that time. This does not
+// run global optimization after that by default, but you can do that
+// by calling optimizeGlobally().
 //
 // This might also be faster than normal module optimization since it
 // runs all passes on each function, then goes on to the next function
@@ -75,7 +75,6 @@ class OptimizingIncrementalModuleBuilder {
   uint32_t numFunctions;
   PassOptions passOptions;
   std::function<void (PassRunner&)> addPrePasses;
-  std::function<void ()> beforeGlobalOptimizations;
   Function* endMarker;
   std::atomic<Function*>* list;
   uint32_t nextFunction; // only used on main thread
@@ -93,10 +92,9 @@ public:
   //  this bounds helps avoid locking.
   OptimizingIncrementalModuleBuilder(Module* wasm, Index numFunctions, PassOptions passOptions,
                                      std::function<void (PassRunner&)> addPrePasses,
-                                     std::function<void ()> beforeGlobalOptimizations,
                                      bool debug, bool validateGlobally)
     : wasm(wasm), numFunctions(numFunctions), passOptions(passOptions),
-      addPrePasses(addPrePasses), beforeGlobalOptimizations(beforeGlobalOptimizations),
+      addPrePasses(addPrePasses),
       endMarker(nullptr), list(nullptr), nextFunction(0),
       numWorkers(0), liveWorkers(0), activeWorkers(0), availableFuncs(0), finishedFuncs(0),
       finishing(false), debug(debug), validateGlobally(validateGlobally) {
@@ -178,7 +176,6 @@ public:
       wakeAllWorkers();
       waitUntilAllFinished();
     }
-    optimizeGlobally();
     // TODO: clear side thread allocators from module allocator, as these threads were transient
   }
@@ -230,11 +227,9 @@ private:
   }
 
   void optimizeGlobally() {
-    beforeGlobalOptimizations();
     PassRunner passRunner(wasm, passOptions);
     passRunner.addDefaultGlobalOptimizationPostPasses();
     passRunner.run();
-  }
 
   // worker code
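Under the new contract, finish() only drains the function-level work; global optimization is the caller's decision. A sketch of a caller under that contract (makeFunction is a hypothetical generator; the builder API names are taken from the header above):

```cpp
#include "wasm-module-building.h"

using namespace wasm;

Function* makeFunction(Index i); // hypothetical: produces the next function

void buildOptimized(Module& wasm, Index numFunctions, PassOptions passOptions) {
  OptimizingIncrementalModuleBuilder builder(
    &wasm, numFunctions, passOptions,
    [](PassRunner& runner) { /* add pre-passes here */ },
    /* debug= */ false, /* validateGlobally= */ false);
  for (Index i = 0; i < numFunctions; i++) {
    builder.addFunction(makeFunction(i)); // optimized on worker threads as we go
  }
  builder.finish();           // waits for workers; no longer runs global passes
  builder.optimizeGlobally(); // explicit opt-in, per the updated comment above
}
```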
diff --git a/src/wasm.h b/src/wasm.h
index 485afade1..4072533a7 100644
--- a/src/wasm.h
+++ b/src/wasm.h
@@ -784,6 +784,7 @@ public:
   void removeImport(Name name);
   void removeExport(Name name);
   void removeFunction(Name name);
+  void removeFunctionType(Name name);
   // TODO: remove* for other elements
 
   void updateMaps();
diff --git a/src/wasm/wasm.cpp b/src/wasm/wasm.cpp
index 8739d96f7..32a633926 100644
--- a/src/wasm/wasm.cpp
+++ b/src/wasm/wasm.cpp
@@ -747,7 +747,17 @@ void Module::removeFunction(Name name) {
   functionsMap.erase(name);
 }
 
-// TODO: remove* for other elements
+void Module::removeFunctionType(Name name) {
+  for (size_t i = 0; i < functionTypes.size(); i++) {
+    if (functionTypes[i]->name == name) {
+      functionTypes.erase(functionTypes.begin() + i);
+      break;
+    }
+  }
+  functionTypesMap.erase(name);
+}
+
+// TODO: remove* for other elements
 
 void Module::updateMaps() {
   functionsMap.clear();
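To close, a hypothetical helper showing the new Module::removeFunctionType next to the existing removeFunction; the usage scan is an assumption for illustration, not part of the commit:

```cpp
#include "wasm.h"

using namespace wasm;

// Remove a function, and drop its function type as well if no other
// function still refers to that type (imports are ignored here).
void removeFunctionAndMaybeType(Module& module, Name name) {
  Name type = module.getFunction(name)->type;
  module.removeFunction(name);
  for (auto& func : module.functions) {
    if (func->type == type) return; // type still in use, keep it
  }
  module.removeFunctionType(type); // erases from both the vector and the map
}
```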