diff options
author | Alon Zakai <alonzakai@gmail.com> | 2017-08-07 15:43:36 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-08-07 15:43:36 -0700 |
commit | 4f0bf336e0e04ec349c7524d86ffd2c4066cb644 (patch) | |
tree | d50a12f9701f8eb4b5f28770cf43a55877c931a8 /src | |
parent | b93ea39b239052314123d3641df29ff5c5730515 (diff) | |
download | binaryen-4f0bf336e0e04ec349c7524d86ffd2c4066cb644.tar.gz binaryen-4f0bf336e0e04ec349c7524d86ffd2c4066cb644.tar.bz2 binaryen-4f0bf336e0e04ec349c7524d86ffd2c4066cb644.zip |
Improve and enable inlining pass (#966)
* improve inlining pass to inline single-use functions that are fairly small, which makes it useful for removing unnecessary global constructors from clang. add an inlining-optimizing pass that also optimizes where it inlined, as new opportunities arise. enable it by default in O2+
* fix a bug where we didn't run all passes properly - refactor addDefaultGlobalOptimizationPasses() into a pre and post version. we can only run the post version in incremental optimizing builds (functions appear one by one, we optimize them first, and do global stuff when all are done), but can run both when doing a full optimize
* copy the function body when inlining, allowing multiple inlinings of the same function in the future
Diffstat (limited to 'src')
-rw-r--r-- | src/pass.h | 10 | ||||
-rw-r--r-- | src/passes/Inlining.cpp | 173 | ||||
-rw-r--r-- | src/passes/pass.cpp | 16 | ||||
-rw-r--r-- | src/passes/passes.h | 1 | ||||
-rw-r--r-- | src/tools/wasm-merge.cpp | 2 | ||||
-rw-r--r-- | src/wasm-module-building.h | 4 |
6 files changed, 154 insertions, 52 deletions
diff --git a/src/pass.h b/src/pass.h index 21836ebf9..2f3bca078 100644 --- a/src/pass.h +++ b/src/pass.h @@ -109,8 +109,14 @@ struct PassRunner { void addDefaultFunctionOptimizationPasses(); // Adds the default optimization passes that work on - // entire modules as a whole. - void addDefaultGlobalOptimizationPasses(); + // entire modules as a whole, and make sense to + // run before function passes. + void addDefaultGlobalOptimizationPrePasses(); + + // Adds the default optimization passes that work on + // entire modules as a whole, and make sense to + // run after function passes. + void addDefaultGlobalOptimizationPostPasses(); // Run the passes on the module void run(); diff --git a/src/passes/Inlining.cpp b/src/passes/Inlining.cpp index c352a704a..377aa5247 100644 --- a/src/passes/Inlining.cpp +++ b/src/passes/Inlining.cpp @@ -18,45 +18,68 @@ // Inlining. // // For now, this does a conservative inlining of all functions that have -// exactly one use. That should not increase code size, and may have -// speed benefits. +// exactly one use, and are fairly small. That should not increase code +// size, and may have speed benefits. // +#include <atomic> + #include <wasm.h> #include <pass.h> #include <wasm-builder.h> +#include <ast_utils.h> #include <parsing.h> namespace wasm { +// A limit on how big a function to inline. +static const int INLINING_SIZE_LIMIT = 15; + +// We only inline a function with a single use. +static const int SINGLE_USE = 1; + +// A number of uses of a function that is too high for us to +// inline it to all those locations. +static const int TOO_MANY_USES_TO_INLINE = SINGLE_USE + 1; + +// Map of function name => number of uses. We build the values in +// parallel, using atomic increments. 
This is safe because we never +// update the map itself in parallel, we only update the values, +// and so the map never allocates or moves values which could be +// a problem with atomics (in fact it would be a problem in general +// as well, not just with atomics, as we don't use a lock in +// parallel access, we depend on the map itself being constant +// when running multiple threads). +typedef std::map<Name, std::atomic<Index>> NameToAtomicIndexMap; + struct FunctionUseCounter : public WalkerPass<PostWalker<FunctionUseCounter>> { bool isFunctionParallel() override { return true; } - FunctionUseCounter(std::map<Name, Index>* output) : output(output) {} + FunctionUseCounter(NameToAtomicIndexMap* uses) : uses(uses) {} FunctionUseCounter* create() override { - return new FunctionUseCounter(output); + return new FunctionUseCounter(uses); } void visitCall(Call *curr) { - (*output)[curr->target]++; + assert(uses->count(curr->target) > 0); // can't add a new element in parallel + (*uses)[curr->target]++; } private: - std::map<Name, Index>* output; + NameToAtomicIndexMap* uses; }; -struct Action { - Call* call; - Block* block; // the replacement for the call, into which we should inline +struct InliningAction { + Expression** callSite; Function* contents; - Action(Call* call, Block* block, Function* contents) : call(call), block(block), contents(contents) {} + InliningAction(Expression** callSite, Function* contents) : callSite(callSite), contents(contents) {} }; struct InliningState { std::set<Name> canInline; - std::map<Name, std::vector<Action>> actionsForFunction; // function name => actions that can be performed in it + std::map<Name, std::vector<InliningAction>> actionsForFunction; // function name => actions that can be performed in it }; struct Planner : public WalkerPass<PostWalker<Planner>> { @@ -68,12 +91,18 @@ struct Planner : public WalkerPass<PostWalker<Planner>> { return new Planner(state); } - void visitCall(Call *curr) { - if 
(state->canInline.count(curr->target)) { - auto* block = Builder(*getModule()).makeBlock(); - block->type = curr->type; + void visitCall(Call* curr) { + // plan to inline if we know this is valid to inline, and if the call is + // actually performed - if it is dead code, it's pointless to inline + if (state->canInline.count(curr->target) && + curr->type != unreachable) { + // nest the call in a block. that way the location of the pointer to the call will not + // change even if we inline multiple times into the same function, otherwise + // call1(call2()) might be a problem + auto* block = Builder(*getModule()).makeBlock(curr); replaceCurrent(block); - state->actionsForFunction[getFunction()->name].emplace_back(curr, block, getModule()->getFunction(curr->target)); + assert(state->actionsForFunction.count(getFunction()->name) > 0); // can't add a new element in parallel + state->actionsForFunction[getFunction()->name].emplace_back(&block->list[0], getModule()->getFunction(curr->target)); } } @@ -91,13 +120,13 @@ private: // Core inlining logic. Modifies the outside function (adding locals as // needed), and returns the inlined code. -// Since we only inline once, and do not need the function afterwards, we -// can just reuse all the nodes and even avoid copying. 
-static Expression* doInlining(Module* module, Function* into, Action& action) { +static Expression* doInlining(Module* module, Function* into, InliningAction& action) { + auto* call = (*action.callSite)->cast<Call>(); Builder builder(*module); - auto* block = action.block; + auto* block = Builder(*module).makeBlock(); + block->type = call->type; block->name = Name(std::string("__inlined_func$") + action.contents->name.str); - block->type = action.contents->result; + *action.callSite = block; // set up a locals mapping struct Updater : public PostWalker<Updater> { std::map<Index, Index> localMapping; @@ -121,49 +150,59 @@ static Expression* doInlining(Module* module, Function* into, Action& action) { } // assign the operands into the params for (Index i = 0; i < action.contents->params.size(); i++) { - block->list.push_back(builder.makeSetLocal(updater.localMapping[i], action.call->operands[i])); + block->list.push_back(builder.makeSetLocal(updater.localMapping[i], call->operands[i])); } - // update the inlined contents - updater.walk(action.contents->body); - block->list.push_back(action.contents->body); - action.contents->body = builder.makeUnreachable(); // not strictly needed, since it's going away + // generate and update the inlined contents + auto* contents = ExpressionManipulator::copy(action.contents->body, *module); + updater.walk(contents); + block->list.push_back(contents); return block; } struct Inlining : public Pass { + // whether to optimize where we inline + bool optimize = false; + + NameToAtomicIndexMap uses; + void run(PassRunner* runner, Module* module) override { // keep going while we inline, to handle nesting. 
TODO: optimize + calculateUses(module); while (iteration(runner, module)) {} } - bool iteration(PassRunner* runner, Module* module) { - // Count uses - std::map<Name, Index> uses; + void calculateUses(Module* module) { // fill in uses, as we operate on it in parallel (each function to its own entry) for (auto& func : module->functions) { - uses[func->name] = 0; - } - { - PassRunner runner(module); - runner.setIsNested(true); - runner.add<FunctionUseCounter>(&uses); - runner.run(); + uses[func->name].store(0); } + PassRunner runner(module); + runner.setIsNested(true); + runner.add<FunctionUseCounter>(&uses); + runner.run(); + // anything exported or used in a table should not be inlined for (auto& ex : module->exports) { if (ex->kind == ExternalKind::Function) { - uses[ex->value] = 2; // too many, so we ignore it + uses[ex->value].store(TOO_MANY_USES_TO_INLINE); } } for (auto& segment : module->table.segments) { for (auto name : segment.data) { - uses[name]++; + if (module->getFunctionOrNull(name)) { + uses[name].store(TOO_MANY_USES_TO_INLINE); + } } } + } + + bool iteration(PassRunner* runner, Module* module) { // decide which to inline InliningState state; - for (auto iter : uses) { - if (iter.second == 1) { - state.canInline.insert(iter.first); + for (auto& func : module->functions) { + auto name = func->name; + auto numUses = uses[name].load(); + if (canInline(numUses) && worthInlining(module->getFunction(name))) { + state.canInline.insert(name); } } // fill in actionsForFunction, as we operate on it in parallel (each function to its own entry) @@ -182,15 +221,21 @@ struct Inlining : public Pass { std::set<Function*> inlinedInto; for (auto& func : module->functions) { for (auto& action : state.actionsForFunction[func->name]) { + Name inlinedName = action.contents->name; doInlining(module, func.get(), action); - inlined.insert(action.contents->name); + inlined.insert(inlinedName); inlinedInto.insert(func.get()); + uses[inlinedName]--; + 
assert(uses[inlinedName].load() == 0); } } // anything we inlined into may now have non-unique label names, fix it up for (auto func : inlinedInto) { wasm::UniqueNameMapper::uniquify(func->body); } + if (optimize && inlinedInto.size() > 0) { + doOptimize(inlinedInto, module, runner); + } // remove functions that we managed to inline, their one use is gone auto& funcs = module->functions; funcs.erase(std::remove_if(funcs.begin(), funcs.end(), [&inlined](const std::unique_ptr<Function>& curr) { @@ -199,11 +244,55 @@ struct Inlining : public Pass { // return whether we did any work return inlined.size() > 0; } + + bool canInline(int numUses) { + return numUses == SINGLE_USE; + } + + bool worthInlining(Function* func) { + return Measurer::measure(func->body) <= INLINING_SIZE_LIMIT; + } + + // Run useful optimizations after inlining, things like removing + // unnecessary new blocks, sharing variables, etc. + void doOptimize(std::set<Function*>& funcs, Module* module, PassRunner* parentRunner) { + // save the full list of functions on the side + std::vector<std::unique_ptr<Function>> all; + all.swap(module->functions); + module->updateMaps(); + for (auto& func : funcs) { + module->addFunction(func); + } + PassRunner runner(module, parentRunner->options); + runner.setIsNested(true); + runner.setValidateGlobally(false); // not a full valid module + runner.add("remove-unused-brs"); + runner.add("remove-unused-names"); + runner.add("coalesce-locals"); + runner.add("simplify-locals"); + runner.add("vacuum"); + runner.add("reorder-locals"); + runner.add("remove-unused-brs"); + runner.add("merge-blocks"); + runner.run(); + // restore all the funcs + for (auto& func : module->functions) { + func.release(); + } + all.swap(module->functions); + module->updateMaps(); + } }; Pass *createInliningPass() { return new Inlining(); } +Pass *createInliningOptimizingPass() { + auto* ret = new Inlining(); + ret->optimize = true; + return ret; +} + } // namespace wasm diff --git 
a/src/passes/pass.cpp b/src/passes/pass.cpp index fc0a583fb..37eb50a0b 100644 --- a/src/passes/pass.cpp +++ b/src/passes/pass.cpp @@ -73,6 +73,7 @@ void PassRegistry::registerPasses() { registerPass("extract-function", "leaves just one function (useful for debugging)", createExtractFunctionPass); registerPass("flatten-control-flow", "flattens out control flow to be only on blocks, not nested as expressions", createFlattenControlFlowPass); registerPass("inlining", "inlines functions (currently only ones with a single use)", createInliningPass); + registerPass("inlining-optimizing", "inlines functions (currently only ones with a single use) and optimizes where we inlined", createInliningOptimizingPass); registerPass("legalize-js-interface", "legalizes i64 types on the import/export boundary", createLegalizeJSInterfacePass); registerPass("local-cse", "common subexpression elimination inside basic blocks", createLocalCSEPass); registerPass("log-execution", "instrument the build with logging of where execution goes", createLogExecutionPass); @@ -110,11 +111,9 @@ void PassRegistry::registerPasses() { } void PassRunner::addDefaultOptimizationPasses() { - add("duplicate-function-elimination"); + addDefaultGlobalOptimizationPrePasses(); addDefaultFunctionOptimizationPasses(); - add("duplicate-function-elimination"); // optimizations show more functions as duplicate - add("remove-unused-module-elements"); - add("memory-packing"); + addDefaultGlobalOptimizationPostPasses(); } void PassRunner::addDefaultFunctionOptimizationPasses() { @@ -154,9 +153,16 @@ void PassRunner::addDefaultFunctionOptimizationPasses() { add("vacuum"); // just to be safe } -void PassRunner::addDefaultGlobalOptimizationPasses() { +void PassRunner::addDefaultGlobalOptimizationPrePasses() { add("duplicate-function-elimination"); +} + +void PassRunner::addDefaultGlobalOptimizationPostPasses() { + add("duplicate-function-elimination"); // optimizations show more functions as duplicate 
add("remove-unused-module-elements"); + if (options.optimizeLevel >= 2 || options.shrinkLevel >= 2) { + add("inlining-optimizing"); + } add("memory-packing"); } diff --git a/src/passes/passes.h b/src/passes/passes.h index 7a40b7da4..5e7eba540 100644 --- a/src/passes/passes.h +++ b/src/passes/passes.h @@ -32,6 +32,7 @@ Pass *createExtractFunctionPass(); Pass *createFlattenControlFlowPass(); Pass *createFullPrinterPass(); Pass *createInliningPass(); +Pass *createInliningOptimizingPass(); Pass *createLegalizeJSInterfacePass(); Pass *createLocalCSEPass(); Pass *createLogExecutionPass(); diff --git a/src/tools/wasm-merge.cpp b/src/tools/wasm-merge.cpp index fa2b195c9..441cdc0f7 100644 --- a/src/tools/wasm-merge.cpp +++ b/src/tools/wasm-merge.cpp @@ -622,7 +622,7 @@ int main(int argc, const char* argv[]) { PassRunner passRunner(&output); passRunner.add("precompute"); passRunner.add("optimize-instructions"); // things now-constant may be further optimized - passRunner.addDefaultGlobalOptimizationPasses(); + passRunner.addDefaultGlobalOptimizationPostPasses(); passRunner.run(); } diff --git a/src/wasm-module-building.h b/src/wasm-module-building.h index b501a1ab6..c52cea9f9 100644 --- a/src/wasm-module-building.h +++ b/src/wasm-module-building.h @@ -165,7 +165,7 @@ public: } addPrePasses(passRunner); passRunner.addDefaultFunctionOptimizationPasses(); - passRunner.addDefaultGlobalOptimizationPasses(); + passRunner.addDefaultGlobalOptimizationPostPasses(); passRunner.run(); return; } @@ -226,7 +226,7 @@ private: void optimizeGlobally() { PassRunner passRunner(wasm, passOptions); - passRunner.addDefaultGlobalOptimizationPasses(); + passRunner.addDefaultGlobalOptimizationPostPasses(); passRunner.run(); } |