diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/pass.h                 |  10
-rw-r--r-- | src/passes/Inlining.cpp    | 173
-rw-r--r-- | src/passes/pass.cpp        |  16
-rw-r--r-- | src/passes/passes.h        |   1
-rw-r--r-- | src/tools/wasm-merge.cpp   |   2
-rw-r--r-- | src/wasm-module-building.h |   4
6 files changed, 154 insertions, 52 deletions
diff --git a/src/pass.h b/src/pass.h index 21836ebf9..2f3bca078 100644 --- a/src/pass.h +++ b/src/pass.h @@ -109,8 +109,14 @@ struct PassRunner { void addDefaultFunctionOptimizationPasses(); // Adds the default optimization passes that work on - // entire modules as a whole. - void addDefaultGlobalOptimizationPasses(); + // entire modules as a whole, and make sense to + // run before function passes. + void addDefaultGlobalOptimizationPrePasses(); + + // Adds the default optimization passes that work on + // entire modules as a whole, and make sense to + // run after function passes. + void addDefaultGlobalOptimizationPostPasses(); // Run the passes on the module void run(); diff --git a/src/passes/Inlining.cpp b/src/passes/Inlining.cpp index c352a704a..377aa5247 100644 --- a/src/passes/Inlining.cpp +++ b/src/passes/Inlining.cpp @@ -18,45 +18,68 @@ // Inlining. // // For now, this does a conservative inlining of all functions that have -// exactly one use. That should not increase code size, and may have -// speed benefits. +// exactly one use, and are fairly small. That should not increase code +// size, and may have speed benefits. // +#include <atomic> + #include <wasm.h> #include <pass.h> #include <wasm-builder.h> +#include <ast_utils.h> #include <parsing.h> namespace wasm { +// A limit on how big a function to inline. +static const int INLINING_SIZE_LIMIT = 15; + +// We only inline a function with a single use. +static const int SINGLE_USE = 1; + +// A number of uses of a function that is too high for us to +// inline it to all those locations. +static const int TOO_MANY_USES_TO_INLINE = SINGLE_USE + 1; + +// Map of function name => number of uses. We build the values in +// parallel, using atomic increments. 
This is safe because we never +// update the map itself in parallel, we only update the values, +// and so the map never allocates or moves values which could be +// a problem with atomics (in fact it would be a problem in general +// as well, not just with atomics, as we don't use a lock in +// parallel access, we depend on the map itself being constant +// when running multiple threads). +typedef std::map<Name, std::atomic<Index>> NameToAtomicIndexMap; + struct FunctionUseCounter : public WalkerPass<PostWalker<FunctionUseCounter>> { bool isFunctionParallel() override { return true; } - FunctionUseCounter(std::map<Name, Index>* output) : output(output) {} + FunctionUseCounter(NameToAtomicIndexMap* uses) : uses(uses) {} FunctionUseCounter* create() override { - return new FunctionUseCounter(output); + return new FunctionUseCounter(uses); } void visitCall(Call *curr) { - (*output)[curr->target]++; + assert(uses->count(curr->target) > 0); // can't add a new element in parallel + (*uses)[curr->target]++; } private: - std::map<Name, Index>* output; + NameToAtomicIndexMap* uses; }; -struct Action { - Call* call; - Block* block; // the replacement for the call, into which we should inline +struct InliningAction { + Expression** callSite; Function* contents; - Action(Call* call, Block* block, Function* contents) : call(call), block(block), contents(contents) {} + InliningAction(Expression** callSite, Function* contents) : callSite(callSite), contents(contents) {} }; struct InliningState { std::set<Name> canInline; - std::map<Name, std::vector<Action>> actionsForFunction; // function name => actions that can be performed in it + std::map<Name, std::vector<InliningAction>> actionsForFunction; // function name => actions that can be performed in it }; struct Planner : public WalkerPass<PostWalker<Planner>> { @@ -68,12 +91,18 @@ struct Planner : public WalkerPass<PostWalker<Planner>> { return new Planner(state); } - void visitCall(Call *curr) { - if 
(state->canInline.count(curr->target)) { - auto* block = Builder(*getModule()).makeBlock(); - block->type = curr->type; + void visitCall(Call* curr) { + // plan to inline if we know this is valid to inline, and if the call is + // actually performed - if it is dead code, it's pointless to inline + if (state->canInline.count(curr->target) && + curr->type != unreachable) { + // nest the call in a block. that way the location of the pointer to the call will not + // change even if we inline multiple times into the same function, otherwise + // call1(call2()) might be a problem + auto* block = Builder(*getModule()).makeBlock(curr); replaceCurrent(block); - state->actionsForFunction[getFunction()->name].emplace_back(curr, block, getModule()->getFunction(curr->target)); + assert(state->actionsForFunction.count(getFunction()->name) > 0); // can't add a new element in parallel + state->actionsForFunction[getFunction()->name].emplace_back(&block->list[0], getModule()->getFunction(curr->target)); } } @@ -91,13 +120,13 @@ private: // Core inlining logic. Modifies the outside function (adding locals as // needed), and returns the inlined code. -// Since we only inline once, and do not need the function afterwards, we -// can just reuse all the nodes and even avoid copying. 
-static Expression* doInlining(Module* module, Function* into, Action& action) { +static Expression* doInlining(Module* module, Function* into, InliningAction& action) { + auto* call = (*action.callSite)->cast<Call>(); Builder builder(*module); - auto* block = action.block; + auto* block = Builder(*module).makeBlock(); + block->type = call->type; block->name = Name(std::string("__inlined_func$") + action.contents->name.str); - block->type = action.contents->result; + *action.callSite = block; // set up a locals mapping struct Updater : public PostWalker<Updater> { std::map<Index, Index> localMapping; @@ -121,49 +150,59 @@ static Expression* doInlining(Module* module, Function* into, Action& action) { } // assign the operands into the params for (Index i = 0; i < action.contents->params.size(); i++) { - block->list.push_back(builder.makeSetLocal(updater.localMapping[i], action.call->operands[i])); + block->list.push_back(builder.makeSetLocal(updater.localMapping[i], call->operands[i])); } - // update the inlined contents - updater.walk(action.contents->body); - block->list.push_back(action.contents->body); - action.contents->body = builder.makeUnreachable(); // not strictly needed, since it's going away + // generate and update the inlined contents + auto* contents = ExpressionManipulator::copy(action.contents->body, *module); + updater.walk(contents); + block->list.push_back(contents); return block; } struct Inlining : public Pass { + // whether to optimize where we inline + bool optimize = false; + + NameToAtomicIndexMap uses; + void run(PassRunner* runner, Module* module) override { // keep going while we inline, to handle nesting. 
TODO: optimize + calculateUses(module); while (iteration(runner, module)) {} } - bool iteration(PassRunner* runner, Module* module) { - // Count uses - std::map<Name, Index> uses; + void calculateUses(Module* module) { // fill in uses, as we operate on it in parallel (each function to its own entry) for (auto& func : module->functions) { - uses[func->name] = 0; - } - { - PassRunner runner(module); - runner.setIsNested(true); - runner.add<FunctionUseCounter>(&uses); - runner.run(); + uses[func->name].store(0); } + PassRunner runner(module); + runner.setIsNested(true); + runner.add<FunctionUseCounter>(&uses); + runner.run(); + // anything exported or used in a table should not be inlined for (auto& ex : module->exports) { if (ex->kind == ExternalKind::Function) { - uses[ex->value] = 2; // too many, so we ignore it + uses[ex->value].store(TOO_MANY_USES_TO_INLINE); } } for (auto& segment : module->table.segments) { for (auto name : segment.data) { - uses[name]++; + if (module->getFunctionOrNull(name)) { + uses[name].store(TOO_MANY_USES_TO_INLINE); + } } } + } + + bool iteration(PassRunner* runner, Module* module) { // decide which to inline InliningState state; - for (auto iter : uses) { - if (iter.second == 1) { - state.canInline.insert(iter.first); + for (auto& func : module->functions) { + auto name = func->name; + auto numUses = uses[name].load(); + if (canInline(numUses) && worthInlining(module->getFunction(name))) { + state.canInline.insert(name); } } // fill in actionsForFunction, as we operate on it in parallel (each function to its own entry) @@ -182,15 +221,21 @@ struct Inlining : public Pass { std::set<Function*> inlinedInto; for (auto& func : module->functions) { for (auto& action : state.actionsForFunction[func->name]) { + Name inlinedName = action.contents->name; doInlining(module, func.get(), action); - inlined.insert(action.contents->name); + inlined.insert(inlinedName); inlinedInto.insert(func.get()); + uses[inlinedName]--; + 
assert(uses[inlinedName].load() == 0); } } // anything we inlined into may now have non-unique label names, fix it up for (auto func : inlinedInto) { wasm::UniqueNameMapper::uniquify(func->body); } + if (optimize && inlinedInto.size() > 0) { + doOptimize(inlinedInto, module, runner); + } // remove functions that we managed to inline, their one use is gone auto& funcs = module->functions; funcs.erase(std::remove_if(funcs.begin(), funcs.end(), [&inlined](const std::unique_ptr<Function>& curr) { @@ -199,11 +244,55 @@ struct Inlining : public Pass { // return whether we did any work return inlined.size() > 0; } + + bool canInline(int numUses) { + return numUses == SINGLE_USE; + } + + bool worthInlining(Function* func) { + return Measurer::measure(func->body) <= INLINING_SIZE_LIMIT; + } + + // Run useful optimizations after inlining, things like removing + // unnecessary new blocks, sharing variables, etc. + void doOptimize(std::set<Function*>& funcs, Module* module, PassRunner* parentRunner) { + // save the full list of functions on the side + std::vector<std::unique_ptr<Function>> all; + all.swap(module->functions); + module->updateMaps(); + for (auto& func : funcs) { + module->addFunction(func); + } + PassRunner runner(module, parentRunner->options); + runner.setIsNested(true); + runner.setValidateGlobally(false); // not a full valid module + runner.add("remove-unused-brs"); + runner.add("remove-unused-names"); + runner.add("coalesce-locals"); + runner.add("simplify-locals"); + runner.add("vacuum"); + runner.add("reorder-locals"); + runner.add("remove-unused-brs"); + runner.add("merge-blocks"); + runner.run(); + // restore all the funcs + for (auto& func : module->functions) { + func.release(); + } + all.swap(module->functions); + module->updateMaps(); + } }; Pass *createInliningPass() { return new Inlining(); } +Pass *createInliningOptimizingPass() { + auto* ret = new Inlining(); + ret->optimize = true; + return ret; +} + } // namespace wasm diff --git 
a/src/passes/pass.cpp b/src/passes/pass.cpp index fc0a583fb..37eb50a0b 100644 --- a/src/passes/pass.cpp +++ b/src/passes/pass.cpp @@ -73,6 +73,7 @@ void PassRegistry::registerPasses() { registerPass("extract-function", "leaves just one function (useful for debugging)", createExtractFunctionPass); registerPass("flatten-control-flow", "flattens out control flow to be only on blocks, not nested as expressions", createFlattenControlFlowPass); registerPass("inlining", "inlines functions (currently only ones with a single use)", createInliningPass); + registerPass("inlining-optimizing", "inlines functions (currently only ones with a single use) and optimizes where we inlined", createInliningOptimizingPass); registerPass("legalize-js-interface", "legalizes i64 types on the import/export boundary", createLegalizeJSInterfacePass); registerPass("local-cse", "common subexpression elimination inside basic blocks", createLocalCSEPass); registerPass("log-execution", "instrument the build with logging of where execution goes", createLogExecutionPass); @@ -110,11 +111,9 @@ void PassRegistry::registerPasses() { } void PassRunner::addDefaultOptimizationPasses() { - add("duplicate-function-elimination"); + addDefaultGlobalOptimizationPrePasses(); addDefaultFunctionOptimizationPasses(); - add("duplicate-function-elimination"); // optimizations show more functions as duplicate - add("remove-unused-module-elements"); - add("memory-packing"); + addDefaultGlobalOptimizationPostPasses(); } void PassRunner::addDefaultFunctionOptimizationPasses() { @@ -154,9 +153,16 @@ void PassRunner::addDefaultFunctionOptimizationPasses() { add("vacuum"); // just to be safe } -void PassRunner::addDefaultGlobalOptimizationPasses() { +void PassRunner::addDefaultGlobalOptimizationPrePasses() { add("duplicate-function-elimination"); +} + +void PassRunner::addDefaultGlobalOptimizationPostPasses() { + add("duplicate-function-elimination"); // optimizations show more functions as duplicate 
add("remove-unused-module-elements"); + if (options.optimizeLevel >= 2 || options.shrinkLevel >= 2) { + add("inlining-optimizing"); + } add("memory-packing"); } diff --git a/src/passes/passes.h b/src/passes/passes.h index 7a40b7da4..5e7eba540 100644 --- a/src/passes/passes.h +++ b/src/passes/passes.h @@ -32,6 +32,7 @@ Pass *createExtractFunctionPass(); Pass *createFlattenControlFlowPass(); Pass *createFullPrinterPass(); Pass *createInliningPass(); +Pass *createInliningOptimizingPass(); Pass *createLegalizeJSInterfacePass(); Pass *createLocalCSEPass(); Pass *createLogExecutionPass(); diff --git a/src/tools/wasm-merge.cpp b/src/tools/wasm-merge.cpp index fa2b195c9..441cdc0f7 100644 --- a/src/tools/wasm-merge.cpp +++ b/src/tools/wasm-merge.cpp @@ -622,7 +622,7 @@ int main(int argc, const char* argv[]) { PassRunner passRunner(&output); passRunner.add("precompute"); passRunner.add("optimize-instructions"); // things now-constant may be further optimized - passRunner.addDefaultGlobalOptimizationPasses(); + passRunner.addDefaultGlobalOptimizationPostPasses(); passRunner.run(); } diff --git a/src/wasm-module-building.h b/src/wasm-module-building.h index b501a1ab6..c52cea9f9 100644 --- a/src/wasm-module-building.h +++ b/src/wasm-module-building.h @@ -165,7 +165,7 @@ public: } addPrePasses(passRunner); passRunner.addDefaultFunctionOptimizationPasses(); - passRunner.addDefaultGlobalOptimizationPasses(); + passRunner.addDefaultGlobalOptimizationPostPasses(); passRunner.run(); return; } @@ -226,7 +226,7 @@ private: void optimizeGlobally() { PassRunner passRunner(wasm, passOptions); - passRunner.addDefaultGlobalOptimizationPasses(); + passRunner.addDefaultGlobalOptimizationPostPasses(); passRunner.run(); } |