10 files changed, 352 insertions, 33 deletions
diff --git a/src/asm2wasm.h b/src/asm2wasm.h
index 4231fcc9a..5f6cee558 100644
--- a/src/asm2wasm.h
+++ b/src/asm2wasm.h
@@ -30,7 +30,8 @@
 #include "pass.h"
 #include "ast_utils.h"
 #include "wasm-builder.h"
-#include <wasm-validator.h>
+#include "wasm-validator.h"
+#include "wasm-module-building.h"
 
 namespace wasm {
 
@@ -136,6 +137,8 @@ class Asm2WasmBuilder {
 
   Builder builder;
 
+  std::unique_ptr<OptimizingIncrementalModuleBuilder> optimizingBuilder;
+
   // globals
 
   unsigned nextGlobal; // next place to put a global
@@ -156,6 +159,7 @@ class Asm2WasmBuilder {
   bool memoryGrowth;
   bool debug;
   bool imprecise;
+  bool optimize;
 
 public:
   std::map<IString, MappedGlobal> mappedGlobals;
@@ -267,7 +271,7 @@ private:
   }
 
 public:
- Asm2WasmBuilder(Module& wasm, bool memoryGrowth, bool debug, bool imprecise)
+ Asm2WasmBuilder(Module& wasm, bool memoryGrowth, bool debug, bool imprecise, bool optimize)
      : wasm(wasm),
        allocator(wasm.allocator),
        builder(wasm),
@@ -275,10 +279,10 @@ public:
        maxGlobal(1000),
        memoryGrowth(memoryGrowth),
        debug(debug),
-       imprecise(imprecise) {}
+       imprecise(imprecise),
+       optimize(optimize) {}
 
  void processAsm(Ref ast);
- void optimize();
 
 private:
   AsmType detectAsmType(Ref ast, AsmData *data) {
@@ -523,6 +527,16 @@ void Asm2WasmBuilder::processAsm(Ref ast) {
 
   IString Int8Array, Int16Array, Int32Array, UInt8Array, UInt16Array, UInt32Array, Float32Array, Float64Array;
 
+  // set up optimization
+
+  if (optimize) {
+    Index numFunctions = 0;
+    for (unsigned i = 1; i < body->size(); i++) {
+      if (body[i][0] == DEFUN) numFunctions++;
+    }
+    optimizingBuilder = std::unique_ptr<OptimizingIncrementalModuleBuilder>(new OptimizingIncrementalModuleBuilder(&wasm, numFunctions));
+  }
+
   // first pass - do almost everything, but function imports and indirect calls
 
   for (unsigned i = 1; i < body->size(); i++) {
@@ -655,7 +669,12 @@ void Asm2WasmBuilder::processAsm(Ref ast) {
       }
     } else if (curr[0] == DEFUN) {
       // function
-      wasm.addFunction(processFunction(curr));
+      auto* func = processFunction(curr);
+      if (optimize) {
+        optimizingBuilder->addFunction(func);
+      } else {
+        wasm.addFunction(func);
+      }
     } else if (curr[0] == RETURN) {
       // exports
       Ref object = curr[1];
@@ -690,6 +709,15 @@ void Asm2WasmBuilder::processAsm(Ref ast) {
     }
   }
 
+  if (optimize) {
+    optimizingBuilder->finish();
+    if (maxGlobal < 1024) {
+      PassRunner passRunner(&wasm);
+      passRunner.add("post-emscripten");
+      passRunner.run();
+    }
+  }
+
   // second pass. first, function imports
 
   std::vector<IString> toErase;
@@ -1726,17 +1754,6 @@ Function* Asm2WasmBuilder::processFunction(Ref ast) {
   return function;
 }
 
-void Asm2WasmBuilder::optimize() {
-  PassRunner passRunner(&wasm);
-  passRunner.addDefaultOptimizationPasses();
-  if (maxGlobal < 1024) {
-    passRunner.add("post-emscripten");
-  }
-  passRunner.run();
-
-  assert(WasmValidator().validate(wasm));
-}
-
 } // namespace wasm
 
 #endif // wasm_asm2wasm_h
diff --git a/src/pass.h b/src/pass.h
index e25809faa..fff2b04eb 100644
--- a/src/pass.h
+++ b/src/pass.h
@@ -97,8 +97,20 @@ struct PassRunner {
   // what -O does.
   void addDefaultOptimizationPasses();
 
+  // Adds the default optimization passes that work on
+  // individual functions.
+  void addDefaultFunctionOptimizationPasses();
+
+  // Adds the default optimization passes that work on
+  // entire modules as a whole.
+  void addDefaultGlobalOptimizationPasses();
+
+  // Run the passes on the module
   void run();
 
+  // Run the passes on a specific function
+  void runFunction(Function* func);
+
   // Get the last pass that was already executed of a certain type.
   template<class P>
   P* getLast();
@@ -112,12 +124,18 @@ struct PassRunner {
 class Pass {
 public:
   virtual ~Pass() {};
+
   // Override this to perform preparation work before the pass runs.
   virtual void prepare(PassRunner* runner, Module* module) {}
   virtual void run(PassRunner* runner, Module* module) = 0;
   // Override this to perform finalization work after the pass runs.
   virtual void finalize(PassRunner* runner, Module* module) {}
 
+  // Run on a single function. This has no prepare/finalize calls.
+  virtual void runFunction(PassRunner* runner, Module* module, Function* function) {
+    WASM_UNREACHABLE(); // by default, passes cannot be run this way
+  }
+
   std::string name;
 
 protected:
@@ -138,6 +156,11 @@ public:
     WalkerType::walkModule(module);
     finalize(runner, module);
   }
+
+  void runFunction(PassRunner* runner, Module* module, Function* func) override {
+    WalkerType::setModule(module);
+    WalkerType::walkFunction(func);
+  }
 };
 
 // Standard passes. All passes in /passes/ are runnable from the shell,
diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp
index ca9f477a3..e5bbf5866 100644
--- a/src/passes/pass.cpp
+++ b/src/passes/pass.cpp
@@ -74,6 +74,25 @@ void PassRunner::addDefaultOptimizationPasses() {
   add("duplicate-function-elimination"); // optimizations show more functions as duplicate
 }
 
+void PassRunner::addDefaultFunctionOptimizationPasses() {
+  add("dce");
+  add("remove-unused-brs");
+  add("remove-unused-names");
+  add("optimize-instructions");
+  add("simplify-locals");
+  add("vacuum"); // previous pass creates garbage
+  add("coalesce-locals");
+  add("vacuum"); // previous pass creates garbage
+  add("reorder-locals");
+  add("merge-blocks");
+  add("optimize-instructions");
+  add("vacuum"); // should not be needed, last few passes do not create garbage, but just to be safe
+}
+
+void PassRunner::addDefaultGlobalOptimizationPasses() {
+  add("duplicate-function-elimination");
+}
+
 void PassRunner::run() {
   std::chrono::high_resolution_clock::time_point beforeEverything;
   size_t padding = 0;
@@ -108,6 +127,12 @@ void PassRunner::run() {
   }
 }
 
+void PassRunner::runFunction(Function* func) {
+  for (auto pass : passes) {
+    pass->runFunction(this, wasm, func);
+  }
+}
+
 template<class P>
 P* PassRunner::getLast() {
   bool found = false;
diff --git a/src/support/threads.cpp b/src/support/threads.cpp
index 2c2db1997..4830cd092 100644
--- a/src/support/threads.cpp
+++ b/src/support/threads.cpp
@@ -116,14 +116,18 @@ void ThreadPool::initialize(size_t num) {
   DEBUG_POOL("initialize() is done\n");
 }
 
+size_t ThreadPool::getNumCores() {
+  size_t num = std::max(1U, std::thread::hardware_concurrency());
+  if (getenv("BINARYEN_CORES")) {
+    num = std::stoi(getenv("BINARYEN_CORES"));
+  }
+  return num;
+}
+
 ThreadPool* ThreadPool::get() {
   if (!pool) {
-    size_t num = std::max(1U, std::thread::hardware_concurrency());
-    if (getenv("BINARYEN_CORES")) {
-      num = std::stoi(getenv("BINARYEN_CORES"));
-    }
     pool = std::unique_ptr<ThreadPool>(new ThreadPool());
-    pool->initialize(num);
+    pool->initialize(getNumCores());
   }
   return pool.get();
 }
diff --git a/src/support/threads.h b/src/support/threads.h
index 9a43ac699..ea9e53c13 100644
--- a/src/support/threads.h
+++ b/src/support/threads.h
@@ -79,6 +79,9 @@ private:
   void initialize(size_t num);
 
 public:
+  // Get the number of cores we can use.
+  static size_t getNumCores();
+
   // Get the singleton threadpool. This can return null
   // if there is just one thread available.
   static ThreadPool* get();
diff --git a/src/tools/asm2wasm.cpp b/src/tools/asm2wasm.cpp
index e18f78de9..7f1a07d1d 100644
--- a/src/tools/asm2wasm.cpp
+++ b/src/tools/asm2wasm.cpp
@@ -92,14 +92,9 @@ int main(int argc, const char *argv[]) {
   if (options.debug) std::cerr << "wasming..." << std::endl;
   Module wasm;
   wasm.memory.initial = wasm.memory.max = totalMemory / Memory::kPageSize;
-  Asm2WasmBuilder asm2wasm(wasm, pre.memoryGrowth, options.debug, imprecise);
+  Asm2WasmBuilder asm2wasm(wasm, pre.memoryGrowth, options.debug, imprecise, opts);
   asm2wasm.processAsm(asmjs);
 
-  if (opts) {
-    if (options.debug) std::cerr << "optimizing..." << std::endl;
-    asm2wasm.optimize();
-  }
-
   if (options.debug) std::cerr << "printing..." << std::endl;
   Output output(options.extra["output"], Flags::Text, options.debug ? Flags::Debug : Flags::Release);
   WasmPrinter::printModule(&wasm, output.getStream());
diff --git a/src/wasm-module-building.h b/src/wasm-module-building.h
new file mode 100644
index 000000000..3cdebc558
--- /dev/null
+++ b/src/wasm-module-building.h
@@ -0,0 +1,252 @@
+/*
+ * Copyright 2016 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef optimizing_incremental_module_builder_h
+#define optimizing_incremental_module_builder_h
+
+#include <wasm.h>
+#include <support/threads.h>
+
+namespace wasm {
+
+#ifdef BINARYEN_THREAD_DEBUG
+static std::mutex debug;
+#define DEBUG_THREAD(x) { std::lock_guard<std::mutex> lock(debug); std::cerr << "[OptimizingIncrementalModuleBuilder Threading (thread: " << std::this_thread::get_id()  << ")] " << x; std::cerr << '\n'; }
+#else
+#define DEBUG_THREAD(x)
+#endif
+
+//
+// OptimizingIncrementalModuleBuilder
+//
+// Helps build wasm modules efficiently. If you build a module by
+// adding function by function, and you want to optimize them, this class
+// starts optimizing using worker threads *while you are still adding*.
+// It runs function optimization passes at that time, and then at the end
+// it runs global module-level optimization passes. The result is a fully
+// optimized module, optimized while being generated.
+//
+// This might also be faster than normal module optimization since it
+// runs all passes on each function, then goes on to the next function
+// which is better for data locality.
+//
+// Usage: Create an instance, passing it the module and the total
+//        number of functions. Then call addFunction as you have
+//        new functions to add (this also adds it to the module). Finally,
+//        call finish() when all functions have been added.
+//
+// This avoids locking by using atomics. We allocate an array of nullptrs
+// that represent all the functions, and as we add a function, we place it
+// at the next index. Each worker will read from the start to the end,
+// and when a non-nullptr is found, the worker optimizes that function and
+// nulls it. There is also an end marker that is not nullptr nor the address of
+// a valid function, which represents that beyond this point we have not
+// yet filled in. In other words,
+//    * the main thread fills everything with the end marker
+//    * the main thread transforms a marker entry into a function
+//    * workers pause when they see the marker
+//    * workers skip over nullptrs
+//    * workers transform functions into nullptrs, and optimize them
+//    * we keep an atomic count of the number of active workers and
+//      the number of optimized functions.
+//    * after adding a function, the main thread wakes up workers if
+//      it calculates there is work for them.
+//    * a lock is used for going to sleep and waking up.
+// Locking should be rare, as optimization is
+// generally slower than generation; in the optimal case, we never
+// lock beyond the first step, and all further work is lock-free.
+//
+
+class OptimizingIncrementalModuleBuilder {
+  Module* wasm;
+  uint32_t numFunctions;
+  Function* endMarker;
+  std::atomic<Function*>* list;
+  uint32_t nextFunction; // only used on main thread
+  uint32_t numWorkers;
+  std::vector<std::unique_ptr<std::thread>> threads;
+  std::atomic<uint32_t> liveWorkers, activeWorkers, availableFuncs, finishedFuncs;
+  std::mutex mutex;
+  std::condition_variable condition;
+  bool finishing;
+
+public:
+  // numFunctions must be equal to the number of functions allocated, or higher. Knowing
+  // this bounds helps avoid locking.
+  OptimizingIncrementalModuleBuilder(Module* wasm, Index numFunctions) : wasm(wasm), numFunctions(numFunctions), nextFunction(0), finishing(false) {
+    // prepare work list
+    endMarker = new Function();
+    list = new std::atomic<Function*>[numFunctions];
+    for (uint32_t i = 0; i < numFunctions; i++) {
+      list[i].store(endMarker);
+    }
+    // create workers
+    DEBUG_THREAD("creating workers");
+    numWorkers = ThreadPool::getNumCores();
+    assert(numWorkers >= 1);
+    liveWorkers.store(0);
+    activeWorkers.store(0);
+    for (uint32_t i = 0; i < numWorkers; i++) { // TODO: one less, and add it at the very end, to not compete with main thread?
+      createWorker();
+    }
+    waitUntilAllReady();
+    DEBUG_THREAD("workers are ready");
+    // prepare the rest of the initial state
+    availableFuncs.store(0);
+    finishedFuncs.store(0);
+  }
+
+  ~OptimizingIncrementalModuleBuilder() {
+    delete[] list;
+    delete endMarker;
+  }
+
+  // Add a function to the module, and to be optimized
+  void addFunction(Function* func) {
+    wasm->addFunction(func);
+    queueFunction(func);
+    // wake workers if needed
+    auto wake = availableFuncs.load();
+    for (uint32_t i = 0; i < wake; i++) {
+      wakeWorker();
+    }
+  }
+
+  // All functions have been added, block until all are optimized, and then do
+  // global optimizations. When this returns, the module is ready and optimized.
+  void finish() {
+    DEBUG_THREAD("finish()ing");
+    assert(nextFunction == numFunctions);
+    wakeAllWorkers();
+    waitUntilAllFinished();
+    optimizeGlobally();
+    // TODO: clear side thread allocators from module allocator, as these threads were transient
+  }
+
+private:
+  void createWorker() {
+    DEBUG_THREAD("create a worker");
+    threads.emplace_back(std::unique_ptr<std::thread>(new std::thread(workerMain, this)));
+  }
+
+  void wakeWorker() {
+    DEBUG_THREAD("wake a worker");
+    std::lock_guard<std::mutex> lock(mutex);
+    condition.notify_one();
+  }
+
+  void wakeAllWorkers() {
+    DEBUG_THREAD("wake all workers");
+    std::lock_guard<std::mutex> lock(mutex);
+    condition.notify_all();
+  }
+
+  void waitUntilAllReady() {
+    DEBUG_THREAD("wait until all workers are ready");
+    std::unique_lock<std::mutex> lock(mutex);
+    if (liveWorkers.load() < numWorkers) {
+      condition.wait(lock, [this]() { return liveWorkers.load() == numWorkers; });
+    }
+  }
+
+  void waitUntilAllFinished() {
+    DEBUG_THREAD("wait until all workers are finished");
+    {
+      std::unique_lock<std::mutex> lock(mutex);
+      finishing = true;
+      if (liveWorkers.load() > 0) {
+        condition.wait(lock, [this]() { return liveWorkers.load() == 0; });
+      }
+    }
+    DEBUG_THREAD("joining");
+    for (auto& thread : threads) thread->join();
+    DEBUG_THREAD("joined");
+  }
+
+  void queueFunction(Function* func) {
+    DEBUG_THREAD("queue function");
+    assert(nextFunction < numFunctions); // TODO: if we are given more than we expected, use a slower work queue?
+    list[nextFunction++].store(func);
+    availableFuncs++;
+  }
+
+  void optimizeGlobally() {
+    PassRunner passRunner(wasm);
+    passRunner.addDefaultGlobalOptimizationPasses();
+    passRunner.run();
+  }
+
+  // worker code
+
+  void optimizeFunction(Function* func) {
+    PassRunner passRunner(wasm);
+    passRunner.addDefaultFunctionOptimizationPasses();
+    passRunner.runFunction(func);
+  }
+
+  static void workerMain(void* param) {
+    DEBUG_THREAD("workerMain");
+    OptimizingIncrementalModuleBuilder* self = (OptimizingIncrementalModuleBuilder*)param;
+    {
+      std::lock_guard<std::mutex> lock(self->mutex);
+      self->liveWorkers++;
+      self->activeWorkers++;
+      self->condition.notify_all();
+    }
+    for (uint32_t i = 0; i < self->numFunctions; i++) {
+      DEBUG_THREAD("workerMain iteration " << i);
+      if (self->list[i].load() == self->endMarker) {
+        // sleep, this entry isn't ready yet
+        DEBUG_THREAD("workerMain sleep");
+        self->activeWorkers--;
+        {
+          std::unique_lock<std::mutex> lock(self->mutex);
+          if (!self->finishing) { // while waiting for the lock, things may have ended
+            self->condition.wait(lock);
+          }
+        }
+        // continue
+        DEBUG_THREAD("workerMain continue");
+        self->activeWorkers++;
+        i--;
+        continue;
+      }
+      DEBUG_THREAD("workerMain exchange item");
+      auto* func = self->list[i].exchange(nullptr);
+      if (func == nullptr) {
+        DEBUG_THREAD("workerMain sees was already taken");
+        continue; // someone else has taken this one
+      }
+      // we have work to do!
+      DEBUG_THREAD("workerMain work on " << size_t(func));
+      self->availableFuncs--;
+      self->optimizeFunction(func);
+      self->finishedFuncs++;
+    }
+    DEBUG_THREAD("workerMain ready to exit");
+    {
+      std::lock_guard<std::mutex> lock(self->mutex);
+      self->liveWorkers--;
+      self->condition.notify_all();
+    }
+    DEBUG_THREAD("workerMain exiting");
+  }
+};
+
+} // namespace wasm
+
+#endif // optimizing_incremental_module_builder_h
+
diff --git a/src/wasm-traversal.h b/src/wasm-traversal.h
index 9ed02027e..e5da420b0 100644
--- a/src/wasm-traversal.h
+++ b/src/wasm-traversal.h
@@ -187,6 +187,7 @@ struct Walker : public VisitorType {
   void walkFunction(Function* func) {
     setFunction(func);
     static_cast<SubType*>(this)->doWalkFunction(func);
+    static_cast<SubType*>(this)->visitFunction(func);
   }
 
   // override this to provide custom functionality
@@ -197,6 +198,7 @@ struct Walker : public VisitorType {
   void walkModule(Module *module) {
     setModule(module);
     static_cast<SubType*>(this)->doWalkModule(module);
+    static_cast<SubType*>(this)->visitModule(module);
   }
 
   // override this to provide custom functionality
@@ -223,7 +225,6 @@ struct Walker : public VisitorType {
         allocated = std::unique_ptr<SubType>(instance);
       }
       instance->walkFunction(func);
-      instance->visitFunction(func);
     };
 
     // if this is not a function-parallel traversal, run
@@ -259,7 +260,6 @@ struct Walker : public VisitorType {
     }
     self->visitTable(&module->table);
     self->visitMemory(&module->memory);
-    self->visitModule(module);
   }
 
   // Walk implementation. We don't use recursion as ASTs may be highly
diff --git a/test/memorygrowth.fromasm b/test/memorygrowth.fromasm
index f2acc39d2..07909b855 100644
--- a/test/memorygrowth.fromasm
+++ b/test/memorygrowth.fromasm
@@ -10114,9 +10114,9 @@
   (func $ib (result i32)
     (i32.const 0)
   )
-  (func $__growWasmMemory (param $0 i32)
+  (func $__growWasmMemory (param $newSize i32)
     (grow_memory
-      (get_local $0)
+      (get_local $newSize)
     )
   )
 )
diff --git a/test/memorygrowth.fromasm.imprecise b/test/memorygrowth.fromasm.imprecise
index f2acc39d2..07909b855 100644
--- a/test/memorygrowth.fromasm.imprecise
+++ b/test/memorygrowth.fromasm.imprecise
@@ -10114,9 +10114,9 @@
   (func $ib (result i32)
     (i32.const 0)
   )
-  (func $__growWasmMemory (param $0 i32)
+  (func $__growWasmMemory (param $newSize i32)
     (grow_memory
-      (get_local $0)
+      (get_local $newSize)
     )
   )
 )