5 files changed, 290 insertions, 26 deletions
diff --git a/src/asm2wasm.h b/src/asm2wasm.h
index 4231fcc9a..5f6cee558 100644
--- a/src/asm2wasm.h
+++ b/src/asm2wasm.h
@@ -30,7 +30,8 @@
 #include "pass.h"
 #include "ast_utils.h"
 #include "wasm-builder.h"
-#include <wasm-validator.h>
+#include "wasm-validator.h"
+#include "wasm-module-building.h"
 
 namespace wasm {
 
@@ -136,6 +137,8 @@ class Asm2WasmBuilder {
 
   Builder builder;
 
+  std::unique_ptr<OptimizingIncrementalModuleBuilder> optimizingBuilder;
+
   // globals
 
   unsigned nextGlobal; // next place to put a global
@@ -156,6 +159,7 @@ class Asm2WasmBuilder {
   bool memoryGrowth;
   bool debug;
   bool imprecise;
+  bool optimize;
 
 public:
   std::map<IString, MappedGlobal> mappedGlobals;
@@ -267,7 +271,7 @@ private:
   }
 
 public:
- Asm2WasmBuilder(Module& wasm, bool memoryGrowth, bool debug, bool imprecise)
+ Asm2WasmBuilder(Module& wasm, bool memoryGrowth, bool debug, bool imprecise, bool optimize)
      : wasm(wasm),
        allocator(wasm.allocator),
        builder(wasm),
@@ -275,10 +279,10 @@ public:
        maxGlobal(1000),
        memoryGrowth(memoryGrowth),
        debug(debug),
-       imprecise(imprecise) {}
+       imprecise(imprecise),
+       optimize(optimize) {}
 
  void processAsm(Ref ast);
- void optimize();
 
 private:
   AsmType detectAsmType(Ref ast, AsmData *data) {
@@ -523,6 +527,16 @@ void Asm2WasmBuilder::processAsm(Ref ast) {
 
   IString Int8Array, Int16Array, Int32Array, UInt8Array, UInt16Array, UInt32Array, Float32Array, Float64Array;
 
+  // set up optimization
+
+  if (optimize) {
+    Index numFunctions = 0;
+    for (unsigned i = 1; i < body->size(); i++) {
+      if (body[i][0] == DEFUN) numFunctions++;
+    }
+    optimizingBuilder = std::unique_ptr<OptimizingIncrementalModuleBuilder>(new OptimizingIncrementalModuleBuilder(&wasm, numFunctions));
+  }
+
   // first pass - do almost everything, but function imports and indirect calls
 
   for (unsigned i = 1; i < body->size(); i++) {
@@ -655,7 +669,12 @@ void Asm2WasmBuilder::processAsm(Ref ast) {
       }
     } else if (curr[0] == DEFUN) {
       // function
-      wasm.addFunction(processFunction(curr));
+      auto* func = processFunction(curr);
+      if (optimize) {
+        optimizingBuilder->addFunction(func);
+      } else {
+        wasm.addFunction(func);
+      }
     } else if (curr[0] == RETURN) {
       // exports
       Ref object = curr[1];
@@ -690,6 +709,15 @@ void Asm2WasmBuilder::processAsm(Ref ast) {
     }
   }
 
+  if (optimize) {
+    optimizingBuilder->finish();
+    if (maxGlobal < 1024) {
+      PassRunner passRunner(&wasm);
+      passRunner.add("post-emscripten");
+      passRunner.run();
+    }
+  }
+
   // second pass. first, function imports
 
   std::vector<IString> toErase;
@@ -1726,17 +1754,6 @@ Function* Asm2WasmBuilder::processFunction(Ref ast) {
   return function;
 }
 
-void Asm2WasmBuilder::optimize() {
-  PassRunner passRunner(&wasm);
-  passRunner.addDefaultOptimizationPasses();
-  if (maxGlobal < 1024) {
-    passRunner.add("post-emscripten");
-  }
-  passRunner.run();
-
-  assert(WasmValidator().validate(wasm));
-}
-
 } // namespace wasm
 
 #endif // wasm_asm2wasm_h
diff --git a/src/tools/asm2wasm.cpp b/src/tools/asm2wasm.cpp
index e18f78de9..7f1a07d1d 100644
--- a/src/tools/asm2wasm.cpp
+++ b/src/tools/asm2wasm.cpp
@@ -92,14 +92,9 @@ int main(int argc, const char *argv[]) {
   if (options.debug) std::cerr << "wasming..." << std::endl;
   Module wasm;
   wasm.memory.initial = wasm.memory.max = totalMemory / Memory::kPageSize;
-  Asm2WasmBuilder asm2wasm(wasm, pre.memoryGrowth, options.debug, imprecise);
+  Asm2WasmBuilder asm2wasm(wasm, pre.memoryGrowth, options.debug, imprecise, opts);
   asm2wasm.processAsm(asmjs);
 
-  if (opts) {
-    if (options.debug) std::cerr << "optimizing..." << std::endl;
-    asm2wasm.optimize();
-  }
-
   if (options.debug) std::cerr << "printing..." << std::endl;
   Output output(options.extra["output"], Flags::Text, options.debug ? Flags::Debug : Flags::Release);
   WasmPrinter::printModule(&wasm, output.getStream());
diff --git a/src/wasm-module-building.h b/src/wasm-module-building.h
new file mode 100644
index 000000000..3cdebc558
--- /dev/null
+++ b/src/wasm-module-building.h
@@ -0,0 +1,252 @@
+/*
+ * Copyright 2016 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef optimizing_incremental_module_builder_h
+#define optimizing_incremental_module_builder_h
+
+#include <wasm.h>
+#include <support/threads.h>
+
+namespace wasm {
+
+#ifdef BINARYEN_THREAD_DEBUG
+static std::mutex debug;
+#define DEBUG_THREAD(x) { std::lock_guard<std::mutex> lock(debug); std::cerr << "[OptimizingIncrementalModuleBuilder Threading (thread: " << std::this_thread::get_id()  << ")] " << x; std::cerr << '\n'; }
+#else
+#define DEBUG_THREAD(x)
+#endif
+
+//
+// OptimizingIncrementalModuleBuilder
+//
+// Helps build wasm modules efficiently. If you build a module by
+// adding function by function, and you want to optimize them, this class
+// starts optimizing using worker threads *while you are still adding*.
+// It runs function optimization passes at that time, and then at the end
+// it runs global module-level optimization passes. The result is a fully
+// optimized module, optimized while being generated.
+//
+// This might also be faster than normal module optimization since it
+// runs all passes on each function, then goes on to the next function
+// which is better for data locality.
+//
+// Usage: Create an instance, passing it the module and the total
+//        number of functions. Then call addFunction as you have
+//        new functions to add (this also adds it to the module). Finally,
+//        call finish() when all functions have been added.
+//
+// This avoids locking by using atomics. We allocate an array of nullptrs
+// that represent all the functions, and as we add a function, we place it
+// at the next index. Each worker will read from the start to the end,
+// and when a non-nullptr is found, the worker optimizes that function and
+// nulls it. There is also an end marker that is not nullptr nor the address of
+// a valid function, which represents that beyond this point we have not
+// yet filled in. In other words,
+//    * the main thread fills everything with the end marker
+//    * the main thread transforms a marker entry into a function
+//    * workers pause when they see the marker
+//    * workers skip over nullptrs
+//    * workers transform functions into nullptrs, and optimize them
+//    * we keep an atomic count of the number of active workers and
+//      the number of optimized functions.
+//    * after adding a function, the main thread wakes up workers if
+//      it calculates there is work for them.
+//    * a lock is used for going to sleep and waking up.
+// Locking should be rare, as optimization is
+// generally slower than generation; in the optimal case, we never
+// lock beyond the first step, and all further work is lock-free.
+//
+
+class OptimizingIncrementalModuleBuilder {
+  Module* wasm;
+  uint32_t numFunctions;
+  Function* endMarker;
+  std::atomic<Function*>* list;
+  uint32_t nextFunction; // only used on main thread
+  uint32_t numWorkers;
+  std::vector<std::unique_ptr<std::thread>> threads;
+  std::atomic<uint32_t> liveWorkers, activeWorkers, availableFuncs, finishedFuncs;
+  std::mutex mutex;
+  std::condition_variable condition;
+  bool finishing;
+
+public:
+  // numFunctions must be equal to the number of functions allocated, or higher. Knowing
+  // this bounds helps avoid locking.
+  OptimizingIncrementalModuleBuilder(Module* wasm, Index numFunctions) : wasm(wasm), numFunctions(numFunctions), nextFunction(0), finishing(false) {
+    // prepare work list
+    endMarker = new Function();
+    list = new std::atomic<Function*>[numFunctions];
+    for (uint32_t i = 0; i < numFunctions; i++) {
+      list[i].store(endMarker);
+    }
+    // create workers
+    DEBUG_THREAD("creating workers");
+    numWorkers = ThreadPool::getNumCores();
+    assert(numWorkers >= 1);
+    liveWorkers.store(0);
+    activeWorkers.store(0);
+    for (uint32_t i = 0; i < numWorkers; i++) { // TODO: one less, and add it at the very end, to not compete with main thread?
+      createWorker();
+    }
+    waitUntilAllReady();
+    DEBUG_THREAD("workers are ready");
+    // prepare the rest of the initial state
+    availableFuncs.store(0);
+    finishedFuncs.store(0);
+  }
+
+  ~OptimizingIncrementalModuleBuilder() {
+    delete[] list;
+    delete endMarker;
+  }
+
+  // Add a function to the module, and to be optimized
+  void addFunction(Function* func) {
+    wasm->addFunction(func);
+    queueFunction(func);
+    // wake workers if needed
+    auto wake = availableFuncs.load();
+    for (uint32_t i = 0; i < wake; i++) {
+      wakeWorker();
+    }
+  }
+
+  // All functions have been added, block until all are optimized, and then do
+  // global optimizations. When this returns, the module is ready and optimized.
+  void finish() {
+    DEBUG_THREAD("finish()ing");
+    assert(nextFunction == numFunctions);
+    wakeAllWorkers();
+    waitUntilAllFinished();
+    optimizeGlobally();
+    // TODO: clear side thread allocators from module allocator, as these threads were transient
+  }
+
+private:
+  void createWorker() {
+    DEBUG_THREAD("create a worker");
+    threads.emplace_back(std::unique_ptr<std::thread>(new std::thread(workerMain, this)));
+  }
+
+  void wakeWorker() {
+    DEBUG_THREAD("wake a worker");
+    std::lock_guard<std::mutex> lock(mutex);
+    condition.notify_one();
+  }
+
+  void wakeAllWorkers() {
+    DEBUG_THREAD("wake all workers");
+    std::lock_guard<std::mutex> lock(mutex);
+    condition.notify_all();
+  }
+
+  void waitUntilAllReady() {
+    DEBUG_THREAD("wait until all workers are ready");
+    std::unique_lock<std::mutex> lock(mutex);
+    if (liveWorkers.load() < numWorkers) {
+      condition.wait(lock, [this]() { return liveWorkers.load() == numWorkers; });
+    }
+  }
+
+  void waitUntilAllFinished() {
+    DEBUG_THREAD("wait until all workers are finished");
+    {
+      std::unique_lock<std::mutex> lock(mutex);
+      finishing = true;
+      if (liveWorkers.load() > 0) {
+        condition.wait(lock, [this]() { return liveWorkers.load() == 0; });
+      }
+    }
+    DEBUG_THREAD("joining");
+    for (auto& thread : threads) thread->join();
+    DEBUG_THREAD("joined");
+  }
+
+  void queueFunction(Function* func) {
+    DEBUG_THREAD("queue function");
+    assert(nextFunction < numFunctions); // TODO: if we are given more than we expected, use a slower work queue?
+    list[nextFunction++].store(func);
+    availableFuncs++;
+  }
+
+  void optimizeGlobally() {
+    PassRunner passRunner(wasm);
+    passRunner.addDefaultGlobalOptimizationPasses();
+    passRunner.run();
+  }
+
+  // worker code
+
+  void optimizeFunction(Function* func) {
+    PassRunner passRunner(wasm);
+    passRunner.addDefaultFunctionOptimizationPasses();
+    passRunner.runFunction(func);
+  }
+
+  static void workerMain(void* param) {
+    DEBUG_THREAD("workerMain");
+    OptimizingIncrementalModuleBuilder* self = (OptimizingIncrementalModuleBuilder*)param;
+    {
+      std::lock_guard<std::mutex> lock(self->mutex);
+      self->liveWorkers++;
+      self->activeWorkers++;
+      self->condition.notify_all();
+    }
+    for (uint32_t i = 0; i < self->numFunctions; i++) {
+      DEBUG_THREAD("workerMain iteration " << i);
+      if (self->list[i].load() == self->endMarker) {
+        // sleep, this entry isn't ready yet
+        DEBUG_THREAD("workerMain sleep");
+        self->activeWorkers--;
+        {
+          std::unique_lock<std::mutex> lock(self->mutex);
+          if (!self->finishing) { // while waiting for the lock, things may have ended
+            self->condition.wait(lock);
+          }
+        }
+        // continue
+        DEBUG_THREAD("workerMain continue");
+        self->activeWorkers++;
+        i--;
+        continue;
+      }
+      DEBUG_THREAD("workerMain exchange item");
+      auto* func = self->list[i].exchange(nullptr);
+      if (func == nullptr) {
+        DEBUG_THREAD("workerMain sees was already taken");
+        continue; // someone else has taken this one
+      }
+      // we have work to do!
+      DEBUG_THREAD("workerMain work on " << size_t(func));
+      self->availableFuncs--;
+      self->optimizeFunction(func);
+      self->finishedFuncs++;
+    }
+    DEBUG_THREAD("workerMain ready to exit");
+    {
+      std::lock_guard<std::mutex> lock(self->mutex);
+      self->liveWorkers--;
+      self->condition.notify_all();
+    }
+    DEBUG_THREAD("workerMain exiting");
+  }
+};
+
+} // namespace wasm
+
+#endif // optimizing_incremental_module_builder_h
+
diff --git a/test/memorygrowth.fromasm b/test/memorygrowth.fromasm
index f2acc39d2..07909b855 100644
--- a/test/memorygrowth.fromasm
+++ b/test/memorygrowth.fromasm
@@ -10114,9 +10114,9 @@
   (func $ib (result i32)
     (i32.const 0)
   )
-  (func $__growWasmMemory (param $0 i32)
+  (func $__growWasmMemory (param $newSize i32)
     (grow_memory
-      (get_local $0)
+      (get_local $newSize)
     )
   )
 )
diff --git a/test/memorygrowth.fromasm.imprecise b/test/memorygrowth.fromasm.imprecise
index f2acc39d2..07909b855 100644
--- a/test/memorygrowth.fromasm.imprecise
+++ b/test/memorygrowth.fromasm.imprecise
@@ -10114,9 +10114,9 @@
   (func $ib (result i32)
     (i32.const 0)
   )
-  (func $__growWasmMemory (param $0 i32)
+  (func $__growWasmMemory (param $newSize i32)
     (grow_memory
-      (get_local $0)
+      (get_local $newSize)
     )
   )
 )