diff options
-rw-r--r-- | src/asm2wasm.h | 49 | ||||
-rw-r--r-- | src/pass.h | 23 | ||||
-rw-r--r-- | src/passes/pass.cpp | 25 | ||||
-rw-r--r-- | src/support/threads.cpp | 14 | ||||
-rw-r--r-- | src/support/threads.h | 3 | ||||
-rw-r--r-- | src/tools/asm2wasm.cpp | 7 | ||||
-rw-r--r-- | src/wasm-module-building.h | 252 | ||||
-rw-r--r-- | src/wasm-traversal.h | 4 | ||||
-rw-r--r-- | test/memorygrowth.fromasm | 4 | ||||
-rw-r--r-- | test/memorygrowth.fromasm.imprecise | 4 |
10 files changed, 352 insertions, 33 deletions
diff --git a/src/asm2wasm.h b/src/asm2wasm.h index 4231fcc9a..5f6cee558 100644 --- a/src/asm2wasm.h +++ b/src/asm2wasm.h @@ -30,7 +30,8 @@ #include "pass.h" #include "ast_utils.h" #include "wasm-builder.h" -#include <wasm-validator.h> +#include "wasm-validator.h" +#include "wasm-module-building.h" namespace wasm { @@ -136,6 +137,8 @@ class Asm2WasmBuilder { Builder builder; + std::unique_ptr<OptimizingIncrementalModuleBuilder> optimizingBuilder; + // globals unsigned nextGlobal; // next place to put a global @@ -156,6 +159,7 @@ class Asm2WasmBuilder { bool memoryGrowth; bool debug; bool imprecise; + bool optimize; public: std::map<IString, MappedGlobal> mappedGlobals; @@ -267,7 +271,7 @@ private: } public: - Asm2WasmBuilder(Module& wasm, bool memoryGrowth, bool debug, bool imprecise) + Asm2WasmBuilder(Module& wasm, bool memoryGrowth, bool debug, bool imprecise, bool optimize) : wasm(wasm), allocator(wasm.allocator), builder(wasm), @@ -275,10 +279,10 @@ public: maxGlobal(1000), memoryGrowth(memoryGrowth), debug(debug), - imprecise(imprecise) {} + imprecise(imprecise), + optimize(optimize) {} void processAsm(Ref ast); - void optimize(); private: AsmType detectAsmType(Ref ast, AsmData *data) { @@ -523,6 +527,16 @@ void Asm2WasmBuilder::processAsm(Ref ast) { IString Int8Array, Int16Array, Int32Array, UInt8Array, UInt16Array, UInt32Array, Float32Array, Float64Array; + // set up optimization + + if (optimize) { + Index numFunctions = 0; + for (unsigned i = 1; i < body->size(); i++) { + if (body[i][0] == DEFUN) numFunctions++; + } + optimizingBuilder = std::unique_ptr<OptimizingIncrementalModuleBuilder>(new OptimizingIncrementalModuleBuilder(&wasm, numFunctions)); + } + // first pass - do almost everything, but function imports and indirect calls for (unsigned i = 1; i < body->size(); i++) { @@ -655,7 +669,12 @@ void Asm2WasmBuilder::processAsm(Ref ast) { } } else if (curr[0] == DEFUN) { // function - wasm.addFunction(processFunction(curr)); + auto* func = processFunction(curr); + if (optimize) { + optimizingBuilder->addFunction(func); + } else { + wasm.addFunction(func); + } } else if (curr[0] == RETURN) { // exports Ref object = curr[1]; @@ -690,6 +709,15 @@ void Asm2WasmBuilder::processAsm(Ref ast) { } } + if (optimize) { + optimizingBuilder->finish(); + if (maxGlobal < 1024) { + PassRunner passRunner(&wasm); + passRunner.add("post-emscripten"); + passRunner.run(); + } + } + // second pass. first, function imports std::vector<IString> toErase; @@ -1726,17 +1754,6 @@ Function* Asm2WasmBuilder::processFunction(Ref ast) { return function; } -void Asm2WasmBuilder::optimize() { - PassRunner passRunner(&wasm); - passRunner.addDefaultOptimizationPasses(); - if (maxGlobal < 1024) { - passRunner.add("post-emscripten"); - } - passRunner.run(); - - assert(WasmValidator().validate(wasm)); -} - } // namespace wasm #endif // wasm_asm2wasm_h diff --git a/src/pass.h b/src/pass.h index e25809faa..fff2b04eb 100644 --- a/src/pass.h +++ b/src/pass.h @@ -97,8 +97,20 @@ struct PassRunner { // what -O does. void addDefaultOptimizationPasses(); + // Adds the default optimization passes that work on + // individual functions. + void addDefaultFunctionOptimizationPasses(); + + // Adds the default optimization passes that work on + // entire modules as a whole. + void addDefaultGlobalOptimizationPasses(); + + // Run the passes on the module void run(); + // Run the passes on a specific function + void runFunction(Function* func); + // Get the last pass that was already executed of a certain type. template<class P> P* getLast(); @@ -112,12 +124,18 @@ struct PassRunner { class Pass { public: virtual ~Pass() {}; + // Override this to perform preparation work before the pass runs. virtual void prepare(PassRunner* runner, Module* module) {} virtual void run(PassRunner* runner, Module* module) = 0; // Override this to perform finalization work after the pass runs. virtual void finalize(PassRunner* runner, Module* module) {} + // Run on a single function. This has no prepare/finalize calls. + virtual void runFunction(PassRunner* runner, Module* module, Function* function) { + WASM_UNREACHABLE(); // by default, passes cannot be run this way + } + std::string name; protected: @@ -138,6 +156,11 @@ public: WalkerType::walkModule(module); finalize(runner, module); } + + void runFunction(PassRunner* runner, Module* module, Function* func) override { + WalkerType::setModule(module); + WalkerType::walkFunction(func); + } }; // Standard passes. All passes in /passes/ are runnable from the shell, diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp index ca9f477a3..e5bbf5866 100644 --- a/src/passes/pass.cpp +++ b/src/passes/pass.cpp @@ -74,6 +74,25 @@ void PassRunner::addDefaultOptimizationPasses() { add("duplicate-function-elimination"); // optimizations show more functions as duplicate } +void PassRunner::addDefaultFunctionOptimizationPasses() { + add("dce"); + add("remove-unused-brs"); + add("remove-unused-names"); + add("optimize-instructions"); + add("simplify-locals"); + add("vacuum"); // previous pass creates garbage + add("coalesce-locals"); + add("vacuum"); // previous pass creates garbage + add("reorder-locals"); + add("merge-blocks"); + add("optimize-instructions"); + add("vacuum"); // should not be needed, last few passes do not create garbage, but just to be safe +} + +void PassRunner::addDefaultGlobalOptimizationPasses() { + add("duplicate-function-elimination"); +} + void PassRunner::run() { std::chrono::high_resolution_clock::time_point beforeEverything; size_t padding = 0; @@ -108,6 +127,12 @@ void PassRunner::run() { } } +void PassRunner::runFunction(Function* func) { + for (auto pass : passes) { + pass->runFunction(this, wasm, func); + } +} + template<class P> P* PassRunner::getLast() { bool found = false; diff --git a/src/support/threads.cpp b/src/support/threads.cpp index 2c2db1997..4830cd092 100644 --- a/src/support/threads.cpp +++ b/src/support/threads.cpp @@ -116,14 +116,18 @@ void ThreadPool::initialize(size_t num) { DEBUG_POOL("initialize() is done\n"); } +size_t ThreadPool::getNumCores() { + size_t num = std::max(1U, std::thread::hardware_concurrency()); + if (getenv("BINARYEN_CORES")) { + num = std::stoi(getenv("BINARYEN_CORES")); + } + return num; +} + ThreadPool* ThreadPool::get() { if (!pool) { - size_t num = std::max(1U, std::thread::hardware_concurrency()); - if (getenv("BINARYEN_CORES")) { - num = std::stoi(getenv("BINARYEN_CORES")); - } pool = std::unique_ptr<ThreadPool>(new ThreadPool()); - pool->initialize(num); + pool->initialize(getNumCores()); } return pool.get(); } diff --git a/src/support/threads.h b/src/support/threads.h index 9a43ac699..ea9e53c13 100644 --- a/src/support/threads.h +++ b/src/support/threads.h @@ -79,6 +79,9 @@ private: void initialize(size_t num); public: + // Get the number of cores we can use. + static size_t getNumCores(); + // Get the singleton threadpool. This can return null // if there is just one thread available. static ThreadPool* get(); diff --git a/src/tools/asm2wasm.cpp b/src/tools/asm2wasm.cpp index e18f78de9..7f1a07d1d 100644 --- a/src/tools/asm2wasm.cpp +++ b/src/tools/asm2wasm.cpp @@ -92,14 +92,9 @@ int main(int argc, const char *argv[]) { if (options.debug) std::cerr << "wasming..." << std::endl; Module wasm; wasm.memory.initial = wasm.memory.max = totalMemory / Memory::kPageSize; - Asm2WasmBuilder asm2wasm(wasm, pre.memoryGrowth, options.debug, imprecise); + Asm2WasmBuilder asm2wasm(wasm, pre.memoryGrowth, options.debug, imprecise, opts); asm2wasm.processAsm(asmjs); - if (opts) { - if (options.debug) std::cerr << "optimizing..." << std::endl; - asm2wasm.optimize(); - } - if (options.debug) std::cerr << "printing..." << std::endl; Output output(options.extra["output"], Flags::Text, options.debug ? Flags::Debug : Flags::Release); WasmPrinter::printModule(&wasm, output.getStream()); diff --git a/src/wasm-module-building.h b/src/wasm-module-building.h new file mode 100644 index 000000000..3cdebc558 --- /dev/null +++ b/src/wasm-module-building.h @@ -0,0 +1,252 @@ +/* + * Copyright 2016 WebAssembly Community Group participants + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef optimizing_incremental_module_builder_h +#define optimizing_incremental_module_builder_h + +#include <wasm.h> +#include <support/threads.h> + +namespace wasm { + +#ifdef BINARYEN_THREAD_DEBUG +static std::mutex debug; +#define DEBUG_THREAD(x) { std::lock_guard<std::mutex> lock(debug); std::cerr << "[OptimizingIncrementalModuleBuilder Threading (thread: " << std::this_thread::get_id() << ")] " << x; std::cerr << '\n'; } +#else +#define DEBUG_THREAD(x) +#endif + +// +// OptimizingIncrementalModuleBuilder +// +// Helps build wasm modules efficiently. If you build a module by +// adding function by function, and you want to optimize them, this class +// starts optimizing using worker threads *while you are still adding*. +// It runs function optimization passes at that time, and then at the end +// it runs global module-level optimization passes. The result is a fully +// optimized module, optimized while being generated. +// +// This might also be faster than normal module optimization since it +// runs all passes on each function, then goes on to the next function +// which is better for data locality. +// +// Usage: Create an instance, passing it the module and the total +// number of functions. Then call addFunction as you have +// new functions to add (this also adds it to the module). Finally, +// call finish() when all functions have been added. +// +// This avoids locking by using atomics. We allocate an array of nullptrs +// that represent all the functions, and as we add a function, we place it +// at the next index. Each worker will read from the start to the end, +// and when a non-nullptr is found, the worker optimizes that function and +// nulls it. There is also an end marker that is not nullptr nor the address of +// a valid function, which represents that beyond this point we have not +// yet filled in. In other words, +// * the main thread fills everything with the end marker +// * the main thread transforms a marker entry into a function +// * workers pause when they see the marker +// * workers skip over nullptrs +// * workers transform functions into nullptrs, and optimize them +// * we keep an atomic count of the number of active workers and +// the number of optimized functions. +// * after adding a function, the main thread wakes up workers if +// it calculates there is work for them. +// * a lock is used for going to sleep and waking up. +// Locking should be rare, as optimization is +// generally slower than generation; in the optimal case, we never +// lock beyond the first step, and all further work is lock-free. +// + +class OptimizingIncrementalModuleBuilder { + Module* wasm; + uint32_t numFunctions; + Function* endMarker; + std::atomic<Function*>* list; + uint32_t nextFunction; // only used on main thread + uint32_t numWorkers; + std::vector<std::unique_ptr<std::thread>> threads; + std::atomic<uint32_t> liveWorkers, activeWorkers, availableFuncs, finishedFuncs; + std::mutex mutex; + std::condition_variable condition; + bool finishing; + +public: + // numFunctions must be equal to the number of functions allocated, or higher. Knowing + // this bounds helps avoid locking. + OptimizingIncrementalModuleBuilder(Module* wasm, Index numFunctions) : wasm(wasm), numFunctions(numFunctions), nextFunction(0), finishing(false) { + // prepare work list + endMarker = new Function(); + list = new std::atomic<Function*>[numFunctions]; + for (uint32_t i = 0; i < numFunctions; i++) { + list[i].store(endMarker); + } + // create workers + DEBUG_THREAD("creating workers"); + numWorkers = ThreadPool::getNumCores(); + assert(numWorkers >= 1); + liveWorkers.store(0); + activeWorkers.store(0); + for (uint32_t i = 0; i < numWorkers; i++) { // TODO: one less, and add it at the very end, to not compete with main thread? + createWorker(); + } + waitUntilAllReady(); + DEBUG_THREAD("workers are ready"); + // prepare the rest of the initial state + availableFuncs.store(0); + finishedFuncs.store(0); + } + + ~OptimizingIncrementalModuleBuilder() { + delete[] list; + delete endMarker; + } + + // Add a function to the module, and to be optimized + void addFunction(Function* func) { + wasm->addFunction(func); + queueFunction(func); + // wake workers if needed + auto wake = availableFuncs.load(); + for (uint32_t i = 0; i < wake; i++) { + wakeWorker(); + } + } + + // All functions have been added, block until all are optimized, and then do + // global optimizations. When this returns, the module is ready and optimized. + void finish() { + DEBUG_THREAD("finish()ing"); + assert(nextFunction == numFunctions); + wakeAllWorkers(); + waitUntilAllFinished(); + optimizeGlobally(); + // TODO: clear side thread allocators from module allocator, as these threads were transient + } + +private: + void createWorker() { + DEBUG_THREAD("create a worker"); + threads.emplace_back(std::unique_ptr<std::thread>(new std::thread(workerMain, this))); + } + + void wakeWorker() { + DEBUG_THREAD("wake a worker"); + std::lock_guard<std::mutex> lock(mutex); + condition.notify_one(); + } + + void wakeAllWorkers() { + DEBUG_THREAD("wake all workers"); + std::lock_guard<std::mutex> lock(mutex); + condition.notify_all(); + } + + void waitUntilAllReady() { + DEBUG_THREAD("wait until all workers are ready"); + std::unique_lock<std::mutex> lock(mutex); + if (liveWorkers.load() < numWorkers) { + condition.wait(lock, [this]() { return liveWorkers.load() == numWorkers; }); + } + } + + void waitUntilAllFinished() { + DEBUG_THREAD("wait until all workers are finished"); + { + std::unique_lock<std::mutex> lock(mutex); + finishing = true; + if (liveWorkers.load() > 0) { + condition.wait(lock, [this]() { return liveWorkers.load() == 0; }); + } + } + DEBUG_THREAD("joining"); + for (auto& thread : threads) thread->join(); + DEBUG_THREAD("joined"); + } + + void queueFunction(Function* func) { + DEBUG_THREAD("queue function"); + assert(nextFunction < numFunctions); // TODO: if we are given more than we expected, use a slower work queue? + list[nextFunction++].store(func); + availableFuncs++; + } + + void optimizeGlobally() { + PassRunner passRunner(wasm); + passRunner.addDefaultGlobalOptimizationPasses(); + passRunner.run(); + } + + // worker code + + void optimizeFunction(Function* func) { + PassRunner passRunner(wasm); + passRunner.addDefaultFunctionOptimizationPasses(); + passRunner.runFunction(func); + } + + static void workerMain(void* param) { + DEBUG_THREAD("workerMain"); + OptimizingIncrementalModuleBuilder* self = (OptimizingIncrementalModuleBuilder*)param; + { + std::lock_guard<std::mutex> lock(self->mutex); + self->liveWorkers++; + self->activeWorkers++; + self->condition.notify_all(); + } + for (uint32_t i = 0; i < self->numFunctions; i++) { + DEBUG_THREAD("workerMain iteration " << i); + if (self->list[i].load() == self->endMarker) { + // sleep, this entry isn't ready yet + DEBUG_THREAD("workerMain sleep"); + self->activeWorkers--; + { + std::unique_lock<std::mutex> lock(self->mutex); + if (!self->finishing) { // while waiting for the lock, things may have ended + self->condition.wait(lock); + } + } + // continue + DEBUG_THREAD("workerMain continue"); + self->activeWorkers++; + i--; + continue; + } + DEBUG_THREAD("workerMain exchange item"); + auto* func = self->list[i].exchange(nullptr); + if (func == nullptr) { + DEBUG_THREAD("workerMain sees was already taken"); + continue; // someone else has taken this one + } + // we have work to do! + DEBUG_THREAD("workerMain work on " << size_t(func)); + self->availableFuncs--; + self->optimizeFunction(func); + self->finishedFuncs++; + } + DEBUG_THREAD("workerMain ready to exit"); + { + std::lock_guard<std::mutex> lock(self->mutex); + self->liveWorkers--; + self->condition.notify_all(); + } + DEBUG_THREAD("workerMain exiting"); + } +}; + +} // namespace wasm + +#endif // optimizing_incremental_module_builder_h + diff --git a/src/wasm-traversal.h b/src/wasm-traversal.h index 9ed02027e..e5da420b0 100644 --- a/src/wasm-traversal.h +++ b/src/wasm-traversal.h @@ -187,6 +187,7 @@ struct Walker : public VisitorType { void walkFunction(Function* func) { setFunction(func); static_cast<SubType*>(this)->doWalkFunction(func); + static_cast<SubType*>(this)->visitFunction(func); } // override this to provide custom functionality @@ -197,6 +198,7 @@ struct Walker : public VisitorType { void walkModule(Module *module) { setModule(module); static_cast<SubType*>(this)->doWalkModule(module); + static_cast<SubType*>(this)->visitModule(module); } // override this to provide custom functionality @@ -223,7 +225,6 @@ struct Walker : public VisitorType { allocated = std::unique_ptr<SubType>(instance); } instance->walkFunction(func); - instance->visitFunction(func); }; // if this is not a function-parallel traversal, run @@ -259,7 +260,6 @@ struct Walker : public VisitorType { } self->visitTable(&module->table); self->visitMemory(&module->memory); - self->visitModule(module); } // Walk implementation. We don't use recursion as ASTs may be highly diff --git a/test/memorygrowth.fromasm b/test/memorygrowth.fromasm index f2acc39d2..07909b855 100644 --- a/test/memorygrowth.fromasm +++ b/test/memorygrowth.fromasm @@ -10114,9 +10114,9 @@ (func $ib (result i32) (i32.const 0) ) - (func $__growWasmMemory (param $0 i32) + (func $__growWasmMemory (param $newSize i32) (grow_memory - (get_local $0) + (get_local $newSize) ) ) ) diff --git a/test/memorygrowth.fromasm.imprecise b/test/memorygrowth.fromasm.imprecise index f2acc39d2..07909b855 100644 --- a/test/memorygrowth.fromasm.imprecise +++ b/test/memorygrowth.fromasm.imprecise @@ -10114,9 +10114,9 @@ (func $ib (result i32) (i32.const 0) ) - (func $__growWasmMemory (param $0 i32) + (func $__growWasmMemory (param $newSize i32) (grow_memory - (get_local $0) + (get_local $newSize) ) ) ) |