From cfeddd741f6503457c6c4a5e1741d91eb0e7e08d Mon Sep 17 00:00:00 2001
From: Alon Zakai
Date: Tue, 22 Aug 2017 10:27:52 -0700
Subject: Inline many (#1125)

* Improve the inlining pass to inline single-use functions that are fairly
  small, which makes it useful for removing unnecessary global constructors
  from clang.

* Add an inlining-optimizing pass that also optimizes where it inlined, as
  new opportunities arise. Enable it by default in -O2+.

* In addition, in -O3+ also inline small functions with multiple uses. This
  helps a lot with things like safe-int-divide functions (where each int
  divide is replaced by a safe divide that won't trap). Inlining gets rid of
  around half of the overhead there.
---
 src/passes/Inlining.cpp | 156 +++++++++++++++++++++++++++++-------------------
 1 file changed, 94 insertions(+), 62 deletions(-)

(limited to 'src/passes/Inlining.cpp')

diff --git a/src/passes/Inlining.cpp b/src/passes/Inlining.cpp
index 20becfe46..710fc00ff 100644
--- a/src/passes/Inlining.cpp
+++ b/src/passes/Inlining.cpp
@@ -17,10 +17,15 @@
 //
 // Inlining.
 //
-// For now, this does a conservative inlining of all functions that have
+// By default, this does a conservative inlining of all functions that have
 // exactly one use, and are fairly small. That should not increase code
 // size, and may have speed benefits.
 //
+// When the opt level is 3+ (-O3 or above), we also more aggressively inline
+// functions with more than one use that seem to be "lightweight" (no loops
+// or calls, etc.), so inlining them may get rid of call overhead that would
+// otherwise be noticeable.
+//
 
 #include <atomic>
 
@@ -33,42 +38,66 @@
 namespace wasm {
 
-// A limit on how big a function to inline.
-static const int INLINING_SIZE_LIMIT = 15;
+// A limit on how big a function to inline when being careful about size
+static const int CAREFUL_SIZE_LIMIT = 15;
+
+// A limit on how big a function to inline when being more flexible. In
+// particular it's nice that with this limit we can inline the clamp
+// functions (i32s-div, f64-to-int, etc.), which can affect perf.
+static const int FLEXIBLE_SIZE_LIMIT = 20;
+
+// Useful info on a function, helping us decide if we can inline it
+struct FunctionInfo {
+  std::atomic<Index> calls;
+  Index size;
+  bool lightweight = true;
+  bool usedGlobally = false; // in a table or export
+
+  bool worthInlining(PassOptions& options, bool allowMultipleInliningsPerFunction) {
+    // if it's big, it's just not worth doing (TODO: investigate more)
+    if (size > FLEXIBLE_SIZE_LIMIT) return false;
+    // if it has one use, then inlining it would likely reduce code size,
+    // since we are just moving code around (plus optimizing), so it is
+    // worth it if the function is small enough that we are pretty sure
+    // it's ok
+    if (calls == 1 && !usedGlobally && size <= CAREFUL_SIZE_LIMIT) return true;
+    if (!allowMultipleInliningsPerFunction) return false;
+    // more than one use, so we can't eliminate it after inlining, which
+    // means this is only worth it if we really care about speed and don't
+    // care about size, and if it's lightweight, making it a good candidate
+    // for speeding us up
+    return options.optimizeLevel >= 3 && options.shrinkLevel == 0 && lightweight;
+  }
+};
 
-// We only inline a function with a single use.
-static const int SINGLE_USE = 1;
+typedef std::unordered_map<Name, FunctionInfo> NameInfoMap;
 
-// A number of uses of a function that is too high for us to
-// inline it to all those locations.
-static const int TOO_MANY_USES_TO_INLINE = SINGLE_USE + 1;
+struct FunctionInfoScanner : public WalkerPass<PostWalker<FunctionInfoScanner>> {
+  bool isFunctionParallel() override { return true; }
 
-// Map of function name => number of uses. We build the values in
-// parallel, using atomic increments. This is safe because we never
-// update the map itself in parallel, we only update the values,
-// and so the map never allocates or moves values which could be
-// a problem with atomics (in fact it would be a problem in general
-// as well, not just with atomics, as we don't use a lock in
-// parallel access, we depend on the map itself being constant
-// when running multiple threads).
-typedef std::map<Name, std::atomic<Index>> NameToAtomicIndexMap;
+  FunctionInfoScanner(NameInfoMap* infos) : infos(infos) {}
 
-struct FunctionUseCounter : public WalkerPass<PostWalker<FunctionUseCounter>> {
-  bool isFunctionParallel() override { return true; }
+  FunctionInfoScanner* create() override {
+    return new FunctionInfoScanner(infos);
+  }
 
-  FunctionUseCounter(NameToAtomicIndexMap* uses) : uses(uses) {}
+  void visitLoop(Loop* curr) {
+    // having a loop is not lightweight
+    (*infos)[getFunction()->name].lightweight = false;
+  }
 
-  FunctionUseCounter* create() override {
-    return new FunctionUseCounter(uses);
+  void visitCall(Call* curr) {
+    assert(infos->count(curr->target) > 0); // can't add a new element in parallel
+    (*infos)[curr->target].calls++;
+    // having a call is not lightweight
+    (*infos)[getFunction()->name].lightweight = false;
   }
 
-  void visitCall(Call *curr) {
-    assert(uses->count(curr->target) > 0); // can't add a new element in parallel
-    (*uses)[curr->target]++;
+  void visitFunction(Function* curr) {
+    (*infos)[curr->name].size = Measurer::measure(curr->body);
   }
 
 private:
-  NameToAtomicIndexMap* uses;
+  NameInfoMap* infos;
 };
 
 struct InliningAction {
@@ -79,8 +108,8 @@ struct InliningAction {
 };
 
 struct InliningState {
-  std::set<Name> canInline;
-  std::map<Name, std::vector<InliningAction>> actionsForFunction; // function name => actions that can be performed in it
+  std::unordered_set<Name> worthInlining;
+  std::unordered_map<Name, std::vector<InliningAction>> actionsForFunction; // function name => actions that can be performed in it
 };
 
 struct Planner : public WalkerPass<PostWalker<Planner>> {
@@ -95,7 +124,7 @@ struct Planner : public WalkerPass<PostWalker<Planner>> {
   void visitCall(Call* curr) {
     // plan to inline if we know this is valid to inline, and if the call is
     // actually performed - if it is dead code, it's pointless to inline
-    if (state->canInline.count(curr->target) &&
+    if (state->worthInlining.count(curr->target) &&
         curr->type != unreachable) {
       // nest the call in a block. that way the location of the pointer to the call will not
       // change even if we inline multiple times into the same function, otherwise
@@ -110,7 +139,7 @@ struct Planner : public WalkerPass<PostWalker<Planner>> {
   void doWalkFunction(Function* func) {
     // we shouldn't inline into us if we are to be inlined
     // ourselves - that has the risk of cycles
-    if (state->canInline.count(func->name) == 0) {
+    if (state->worthInlining.count(func->name) == 0) {
       walk(func->body);
     }
   }
@@ -169,33 +198,43 @@ struct Inlining : public Pass {
   // whether to optimize where we inline
   bool optimize = false;
 
-  NameToAtomicIndexMap uses;
+  NameInfoMap infos;
+
+  bool firstIteration;
 
   void run(PassRunner* runner, Module* module) override {
     // keep going while we inline, to handle nesting. TODO: optimize
-    calculateUses(module);
-    while (iteration(runner, module)) {}
+    firstIteration = true;
+    while (1) {
+      calculateInfos(module);
+      if (!iteration(runner, module)) {
+        return;
+      }
+      firstIteration = false;
+    }
   }
 
-  void calculateUses(Module* module) {
-    // fill in uses, as we operate on it in parallel (each function to its own entry)
+  void calculateInfos(Module* module) {
+    infos.clear();
+    // fill in info, as we operate on it in parallel (each function to its own entry)
     for (auto& func : module->functions) {
-      uses[func->name].store(0);
+      infos[func->name];
     }
     PassRunner runner(module);
     runner.setIsNested(true);
-    runner.add<FunctionUseCounter>(&uses);
+    runner.add<FunctionInfoScanner>(&infos);
     runner.run();
+    // fill in global uses
     // anything exported or used in a table should not be inlined
    for (auto& ex : module->exports) {
       if (ex->kind == ExternalKind::Function) {
-        uses[ex->value].store(TOO_MANY_USES_TO_INLINE);
+        infos[ex->value].usedGlobally = true;
       }
     }
     for (auto& segment : module->table.segments) {
       for (auto name : segment.data) {
         if (module->getFunctionOrNull(name)) {
-          uses[name].store(TOO_MANY_USES_TO_INLINE);
+          infos[name].usedGlobally = true;
         }
       }
     }
@@ -205,12 +244,12 @@
     // decide which to inline
     InliningState state;
     for (auto& func : module->functions) {
-      auto name = func->name;
-      auto numUses = uses[name].load();
-      if (canInline(numUses) && worthInlining(module->getFunction(name))) {
-        state.canInline.insert(name);
+      // on the first iteration, allow multiple inlinings per function
+      if (infos[func->name].worthInlining(runner->options, firstIteration /* allowMultipleInliningsPerFunction */)) {
+        state.worthInlining.insert(func->name);
       }
     }
+    if (state.worthInlining.size() == 0) return false;
     // fill in actionsForFunction, as we operate on it in parallel (each function to its own entry)
     for (auto& func : module->functions) {
       state.actionsForFunction[func->name];
     }
@@ -222,17 +261,16 @@
       runner.add<Planner>(&state);
       runner.run();
     }
-    // perform inlinings
-    std::set<Name> inlined;
-    std::set<Function*> inlinedInto;
+    // perform inlinings TODO: parallelize
+    std::unordered_map<Name, Index> inlinedUses; // how many uses we inlined
+    std::unordered_set<Function*> inlinedInto; // which functions were inlined into
     for (auto& func : module->functions) {
       for (auto& action : state.actionsForFunction[func->name]) {
         Name inlinedName = action.contents->name;
         doInlining(module, func.get(), action);
-        inlined.insert(inlinedName);
+        inlinedUses[inlinedName]++;
         inlinedInto.insert(func.get());
-        uses[inlinedName]--;
-        assert(uses[inlinedName].load() == 0);
+        assert(inlinedUses[inlinedName] <= infos[inlinedName].calls);
       }
     }
     // anything we inlined into may now have non-unique label names, fix it up
@@ -242,26 +280,20 @@
     if (optimize && inlinedInto.size() > 0) {
       doOptimize(inlinedInto, module, runner);
     }
-    // remove functions that we managed to inline, their one use is gone
+    // remove functions that we no longer need after inlining
     auto& funcs = module->functions;
-    funcs.erase(std::remove_if(funcs.begin(), funcs.end(), [&inlined](const std::unique_ptr<Function>& curr) {
-      return inlined.count(curr->name) > 0;
+    funcs.erase(std::remove_if(funcs.begin(), funcs.end(), [&](const std::unique_ptr<Function>& curr) {
+      auto name = curr->name;
+      auto& info = infos[name];
+      return inlinedUses.count(name) && inlinedUses[name] == info.calls && !info.usedGlobally;
     }), funcs.end());
     // return whether we did any work
-    return inlined.size() > 0;
-  }
-
-  bool canInline(int numUses) {
-    return numUses == SINGLE_USE;
-  }
-
-  bool worthInlining(Function* func) {
-    return Measurer::measure(func->body) <= INLINING_SIZE_LIMIT;
+    return inlinedUses.size() > 0;
   }
 
   // Run useful optimizations after inlining, things like removing
   // unnecessary new blocks, sharing variables, etc.
-  void doOptimize(std::set<Function*>& funcs, Module* module, PassRunner* parentRunner) {
+  void doOptimize(std::unordered_set<Function*>& funcs, Module* module, PassRunner* parentRunner) {
     // save the full list of functions on the side
     std::vector<std::unique_ptr<Function>> all;
     all.swap(module->functions);
-- 
cgit v1.2.3
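
The heuristic this patch introduces can be read as a small decision table. The following is a minimal standalone C++ sketch of it for readers outside the Binaryen tree; it is an illustration, not Binaryen's API: Options stands in for wasm::PassOptions, Info is a simplified non-atomic version of the patch's FunctionInfo, and the values in main() are made up.

#include <cstdint>
#include <iostream>

// The two size limits from the patch.
static const int CAREFUL_SIZE_LIMIT = 15;
static const int FLEXIBLE_SIZE_LIMIT = 20;

// Stand-in for wasm::PassOptions; only the fields the heuristic reads.
struct Options {
  int optimizeLevel; // 2 at -O2, 3 at -O3
  int shrinkLevel;   // nonzero at -Os/-Oz
};

// Simplified, non-atomic version of the patch's FunctionInfo.
struct Info {
  uint32_t calls;    // how many call sites target this function
  uint32_t size;     // body size, as Measurer::measure counts it
  bool lightweight;  // no loops or calls in the body
  bool usedGlobally; // exported or referenced from the table
};

// Mirrors the decision logic of FunctionInfo::worthInlining in the patch.
bool worthInlining(const Info& info, const Options& options,
                   bool allowMultipleInlinings) {
  // too big to consider at all
  if (info.size > FLEXIBLE_SIZE_LIMIT) return false;
  // one private use: inlining moves code rather than duplicating it
  if (info.calls == 1 && !info.usedGlobally &&
      info.size <= CAREFUL_SIZE_LIMIT) return true;
  if (!allowMultipleInlinings) return false;
  // several uses: only when speed trumps size, and only for bodies
  // where the call overhead itself is a large part of the cost
  return options.optimizeLevel >= 3 && options.shrinkLevel == 0 &&
         info.lightweight;
}

int main() {
  Options o2{2, 0}, o3{3, 0};
  Info helper{1, 10, true, false};   // small, one private use
  Info safeDiv{40, 18, true, false}; // e.g. a safe-divide clamp, used widely
  std::cout << worthInlining(helper, o2, true)  // 1: single-use rule
            << worthInlining(safeDiv, o2, true) // 0: multiple uses need -O3
            << worthInlining(safeDiv, o3, true) // 1: lightweight at -O3
            << '\n';                            // prints "101"
}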
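The commit message's third bullet is easier to picture with a concrete helper. The sketch below is a hypothetical C++ rendering of the kind of safe-int-divide clamp the comments mention (i32s-div); the real helpers are wasm functions, and the exact results chosen for the trap cases here follow asm.js-style semantics, which is an assumption for illustration, not taken from this patch.

#include <cstdint>
#include <limits>

// Hypothetical stand-in for an "i32s-div" helper: a signed division that
// never traps. It has no loops and no calls, so the scanner above would
// mark it lightweight, and its body is well under FLEXIBLE_SIZE_LIMIT.
int32_t safe_i32_div_s(int32_t x, int32_t y) {
  if (y == 0) return 0; // wasm i32.div_s would trap here
  if (x == std::numeric_limits<int32_t>::min() && y == -1) return x; // overflow would trap too
  return x / y;
}

Every integer division in the program becomes a call to such a helper, so it has many uses and the single-use rule never fires; the -O3 rule is what lets the pass inline it at each division site, which is where the "around half of the overhead" claim in the commit message comes from.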