diff options
Diffstat (limited to 'src/passes/ReorderGlobals.cpp')
-rw-r--r-- | src/passes/ReorderGlobals.cpp | 397 |
1 files changed, 326 insertions, 71 deletions
diff --git a/src/passes/ReorderGlobals.cpp b/src/passes/ReorderGlobals.cpp index 9b5940a21..4602f3284 100644 --- a/src/passes/ReorderGlobals.cpp +++ b/src/passes/ReorderGlobals.cpp @@ -19,6 +19,17 @@ // binaries because fewer bytes are needed to encode references to frequently // used globals. // +// This pass also sorts by dependencies, as each global can only appear after +// those it refers to. Other passes can use this internally to fix the ordering +// after they add new globals. +// +// The "always" variant of this pass will always sort globals, even if there are +// so few they all fit in a single LEB. In "always" mode we sort regardless and +// we measure size by considering each subsequent index to have a higher cost. +// That is, in reality the size of all globals up to 128 is a single byte, and +// then the LEB grows to 2, while in "always" mode we increase the size by 1/128 +// for each global in a smooth manner. "Always" is mainly useful for testing. +// #include "memory" @@ -29,14 +40,15 @@ namespace wasm { -using NameCountMap = std::unordered_map<Name, std::atomic<Index>>; +// We'll count uses in parallel. +using AtomicNameCountMap = std::unordered_map<Name, std::atomic<Index>>; struct UseCountScanner : public WalkerPass<PostWalker<UseCountScanner>> { bool isFunctionParallel() override { return true; } bool modifiesBinaryenIR() override { return false; } - UseCountScanner(NameCountMap& counts) : counts(counts) {} + UseCountScanner(AtomicNameCountMap& counts) : counts(counts) {} std::unique_ptr<Pass> create() override { return std::make_unique<UseCountScanner>(counts); @@ -53,112 +65,355 @@ struct UseCountScanner : public WalkerPass<PostWalker<UseCountScanner>> { } private: - NameCountMap& counts; + AtomicNameCountMap& counts; }; struct ReorderGlobals : public Pass { - // Whether to always reorder globals, even if there are very few and the - // benefit is minor. 
That is useful for testing, and also internally in passes - // that use us to reorder them so dependencies appear first (that is, if a - // pass ends up with an earlier global reading a later one, the sorting in - // this pass will reorder them properly; we need to take those dependencies - // into account anyhow here). bool always; ReorderGlobals(bool always) : always(always) {} + // For efficiency we will use global indices rather than names. That is, we + // use the index of the global in the original ordering to identify each + // global. A different ordering is then a vector of new indices, saying where + // each one moves, which is logically a mapping between indices. + using IndexIndexMap = std::vector<Index>; + + // We will also track counts of uses for each global. We use floating-point + // values here since while the initial counts are integers, we will be + // considering fractional sums of them later. + using IndexCountMap = std::vector<double>; + + // We must take into account dependencies, so that globals appear before + // their users in other globals: + // + // (global $a i32 (i32.const 10)) + // (global $b i32 (global.get $a)) ;; $b depends on $a; $a must be first + // + // To do so we construct a map from each global to those it depends on. We + // also build the reverse map, of those that it is depended upon by. + struct Dependencies { + std::unordered_map<Index, std::unordered_set<Index>> dependsOn; + std::unordered_map<Index, std::unordered_set<Index>> dependedUpon; + }; + void run(Module* module) override { - if (module->globals.size() < 128 && !always) { + auto& globals = module->globals; + + if (globals.size() < 128 && !always) { // The module has so few globals that they all fit in a single-byte U32LEB // value, so no reordering we can do can actually decrease code size. 
Note // that this is the common case with wasm MVP modules where the only // globals are typically the stack pointer and perhaps a handful of others // (however, features like wasm GC there may be a great many globals). + // TODO: we still need to sort here to fix dependencies sometimes return; } - NameCountMap counts; + AtomicNameCountMap atomicCounts; // Fill in info, as we'll operate on it in parallel. - for (auto& global : module->globals) { - counts[global->name]; + for (auto& global : globals) { + atomicCounts[global->name]; } // Count uses. - UseCountScanner scanner(counts); + UseCountScanner scanner(atomicCounts); scanner.run(getPassRunner(), module); scanner.runOnModuleCode(getPassRunner(), module); - // Do a toplogical sort to ensure we keep dependencies before the things - // that need them. For example, if $b's definition depends on $a then $b - // must appear later: - // - // (global $a i32 (i32.const 10)) - // (global $b i32 (global.get $a)) ;; $b depends on $a - // - struct DependencySort : TopologicalSort<Name, DependencySort> { - Module& wasm; - const NameCountMap& counts; - - std::unordered_map<Name, std::vector<Name>> deps; - - DependencySort(Module& wasm, const NameCountMap& counts) - : wasm(wasm), counts(counts) { - // Sort a list of global names by their counts. - auto sort = [&](std::vector<Name>& globals) { - std::stable_sort( - globals.begin(), globals.end(), [&](const Name& a, const Name& b) { - return counts.at(a) < counts.at(b); - }); - }; - - // Sort the globals. - std::vector<Name> sortedNames; - for (auto& global : wasm.globals) { - sortedNames.push_back(global->name); - } - sort(sortedNames); + // Switch to non-atomic for all further processing, and convert names to + // indices. 
+ std::unordered_map<Name, Index> originalIndices; + for (Index i = 0; i < globals.size(); i++) { + originalIndices[globals[i]->name] = i; + } + IndexCountMap counts(globals.size()); + for (auto& [name, count] : atomicCounts) { + counts[originalIndices[name]] = count; + } - // Everything is a root (we need to emit all globals). - for (auto global : sortedNames) { - push(global); + // Compute dependencies. + Dependencies deps; + for (Index i = 0; i < globals.size(); i++) { + auto& global = globals[i]; + if (!global->imported()) { + for (auto* get : FindAll<GlobalGet>(global->init).list) { + auto getIndex = originalIndices[get->name]; + deps.dependsOn[i].insert(getIndex); + deps.dependedUpon[getIndex].insert(i); } + } + } + + // Compute various sorting options. All the options use a variation of the + // algorithm in doSort() below, see there for more details; the only + // difference between the sorts is in the use counts we provide it. + struct SortAndSize { + IndexIndexMap sort; + double size; + SortAndSize(IndexIndexMap&& sort, double size) + : sort(std::move(sort)), size(size) {} + }; + std::vector<SortAndSize> options; + auto addOption = [&](const IndexCountMap& customCounts) { + // Compute the sort using custom counts that guide us how to order. + auto sort = doSort(customCounts, deps, module); + // Compute the size using the true counts. + auto size = computeSize(sort, counts); + options.emplace_back(std::move(sort), size); + }; - // The dependencies are the globals referred to. - for (auto& global : wasm.globals) { - if (global->imported()) { - continue; - } - std::vector<Name> vec; - for (auto* get : FindAll<GlobalGet>(global->init).list) { - vec.push_back(get->name); - } - sort(vec); - deps[global->name] = std::move(vec); + // Compute the closest thing we can to the original, unoptimized sort, by + // setting all counts to 0 there, so it only takes into account dependencies + // and the original ordering and nothing else. 
+ // + // We put this sort first because if they all end up with equal size we + // prefer it (as it avoids pointless changes). + IndexCountMap zeroes(globals.size()); + addOption(zeroes); + + // A simple sort that uses the counts. As the algorithm in doSort() is + // greedy (see below), this is a pure greedy sort (at each point in time it + // picks the global with the highest count that it can). + addOption(counts); + + // We can make the sort less greedy by adding to each global's count some of + // the count of its children. Then we'd still be picking in a greedy manner + // but our choices would be influenced by the bigger picture of what can be + // unlocked by emitting a particular global (it may have a low count itself, + // but allow emitting something that depends on it that has a high count). + // We try two variations of this, one which is a simple sum (add the + // dependent's counts to the global's) and one which exponentially decreases + // them (that is, we add a small multiple of the dependent's counts, which + // may help as the dependents also depend on other things potentially, so a + // simple sum may be too naive). 
+ double const EXPONENTIAL_FACTOR = 0.095; + IndexCountMap sumCounts(globals.size()), exponentialCounts(globals.size()); + + struct Sort : public TopologicalSort<Index, Sort> { + const Dependencies& deps; + + Sort(Index numGlobals, const Dependencies& deps) : deps(deps) { + for (Index i = 0; i < numGlobals; i++) { + push(i); } } - void pushPredecessors(Name global) { - for (auto pred : deps[global]) { - push(pred); + void pushPredecessors(Index global) { + auto iter = deps.dependedUpon.find(global); + if (iter == deps.dependedUpon.end()) { + return; + } + for (auto dep : iter->second) { + push(dep); } } - }; + } sort(globals.size(), deps); - std::unordered_map<Name, Index> sortedIndexes; - for (auto global : DependencySort(*module, counts)) { - auto index = sortedIndexes.size(); - sortedIndexes[global] = index; + for (auto global : sort) { + // We can compute this global's count as in the sorted order all the + // values it cares about are resolved. Start with the self-count, then + // add the deps. + sumCounts[global] = exponentialCounts[global] = counts[global]; + for (auto dep : deps.dependedUpon[global]) { + sumCounts[global] += sumCounts[dep]; + exponentialCounts[global] += + EXPONENTIAL_FACTOR * exponentialCounts[dep]; + } } - std::sort( - module->globals.begin(), - module->globals.end(), - [&](const std::unique_ptr<Global>& a, const std::unique_ptr<Global>& b) { - return sortedIndexes[a->name] < sortedIndexes[b->name]; - }); + addOption(sumCounts); + addOption(exponentialCounts); + // Pick the best out of all the options we computed. + IndexIndexMap* best = nullptr; + double bestSize; + for (auto& [sort, size] : options) { + if (!best || size < bestSize) { + best = &sort; + bestSize = size; + } + } + + // Apply the indices we computed. 
+ std::vector<std::unique_ptr<Global>> old(std::move(globals)); + globals.resize(old.size()); + for (Index i = 0; i < old.size(); i++) { + globals[(*best)[i]] = std::move(old[i]); + } module->updateMaps(); } + + IndexIndexMap doSort(const IndexCountMap& counts, + const Dependencies& originalDeps, + Module* module) { + auto& globals = module->globals; + + // Copy the deps as we will operate on them as we go. + auto deps = originalDeps; + + // To sort the globals we do a simple greedy approach of always picking the + // global with the highest count at every point in time, subject to the + // constraint that we can only emit globals that have all of their + // dependencies already emitted. To do so we keep a list of the "available" + // globals, which are those with no remaining dependencies. Then by keeping + // the list of available globals in heap form we can simply pop the largest + // from the heap each time, and add new available ones as they become so. + // + // Other approaches here could be to do a topological sort, but the optimal + // order may not require strict ordering by topological depth, e.g.: + /* + // $c - $a + // / + // $e + // \ + // $d - $b + */ + // Here $e depends on $c and $d, $c depends on $a, and $d on $b. This is a + // partial order, as $d can be before or after $a, for example. As a result, + // if we sorted topologically by sub-trees here then we'd keep $c and $a + // together, and $d and $b, but a better order might interleave them. A good + // order also may not keep topological depths separated, e.g. we may want to + // put $a in between $c and $d despite it having a greater depth. + // + // The greedy approach here may also be unoptimal, however. Consider that we + // might see that the best available global is $a, but if we popped $b + // instead that could unlock $c which depends on $b, and $c may have a much + // higher use count than $a. For that reason we try several variations of + // this with different counts, see earlier. 
+ std::vector<Index> availableHeap; + + // Comparison function. Given a and b, returns if a should be before b. This + // is used in a heap, where "highest" means "popped first", so see the notes + // below on how we order. + auto cmp = [&](Index a, Index b) { + // Imports always go first. The binary writer takes care of this itself + // anyhow, but it is better to do it here in the IR so we can actually + // see what the final layout will be. + auto aImported = globals[a]->imported(); + auto bImported = globals[b]->imported(); + // The highest items will be popped first off the heap, so we want imports + // to be at higher indexes, that is, + // + // unimported, unimported, imported, imported. + // + // Then the imports are popped first. + if (aImported != bImported) { + return bImported; + } + + // Sort by the counts. We want higher counts at higher indexes so they are + // popped first, that is, + // + // 10, 20, 30, 40 + // + auto aCount = counts[a]; + auto bCount = counts[b]; + if (aCount != bCount) { + return aCount < bCount; + } + + // Break ties using the original order, which means just using the + // indices we have. We need lower indexes at the top so they are popped + // first, that is, + // + // 3, 2, 1, 0 + // + return a > b; + }; + + // Push an item that just became available to the available heap. + auto push = [&](Index global) { + availableHeap.push_back(global); + std::push_heap(availableHeap.begin(), availableHeap.end(), cmp); + }; + + // The initially available globals are those with no dependencies. + for (Index i = 0; i < globals.size(); i++) { + if (deps.dependsOn[i].empty()) { + push(i); + } + } + + // Pop off the heap: Emit the global and its final, sorted index. Keep + // doing that until we finish processing all the globals. 
+ IndexIndexMap sortedindices(globals.size()); + Index numSortedindices = 0; + while (!availableHeap.empty()) { + std::pop_heap(availableHeap.begin(), availableHeap.end(), cmp); + auto global = availableHeap.back(); + sortedindices[global] = numSortedindices++; + availableHeap.pop_back(); + + // Each time we pop we emit the global, which means anything that only + // depended on it becomes available to be popped as well. + for (auto other : deps.dependedUpon[global]) { + assert(deps.dependsOn[other].count(global)); + deps.dependsOn[other].erase(global); + if (deps.dependsOn[other].empty()) { + push(other); + } + } + } + + // All globals must have been handled. Cycles would prevent this, but they + // cannot exist in valid IR. + assert(numSortedindices == globals.size()); + + return sortedindices; + } + + // Given an indexing of the globals and the counts of how many times each is + // used, estimate the size of relevant parts of the wasm binary (that is, of + // LEBs in global.gets). + double computeSize(IndexIndexMap& indices, IndexCountMap& counts) { + // |indices| maps each old index to its new position in the sort. We need + // the reverse map here, which at index 0 has the old index of the global + // that will be first, and so forth. + IndexIndexMap actualOrder(indices.size()); + for (Index i = 0; i < indices.size(); i++) { + // Each global has a unique index, so we only replace 0's here, and they + // must be in bounds. + assert(indices[i] < indices.size()); + assert(actualOrder[indices[i]] == 0); + + actualOrder[indices[i]] = i; + } + + if (always) { + // In this mode we gradually increase the cost of later globals, in an + // unrealistic but smooth manner. 
+ double total = 0; + for (Index i = 0; i < actualOrder.size(); i++) { + // Multiply the count for this global by a smoothed LEB factor, which + // starts at 1 (for 1 byte) at index 0, and then increases linearly with + // i, so that after 128 globals we reach 2 (which is the true index at + // which the LEB size normally jumps from 1 to 2), and so forth. + total += counts[actualOrder[i]] * (1.0 + (i / 128.0)); + } + return total; + } + + // The total size we are computing. + double total = 0; + // Track the size in bits and the next index at which the size increases. At + // the first iteration we'll compute the size of the LEB for index 0, and so + // forth. + size_t sizeInBits = 0; + size_t nextSizeIncrease = 0; + for (Index i = 0; i < actualOrder.size(); i++) { + if (i == nextSizeIncrease) { + sizeInBits++; + // At the current size we have 7 * sizeInBits bits to use. For example, + // at index 0 the size is 1 and we'll compute 128 here, and indeed after + // emitting 128 globals (0,..,127) the 129th (at index 128) requires a + // larger LEB. + nextSizeIncrease = 1 << (7 * sizeInBits); + } + total += counts[actualOrder[i]] * sizeInBits; + } + return total; + } }; Pass* createReorderGlobalsPass() { return new ReorderGlobals(false); } |