/*
 * Copyright 2022 WebAssembly Community Group participants
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

//
// Sorts globals by their static use count. This helps reduce the size of wasm
// binaries because fewer bytes are needed to encode references to frequently
// used globals.
//
// This pass also sorts by dependencies, as each global can only appear after
// those it refers to. Other passes can use this internally to fix the ordering
// after they add new globals.
//
// The "always" variant of this pass will always sort globals, even if there
// are so few that they all fit in a single-byte LEB. In "always" mode we sort
// regardless, and we measure size by considering each subsequent index to have
// a higher cost. That is, in reality the indices of all globals up to 128 take
// a single byte, and then the LEB grows to 2 bytes, while in "always" mode we
// increase the size by 1/128 for each global, in a smooth manner. "Always" is
// mainly useful for testing.
//

#include <algorithm>
#include <atomic>
#include <memory>

#include "ir/find_all.h"
#include "pass.h"
#include "support/topological_sort.h"
#include "wasm.h"

namespace wasm {

// We'll count uses in parallel.
using AtomicNameCountMap = std::unordered_map<Name, std::atomic<Index>>;

struct UseCountScanner : public WalkerPass<PostWalker<UseCountScanner>> {
  bool isFunctionParallel() override { return true; }

  bool modifiesBinaryenIR() override { return false; }

  UseCountScanner(AtomicNameCountMap& counts) : counts(counts) {}

  std::unique_ptr<Pass> create() override {
    return std::make_unique<UseCountScanner>(counts);
  }

  void visitGlobalGet(GlobalGet* curr) {
    // We can't add a new element to the map in parallel.
    assert(counts.count(curr->name) > 0);
    counts[curr->name]++;
  }
  void visitGlobalSet(GlobalSet* curr) {
    assert(counts.count(curr->name) > 0);
    counts[curr->name]++;
  }

private:
  AtomicNameCountMap& counts;
};

struct ReorderGlobals : public Pass {
  bool always;

  ReorderGlobals(bool always) : always(always) {}

  // For efficiency we will use global indices rather than names. That is, we
  // use the index of the global in the original ordering to identify each
  // global. A different ordering is then a vector of new indices, saying where
  // each one moves, which is logically a mapping between indices.
  using IndexIndexMap = std::vector<Index>;

  // We will also track counts of uses for each global. We use floating-point
  // values here since, while the initial counts are integers, we will be
  // considering fractional sums of them later.
  using IndexCountMap = std::vector<double>;

  // We must take into account dependencies, so that globals appear before
  // their users in other globals:
  //
  //   (global $a i32 (i32.const 10))
  //   (global $b i32 (global.get $a)) ;; $b depends on $a; $a must be first
  //
  // To do so we construct a map from each global to those it depends on. We
  // also build the reverse map, of those that it is depended upon by.
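  //
  // For the example above, dependsOn would map $b's index to { $a's index },
  // and dependedUpon would map $a's index to { $b's index }.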
  struct Dependencies {
    std::unordered_map<Index, std::unordered_set<Index>> dependsOn;
    std::unordered_map<Index, std::unordered_set<Index>> dependedUpon;
  };

  void run(Module* module) override {
    auto& globals = module->globals;

    if (globals.size() < 128 && !always) {
      // The module has so few globals that they all fit in a single-byte
      // U32LEB value, so no reordering we can do can actually decrease code
      // size. Note that this is the common case with wasm MVP modules, where
      // the only globals are typically the stack pointer and perhaps a handful
      // of others (however, with features like wasm GC there may be a great
      // many globals).
      // TODO: we still need to sort here to fix dependencies sometimes
      return;
    }

    AtomicNameCountMap atomicCounts;
    // Fill in info, as we'll operate on it in parallel.
    for (auto& global : globals) {
      atomicCounts[global->name];
    }

    // Count uses.
    UseCountScanner scanner(atomicCounts);
    scanner.run(getPassRunner(), module);
    scanner.runOnModuleCode(getPassRunner(), module);

    // Switch to non-atomic for all further processing, and convert names to
    // indices.
    std::unordered_map<Name, Index> originalIndices;
    for (Index i = 0; i < globals.size(); i++) {
      originalIndices[globals[i]->name] = i;
    }
    IndexCountMap counts(globals.size());
    for (auto& [name, count] : atomicCounts) {
      counts[originalIndices[name]] = count;
    }

    // Compute dependencies.
    Dependencies deps;
    for (Index i = 0; i < globals.size(); i++) {
      auto& global = globals[i];
      if (!global->imported()) {
        for (auto* get : FindAll<GlobalGet>(global->init).list) {
          auto getIndex = originalIndices[get->name];
          deps.dependsOn[i].insert(getIndex);
          deps.dependedUpon[getIndex].insert(i);
        }
      }
    }

    // Compute various sorting options. All the options use a variation of the
    // algorithm in doSort() below; see there for more details. The only
    // difference between the sorts is in the use counts we provide it.
    struct SortAndSize {
      IndexIndexMap sort;
      double size;
      SortAndSize(IndexIndexMap&& sort, double size)
        : sort(std::move(sort)), size(size) {}
    };
    std::vector<SortAndSize> options;
    auto addOption = [&](const IndexCountMap& customCounts) {
      // Compute the sort using custom counts that guide us how to order.
      auto sort = doSort(customCounts, deps, module);
      // Compute the size using the true counts.
      auto size = computeSize(sort, counts);
      options.emplace_back(std::move(sort), size);
    };

    // Compute the closest thing we can to the original, unoptimized sort, by
    // setting all counts to 0 there, so that it only takes into account
    // dependencies and the original ordering and nothing else.
    //
    // We put this sort first because if they all end up with equal size we
    // prefer it (as it avoids pointless changes).
    IndexCountMap zeroes(globals.size());
    addOption(zeroes);

    // A simple sort that uses the counts. As the algorithm in doSort() is
    // greedy (see below), this is a pure greedy sort (at each point in time it
    // picks the global with the highest count that it can).
    addOption(counts);

    // We can make the sort less greedy by adding to each global's count some
    // of the count of its children. Then we'd still be picking in a greedy
    // manner, but our choices would be influenced by the bigger picture of
    // what can be unlocked by emitting a particular global (it may have a low
    // count itself, but allow emitting something that depends on it that has a
    // high count).
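    // For example (with hypothetical counts), if $a is used just once but $b,
    // which depends on $a, is used 100 times, then crediting some of $b's
    // count to $a encourages the sort to emit $a (and then $b) early.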
    // We try two variations of this: one which is a simple sum (add the
    // dependents' counts to the global's), and one which exponentially
    // decreases them (that is, we add a small multiple of the dependents'
    // counts, which may help as the dependents potentially depend on other
    // things as well, so a simple sum may be too naive).
    const double EXPONENTIAL_FACTOR = 0.095;
    IndexCountMap sumCounts(globals.size()), exponentialCounts(globals.size());

    struct Sort : public TopologicalSort<Index, Sort> {
      const Dependencies& deps;

      Sort(Index numGlobals, const Dependencies& deps) : deps(deps) {
        for (Index i = 0; i < numGlobals; i++) {
          push(i);
        }
      }

      void pushPredecessors(Index global) {
        auto iter = deps.dependedUpon.find(global);
        if (iter == deps.dependedUpon.end()) {
          return;
        }
        for (auto dep : iter->second) {
          push(dep);
        }
      }
    } sort(globals.size(), deps);

    for (auto global : sort) {
      // We can compute this global's count, as in the sorted order all the
      // values it cares about are resolved. Start with the self-count, then
      // add the deps.
      sumCounts[global] = exponentialCounts[global] = counts[global];

      for (auto dep : deps.dependedUpon[global]) {
        sumCounts[global] += sumCounts[dep];
        exponentialCounts[global] +=
          EXPONENTIAL_FACTOR * exponentialCounts[dep];
      }
    }
    addOption(sumCounts);
    addOption(exponentialCounts);

    // Pick the best out of all the options we computed.
    IndexIndexMap* best = nullptr;
    double bestSize;
    for (auto& [sort, size] : options) {
      if (!best || size < bestSize) {
        best = &sort;
        bestSize = size;
      }
    }

    // Apply the indices we computed.
    std::vector<std::unique_ptr<Global>> old(std::move(globals));
    globals.resize(old.size());
    for (Index i = 0; i < old.size(); i++) {
      globals[(*best)[i]] = std::move(old[i]);
    }
    module->updateMaps();
  }

  IndexIndexMap doSort(const IndexCountMap& counts,
                       const Dependencies& originalDeps,
                       Module* module) {
    auto& globals = module->globals;

    // Copy the deps, as we will operate on them as we go.
    auto deps = originalDeps;

    // To sort the globals we take a simple greedy approach of always picking
    // the global with the highest count at every point in time, subject to the
    // constraint that we can only emit globals that have all of their
    // dependencies already emitted. To do so we keep a list of the "available"
    // globals, which are those with no remaining dependencies. By keeping the
    // list of available globals in heap form we can simply pop the largest
    // from the heap each time, and add new available ones as they become so.
    //
    // Another approach here could be a topological sort, but the optimal order
    // may not require strict ordering by topological depth, e.g.:
    //
    //     $c - $a
    //    /
    //  $e
    //    \
    //     $d - $b
    //
    // Here $e depends on $c and $d, $c depends on $a, and $d on $b. This is a
    // partial order, as $d can be before or after $a, for example. As a
    // result, if we sorted topologically by sub-trees here then we'd keep $c
    // and $a together, and $d and $b, but a better order might interleave
    // them. A good order also may not keep topological depths separated, e.g.
    // we may want to put $a in between $c and $d despite it having a greater
    // depth.
    //
    // The greedy approach here may also be suboptimal, however. Consider that
    // we might see that the best available global is $a, but if we popped $b
    // instead that could unlock $c, which depends on $b, and $c may have a
    // much higher use count than $a.
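    // For instance, with hypothetical counts $a = 10, $b = 1, $c = 100, where
    // only $c depends on $b, the greedy order is $a, $b, $c, while $b, $c, $a
    // would give the much hotter $c a lower index.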
    // For that reason we try several variations of this with different
    // counts, see earlier.
    std::vector<Index> availableHeap;

    // Comparison function. Given a and b, returns whether a should be before
    // b. This is used in a heap, where "highest" means "popped first", so see
    // the notes below on how we order.
    auto cmp = [&](Index a, Index b) {
      // Imports always go first. The binary writer takes care of this itself
      // anyhow, but it is better to do it here in the IR so we can actually
      // see what the final layout will be.
      auto aImported = globals[a]->imported();
      auto bImported = globals[b]->imported();
      // The highest items will be popped first off the heap, so we want
      // imports to be at higher indexes, that is,
      //
      //   unimported, unimported, imported, imported.
      //
      // Then the imports are popped first.
      if (aImported != bImported) {
        return bImported;
      }

      // Sort by the counts. We want higher counts at higher indexes so they
      // are popped first, that is,
      //
      //   10, 20, 30, 40
      //
      auto aCount = counts[a];
      auto bCount = counts[b];
      if (aCount != bCount) {
        return aCount < bCount;
      }

      // Break ties using the original order, which means just using the
      // indices we have. We need lower indexes at the top so they are popped
      // first, that is,
      //
      //   3, 2, 1, 0
      //
      return a > b;
    };

    // Push an item that just became available to the available heap.
    auto push = [&](Index global) {
      availableHeap.push_back(global);
      std::push_heap(availableHeap.begin(), availableHeap.end(), cmp);
    };

    // The initially available globals are those with no dependencies.
    for (Index i = 0; i < globals.size(); i++) {
      if (deps.dependsOn[i].empty()) {
        push(i);
      }
    }

    // Pop off the heap: Emit the global and assign its final, sorted index.
    // Keep doing that until we finish processing all the globals.
    IndexIndexMap sortedIndices(globals.size());
    Index numSortedIndices = 0;
    while (!availableHeap.empty()) {
      std::pop_heap(availableHeap.begin(), availableHeap.end(), cmp);
      auto global = availableHeap.back();
      sortedIndices[global] = numSortedIndices++;
      availableHeap.pop_back();

      // Each time we pop we emit the global, which means anything that only
      // depended on it becomes available to be popped as well.
      for (auto other : deps.dependedUpon[global]) {
        assert(deps.dependsOn[other].count(global));
        deps.dependsOn[other].erase(global);
        if (deps.dependsOn[other].empty()) {
          push(other);
        }
      }
    }

    // All globals must have been handled. Cycles would prevent this, but they
    // cannot exist in valid IR.
    assert(numSortedIndices == globals.size());

    return sortedIndices;
  }

  // Given an indexing of the globals and the counts of how many times each is
  // used, estimate the size of the relevant parts of the wasm binary (that is,
  // of the LEBs in global.gets).
  double computeSize(IndexIndexMap& indices, IndexCountMap& counts) {
    // |indices| maps each old index to its new position in the sort. We need
    // the reverse map here, which at index 0 has the old index of the global
    // that will be first, and so forth.
    IndexIndexMap actualOrder(indices.size());
    for (Index i = 0; i < indices.size(); i++) {
      // Each global has a unique index, so we only replace 0's here, and they
      // must be in bounds.
      assert(indices[i] < indices.size());
      assert(actualOrder[indices[i]] == 0);
      actualOrder[indices[i]] = i;
    }
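    // For example, if |indices| is [2, 0, 1] (that is, the first global moves
    // to position 2, the second to position 0, and the third to position 1),
    // then |actualOrder| is [1, 2, 0].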
    if (always) {
      // In this mode we gradually increase the cost of later globals, in an
      // unrealistic but smooth manner.
      double total = 0;
      for (Index i = 0; i < actualOrder.size(); i++) {
        // Multiply the count for this global by a smoothed LEB factor, which
        // starts at 1 (for 1 byte) at index 0, and then increases linearly
        // with i, so that after 128 globals we reach 2 (which is the true
        // index at which the LEB size normally jumps from 1 to 2), and so
        // forth.
        total += counts[actualOrder[i]] * (1.0 + (i / 128.0));
      }
      return total;
    }

    // The total size we are computing.
    double total = 0;

    // Track the size in bits and the next index at which the size increases.
    // At the first iteration we'll compute the size of the LEB for index 0,
    // and so forth.
    size_t sizeInBits = 0;
    size_t nextSizeIncrease = 0;
    for (Index i = 0; i < actualOrder.size(); i++) {
      if (i == nextSizeIncrease) {
        sizeInBits++;
        // At the current size we have 7 * sizeInBits bits to use. For example,
        // at index 0 the size is 1 and we'll compute 128 here, and indeed
        // after emitting 128 globals (0,..,127) the 129th (at index 128)
        // requires a larger LEB.
        nextSizeIncrease = size_t(1) << (7 * sizeInBits);
      }
      total += counts[actualOrder[i]] * sizeInBits;
    }
    return total;
  }
};

Pass* createReorderGlobalsPass() { return new ReorderGlobals(false); }

Pass* createReorderGlobalsAlwaysPass() { return new ReorderGlobals(true); }

} // namespace wasm