diff options
Diffstat (limited to 'src/passes/ReorderGlobals.cpp')
-rw-r--r-- | src/passes/ReorderGlobals.cpp | 397 |
1 files changed, 326 insertions, 71 deletions
diff --git a/src/passes/ReorderGlobals.cpp b/src/passes/ReorderGlobals.cpp index 9b5940a21..4602f3284 100644 --- a/src/passes/ReorderGlobals.cpp +++ b/src/passes/ReorderGlobals.cpp @@ -19,6 +19,17 @@ // binaries because fewer bytes are needed to encode references to frequently // used globals. // +// This pass also sorts by dependencies, as each global can only appear after +// those it refers to. Other passes can use this internally to fix the ordering +// after they add new globals. +// +// The "always" variant of this pass will always sort globals, even if there are +// so few they all fit in a single LEB. In "always" mode we sort regardless and +// we measure size by considering each subsequent index to have a higher cost. +// That is, in reality the size of all globals up to 128 is a single byte, and +// then the LEB grows to 2, while in "always" mode we increase the size by 1/128 +// for each global in a smooth manner. "Always" is mainly useful for testing. +// #include "memory" @@ -29,14 +40,15 @@ namespace wasm { -using NameCountMap = std::unordered_map<Name, std::atomic<Index>>; +// We'll count uses in parallel. +using AtomicNameCountMap = std::unordered_map<Name, std::atomic<Index>>; struct UseCountScanner : public WalkerPass<PostWalker<UseCountScanner>> { bool isFunctionParallel() override { return true; } bool modifiesBinaryenIR() override { return false; } - UseCountScanner(NameCountMap& counts) : counts(counts) {} + UseCountScanner(AtomicNameCountMap& counts) : counts(counts) {} std::unique_ptr<Pass> create() override { return std::make_unique<UseCountScanner>(counts); @@ -53,112 +65,355 @@ struct UseCountScanner : public WalkerPass<PostWalker<UseCountScanner>> { } private: - NameCountMap& counts; + AtomicNameCountMap& counts; }; struct ReorderGlobals : public Pass { - // Whether to always reorder globals, even if there are very few and the - // benefit is minor. 
That is useful for testing, and also internally in passes - // that use us to reorder them so dependencies appear first (that is, if a - // pass ends up with an earlier global reading a later one, the sorting in - // this pass will reorder them properly; we need to take those dependencies - // into account anyhow here). bool always; ReorderGlobals(bool always) : always(always) {} + // For efficiency we will use global indices rather than names. That is, we + // use the index of the global in the original ordering to identify each + // global. A different ordering is then a vector of new indices, saying where + // each one moves, which is logically a mapping between indices. + using IndexIndexMap = std::vector<Index>; + + // We will also track counts of uses for each global. We use floating-point + // values here since while the initial counts are integers, we will be + // considering fractional sums of them later. + using IndexCountMap = std::vector<double>; + + // We must take into account dependencies, so that globals appear before + // their users in other globals: + // + // (global $a i32 (i32.const 10)) + // (global $b i32 (global.get $a)) ;; $b depends on $a; $a must be first + // + // To do so we construct a map from each global to those it depends on. We + // also build the reverse map, of those that it is depended upon by. + struct Dependencies { + std::unordered_map<Index, std::unordered_set<Index>> dependsOn; + std::unordered_map<Index, std::unordered_set<Index>> dependedUpon; + }; + void run(Module* module) override { - if (module->globals.size() < 128 && !always) { + auto& globals = module->globals; + + if (globals.size() < 128 && !always) { // The module has so few globals that they all fit in a single-byte U32LEB // value, so no reordering we can do can actually decrease code size. 
Note // that this is the common case with wasm MVP modules where the only // globals are typically the stack pointer and perhaps a handful of others // (however, features like wasm GC there may be a great many globals). + // TODO: we still need to sort here to fix dependencies sometimes return; } - NameCountMap counts; + AtomicNameCountMap atomicCounts; // Fill in info, as we'll operate on it in parallel. - for (auto& global : module->globals) { - counts[global->name]; + for (auto& global : globals) { + atomicCounts[global->name]; } // Count uses. - UseCountScanner scanner(counts); + UseCountScanner scanner(atomicCounts); scanner.run(getPassRunner(), module); scanner.runOnModuleCode(getPassRunner(), module); - // Do a toplogical sort to ensure we keep dependencies before the things - // that need them. For example, if $b's definition depends on $a then $b - // must appear later: - // - // (global $a i32 (i32.const 10)) - // (global $b i32 (global.get $a)) ;; $b depends on $a - // - struct DependencySort : TopologicalSort<Name, DependencySort> { - Module& wasm; - const NameCountMap& counts; - - std::unordered_map<Name, std::vector<Name>> deps; - - DependencySort(Module& wasm, const NameCountMap& counts) - : wasm(wasm), counts(counts) { - // Sort a list of global names by their counts. - auto sort = [&](std::vector<Name>& globals) { - std::stable_sort( - globals.begin(), globals.end(), [&](const Name& a, const Name& b) { - return counts.at(a) < counts.at(b); - }); - }; - - // Sort the globals. - std::vector<Name> sortedNames; - for (auto& global : wasm.globals) { - sortedNames.push_back(global->name); - } - sort(sortedNames); + // Switch to non-atomic for all further processing, and convert names to + // indices. 
+ std::unordered_map<Name, Index> originalIndices; + for (Index i = 0; i < globals.size(); i++) { + originalIndices[globals[i]->name] = i; + } + IndexCountMap counts(globals.size()); + for (auto& [name, count] : atomicCounts) { + counts[originalIndices[name]] = count; + } - // Everything is a root (we need to emit all globals). - for (auto global : sortedNames) { - push(global); + // Compute dependencies. + Dependencies deps; + for (Index i = 0; i < globals.size(); i++) { + auto& global = globals[i]; + if (!global->imported()) { + for (auto* get : FindAll<GlobalGet>(global->init).list) { + auto getIndex = originalIndices[get->name]; + deps.dependsOn[i].insert(getIndex); + deps.dependedUpon[getIndex].insert(i); } + } + } + + // Compute various sorting options. All the options use a variation of the + // algorithm in doSort() below, see there for more details; the only + // difference between the sorts is in the use counts we provide it. + struct SortAndSize { + IndexIndexMap sort; + double size; + SortAndSize(IndexIndexMap&& sort, double size) + : sort(std::move(sort)), size(size) {} + }; + std::vector<SortAndSize> options; + auto addOption = [&](const IndexCountMap& customCounts) { + // Compute the sort using custom counts that guide us how to order. + auto sort = doSort(customCounts, deps, module); + // Compute the size using the true counts. + auto size = computeSize(sort, counts); + options.emplace_back(std::move(sort), size); + }; - // The dependencies are the globals referred to. - for (auto& global : wasm.globals) { - if (global->imported()) { - continue; - } - std::vector<Name> vec; - for (auto* get : FindAll<GlobalGet>(global->init).list) { - vec.push_back(get->name); - } - sort(vec); - deps[global->name] = std::move(vec); + // Compute the closest thing we can to the original, unoptimized sort, by + // setting all counts to 0 there, so it only takes into account dependencies + // and the original ordering and nothing else. 
+ // + // We put this sort first because if they all end up with equal size we + // prefer it (as it avoids pointless changes). + IndexCountMap zeroes(globals.size()); + addOption(zeroes); + + // A simple sort that uses the counts. As the algorithm in doSort() is + // greedy (see below), this is a pure greedy sort (at each point in time it + // picks the global with the highest count that it can). + addOption(counts); + + // We can make the sort less greedy by adding to each global's count some of + // the count of its children. Then we'd still be picking in a greedy manner + // but our choices would be influenced by the bigger picture of what can be + // unlocked by emitting a particular global (it may have a low count itself, + // but allow emitting something that depends on it that has a high count). + // We try two variations of this, one which is a simple sum (add the + // dependent's counts to the global's) and one which exponentially decreases + // them (that is, we add a small multiple of the dependent's counts, which + // may help as the dependents also depend on other things potentially, so a + // simple sum may be too naive). 
+ double const EXPONENTIAL_FACTOR = 0.095; + IndexCountMap sumCounts(globals.size()), exponentialCounts(globals.size()); + + struct Sort : public TopologicalSort<Index, Sort> { + const Dependencies& deps; + + Sort(Index numGlobals, const Dependencies& deps) : deps(deps) { + for (Index i = 0; i < numGlobals; i++) { + push(i); } } - void pushPredecessors(Name global) { - for (auto pred : deps[global]) { - push(pred); + void pushPredecessors(Index global) { + auto iter = deps.dependedUpon.find(global); + if (iter == deps.dependedUpon.end()) { + return; + } + for (auto dep : iter->second) { + push(dep); } } - }; + } sort(globals.size(), deps); - std::unordered_map<Name, Index> sortedIndexes; - for (auto global : DependencySort(*module, counts)) { - auto index = sortedIndexes.size(); - sortedIndexes[global] = index; + for (auto global : sort) { + // We can compute this global's count as in the sorted order all the + // values it cares about are resolved. Start with the self-count, then + // add the deps. + sumCounts[global] = exponentialCounts[global] = counts[global]; + for (auto dep : deps.dependedUpon[global]) { + sumCounts[global] += sumCounts[dep]; + exponentialCounts[global] += + EXPONENTIAL_FACTOR * exponentialCounts[dep]; + } } - std::sort( - module->globals.begin(), - module->globals.end(), - [&](const std::unique_ptr<Global>& a, const std::unique_ptr<Global>& b) { - return sortedIndexes[a->name] < sortedIndexes[b->name]; - }); + addOption(sumCounts); + addOption(exponentialCounts); + // Pick the best out of all the options we computed. + IndexIndexMap* best = nullptr; + double bestSize; + for (auto& [sort, size] : options) { + if (!best || size < bestSize) { + best = &sort; + bestSize = size; + } + } + + // Apply the indices we computed. 
+ std::vector<std::unique_ptr<Global>> old(std::move(globals)); + globals.resize(old.size()); + for (Index i = 0; i < old.size(); i++) { + globals[(*best)[i]] = std::move(old[i]); + } module->updateMaps(); } + + IndexIndexMap doSort(const IndexCountMap& counts, + const Dependencies& originalDeps, + Module* module) { + auto& globals = module->globals; + + // Copy the deps as we will operate on them as we go. + auto deps = originalDeps; + + // To sort the globals we do a simple greedy approach of always picking the + // global with the highest count at every point in time, subject to the + // constraint that we can only emit globals that have all of their + // dependencies already emitted. To do so we keep a list of the "available" + // globals, which are those with no remaining dependencies. Then by keeping + // the list of available globals in heap form we can simply pop the largest + // from the heap each time, and add new available ones as they become so. + // + // Other approaches here could be to do a topological sort, but the optimal + // order may not require strict ordering by topological depth, e.g.: + /* + // $c - $a + // / + // $e + // \ + // $d - $b + */ + // Here $e depends on $c and $d, $c depends on $a, and $d on $b. This is a + // partial order, as $d can be before or after $a, for example. As a result, + // if we sorted topologically by sub-trees here then we'd keep $c and $a + // together, and $d and $b, but a better order might interleave them. A good + // order also may not keep topological depths separated, e.g. we may want to + // put $a in between $c and $d despite it having a greater depth. + // + // The greedy approach here may also be unoptimal, however. Consider that we + // might see that the best available global is $a, but if we popped $b + // instead that could unlock $c which depends on $b, and $c may have a much + // higher use count than $a. For that reason we try several variations of + // this with different counts, see earlier. 
+ std::vector<Index> availableHeap; + + // Comparison function. Given a and b, returns if a should be before b. This + // is used in a heap, where "highest" means "popped first", so see the notes + // below on how we order. + auto cmp = [&](Index a, Index b) { + // Imports always go first. The binary writer takes care of this itself + // anyhow, but it is better to do it here in the IR so we can actually + // see what the final layout will be. + auto aImported = globals[a]->imported(); + auto bImported = globals[b]->imported(); + // The highest items will be popped first off the heap, so we want imports + // to be at higher indexes, that is, + // + // unimported, unimported, imported, imported. + // + // Then the imports are popped first. + if (aImported != bImported) { + return bImported; + } + + // Sort by the counts. We want higher counts at higher indexes so they are + // popped first, that is, + // + // 10, 20, 30, 40 + // + auto aCount = counts[a]; + auto bCount = counts[b]; + if (aCount != bCount) { + return aCount < bCount; + } + + // Break ties using the original order, which means just using the + // indices we have. We need lower indexes at the top so they are popped + // first, that is, + // + // 3, 2, 1, 0 + // + return a > b; + }; + + // Push an item that just became available to the available heap. + auto push = [&](Index global) { + availableHeap.push_back(global); + std::push_heap(availableHeap.begin(), availableHeap.end(), cmp); + }; + + // The initially available globals are those with no dependencies. + for (Index i = 0; i < globals.size(); i++) { + if (deps.dependsOn[i].empty()) { + push(i); + } + } + + // Pop off the heap: Emit the global and its final, sorted index. Keep + // doing that until we finish processing all the globals. 
+ IndexIndexMap sortedindices(globals.size()); + Index numSortedindices = 0; + while (!availableHeap.empty()) { + std::pop_heap(availableHeap.begin(), availableHeap.end(), cmp); + auto global = availableHeap.back(); + sortedindices[global] = numSortedindices++; + availableHeap.pop_back(); + + // Each time we pop we emit the global, which means anything that only + // depended on it becomes available to be popped as well. + for (auto other : deps.dependedUpon[global]) { + assert(deps.dependsOn[other].count(global)); + deps.dependsOn[other].erase(global); + if (deps.dependsOn[other].empty()) { + push(other); + } + } + } + + // All globals must have been handled. Cycles would prevent this, but they + // cannot exist in valid IR. + assert(numSortedindices == globals.size()); + + return sortedindices; + } + + // Given an indexing of the globals and the counts of how many times each is + // used, estimate the size of relevant parts of the wasm binary (that is, of + // LEBs in global.gets). + double computeSize(IndexIndexMap& indices, IndexCountMap& counts) { + // |indices| maps each old index to its new position in the sort. We need + // the reverse map here, which at index 0 has the old index of the global + // that will be first, and so forth. + IndexIndexMap actualOrder(indices.size()); + for (Index i = 0; i < indices.size(); i++) { + // Each global has a unique index, so we only replace 0's here, and they + // must be in bounds. + assert(indices[i] < indices.size()); + assert(actualOrder[indices[i]] == 0); + + actualOrder[indices[i]] = i; + } + + if (always) { + // In this mode we gradually increase the cost of later globals, in an + // unrealistic but smooth manner. 
+ double total = 0; + for (Index i = 0; i < actualOrder.size(); i++) { + // Multiply the count for this global by a smoothed LEB factor, which + // starts at 1 (for 1 byte) at index 0, and then increases linearly with + // i, so that after 128 globals we reach 2 (which is the true index at + // which the LEB size normally jumps from 1 to 2), and so forth. + total += counts[actualOrder[i]] * (1.0 + (i / 128.0)); + } + return total; + } + + // The total size we are computing. + double total = 0; + // Track the size in bits and the next index at which the size increases. At + // the first iteration we'll compute the size of the LEB for index 0, and so + // forth. + size_t sizeInBits = 0; + size_t nextSizeIncrease = 0; + for (Index i = 0; i < actualOrder.size(); i++) { + if (i == nextSizeIncrease) { + sizeInBits++; + // At the current size we have 7 * sizeInBits bits to use. For example, + // at index 0 the size is 1 and we'll compute 128 here, and indeed after + // emitting 128 globals (0,..,127) the 129th (at index 128) requires a + // larger LEB. + nextSizeIncrease = 1 << (7 * sizeInBits); + } + total += counts[actualOrder[i]] * sizeInBits; + } + return total; + } }; Pass* createReorderGlobalsPass() { return new ReorderGlobals(false); } |