Simplify ReorderGlobals using new topological sort utils (#6885)

Use the new TopologicalSort and MinTopologicalSortOf utilities instead of the old CRTP topological sort utility and a bespoke heap-based topological sort in ReorderGlobals. Since there is no longer a heap to pop from, the direction of the custom comparator is now much more intuitive. Further simplify the code by switching from tracking the new order of globals using a sequence of new indices to tracking the order using a sequence of old indices. This change also makes the pass about 20% faster on a large real-world module.
author: Thomas Lively <tlively@google.com> 2024-08-29 15:08:00 -0700
committer: GitHub <noreply@github.com> 2024-08-29 15:08:00 -0700
commit: 871ff0d4f910b565c15f82e8f3c9aa769b01d286 (patch)
tree: be231d3804086dbb4a335a2fe0334475a5937130 /src/passes/ReorderGlobals.cpp
parent: b63aeadb09a4450f55041dfb3fb7260807e91dfc (diff)
download: binaryen-871ff0d4f910b565c15f82e8f3c9aa769b01d286.tar.gz
binaryen-871ff0d4f910b565c15f82e8f3c9aa769b01d286.tar.bz2
binaryen-871ff0d4f910b565c15f82e8f3c9aa769b01d286.zip
1 file changed, 62 insertions, 141 deletions
diff --git a/src/passes/ReorderGlobals.cpp b/src/passes/ReorderGlobals.cpp
index 4602f3284..d17bb86cc 100644
--- a/src/passes/ReorderGlobals.cpp
+++ b/src/passes/ReorderGlobals.cpp
@@ -35,7 +35,7 @@
 
 #include "ir/find_all.h"
 #include "pass.h"
-#include "support/topological_sort.h"
+#include "support/topological_orders.h"
 #include "wasm.h"
 
 namespace wasm {
@@ -75,8 +75,8 @@ struct ReorderGlobals : public Pass {
 
   // For efficiency we will use global indices rather than names. That is, we
   // use the index of the global in the original ordering to identify each
-  // global. A different ordering is then a vector of new indices, saying where
-  // each one moves, which is logically a mapping between indices.
+  // global. A different ordering is then a vector of old indices, saying where
+  // each element comes from, which is logically a mapping between indices.
   using IndexIndexMap = std::vector<Index>;
 
   // We will also track counts of uses for each global. We use floating-point
@@ -190,26 +190,16 @@ struct ReorderGlobals : public Pass {
     double const EXPONENTIAL_FACTOR = 0.095;
     IndexCountMap sumCounts(globals.size()), exponentialCounts(globals.size());
 
-    struct Sort : public TopologicalSort<Index, Sort> {
-      const Dependencies& deps;
-
-      Sort(Index numGlobals, const Dependencies& deps) : deps(deps) {
-        for (Index i = 0; i < numGlobals; i++) {
-          push(i);
+    std::vector<std::vector<size_t>> dependenceGraph(globals.size());
+    for (size_t i = 0; i < globals.size(); ++i) {
+      if (auto it = deps.dependsOn.find(i); it != deps.dependsOn.end()) {
+        for (auto dep : it->second) {
+          dependenceGraph[i].push_back(dep);
         }
       }
+    }
 
-      void pushPredecessors(Index global) {
-        auto iter = deps.dependedUpon.find(global);
-        if (iter == deps.dependedUpon.end()) {
-          return;
-        }
-        for (auto dep : iter->second) {
-          push(dep);
-        }
-      }
-    } sort(globals.size(), deps);
-
+    auto sort = *TopologicalSort(dependenceGraph);
     for (auto global : sort) {
       // We can compute this global's count as in the sorted order all the
       // values it cares about are resolved. Start with the self-count, then
@@ -236,160 +226,91 @@ struct ReorderGlobals : public Pass {
     }
 
     // Apply the indices we computed.
-    std::vector<std::unique_ptr<Global>> old(std::move(globals));
+    auto old = std::move(globals);
     globals.resize(old.size());
     for (Index i = 0; i < old.size(); i++) {
-      globals[(*best)[i]] = std::move(old[i]);
+      globals[i] = std::move(old[(*best)[i]]);
     }
     module->updateMaps();
   }
 
   IndexIndexMap doSort(const IndexCountMap& counts,
-                       const Dependencies& originalDeps,
+                       const Dependencies& deps,
                        Module* module) {
     auto& globals = module->globals;
 
-    // Copy the deps as we will operate on them as we go.
-    auto deps = originalDeps;
-
     // To sort the globals we do a simple greedy approach of always picking the
     // global with the highest count at every point in time, subject to the
     // constraint that we can only emit globals that have all of their
-    // dependencies already emitted. To do so we keep a list of the "available"
-    // globals, which are those with no remaining dependencies. Then by keeping
-    // the list of available globals in heap form we can simply pop the largest
-    // from the heap each time, and add new available ones as they become so.
+    // dependencies already emitted.
     //
-    // Other approaches here could be to do a topological sort, but the optimal
-    // order may not require strict ordering by topological depth, e.g.:
-    /*
-    //     $c - $a
-    //    /
-    //  $e
-    //    \
-    //     $d - $b
-    */
-    // Here $e depends on $c and $d, $c depends on $a, and $d on $b. This is a
-    // partial order, as $d can be before or after $a, for example. As a result,
-    // if we sorted topologically by sub-trees here then we'd keep $c and $a
-    // together, and $d and $b, but a better order might interleave them. A good
-    // order also may not keep topological depths separated, e.g. we may want to
-    // put $a in between $c and $d despite it having a greater depth.
+    // The greedy approach here may also be suboptimal, however. Consider that
+    // we might see that the best available global is $a, but if we instead
+    // selected some other global $b, that would allow us to select a third
+    // global $c that depends on $b, and $c might have a much higher use count
+    // than $a. For that reason we try several variations of this with different
+    // counts, see earlier.
     //
-    // The greedy approach here may also be unoptimal, however. Consider that we
-    // might see that the best available global is $a, but if we popped $b
-    // instead that could unlock $c which depends on $b, and $c may have a much
-    // higher use count than $a. For that reason we try several variations of
-    // this with different counts, see earlier.
-    std::vector<Index> availableHeap;
-
-    // Comparison function. Given a and b, returns if a should be before b. This
-    // is used in a heap, where "highest" means "popped first", so see the notes
-    // below on how we order.
-    auto cmp = [&](Index a, Index b) {
-      // Imports always go first. The binary writer takes care of this itself
-      // anyhow, but it is better to do it here in the IR so we can actually
-      // see what the final layout will be.
-      auto aImported = globals[a]->imported();
-      auto bImported = globals[b]->imported();
-      // The highest items will be popped first off the heap, so we want imports
-      // to be at higher indexes, that is,
-      //
-      //  unimported, unimported, imported, imported.
-      //
-      // Then the imports are popped first.
-      if (aImported != bImported) {
-        return bImported;
-      }
-
-      // Sort by the counts. We want higher counts at higher indexes so they are
-      // popped first, that is,
-      //
-      //  10, 20, 30, 40
-      //
-      auto aCount = counts[a];
-      auto bCount = counts[b];
-      if (aCount != bCount) {
-        return aCount < bCount;
-      }
-
-      // Break ties using the original order, which means just using the
-      // indices we have. We need lower indexes at the top so they are popped
-      // first, that is,
-      //
-      //  3, 2, 1, 0
-      //
-      return a > b;
-    };
-
-    // Push an item that just became available to the available heap.
-    auto push = [&](Index global) {
-      availableHeap.push_back(global);
-      std::push_heap(availableHeap.begin(), availableHeap.end(), cmp);
-    };
-
-    // The initially available globals are those with no dependencies.
-    for (Index i = 0; i < globals.size(); i++) {
-      if (deps.dependsOn[i].empty()) {
-        push(i);
-      }
+    // Sort the globals into the optimal order based on the counts, ignoring
+    // dependencies for now.
+    std::vector<Index> sortedGlobals;
+    sortedGlobals.resize(globals.size());
+    for (Index i = 0; i < globals.size(); ++i) {
+      sortedGlobals[i] = i;
     }
+    std::sort(
+      sortedGlobals.begin(), sortedGlobals.end(), [&](Index a, Index b) {
+        // Imports always go first. The binary writer takes care of this itself
+        // anyhow, but it is better to do it here in the IR so we can actually
+        // see what the final layout will be.
+        auto aImported = globals[a]->imported();
+        auto bImported = globals[b]->imported();
+        if (aImported != bImported) {
+          return aImported;
+        }
 
-    // Pop off the heap: Emit the global and its final, sorted index. Keep
-    // doing that until we finish processing all the globals.
-    IndexIndexMap sortedindices(globals.size());
-    Index numSortedindices = 0;
-    while (!availableHeap.empty()) {
-      std::pop_heap(availableHeap.begin(), availableHeap.end(), cmp);
-      auto global = availableHeap.back();
-      sortedindices[global] = numSortedindices++;
-      availableHeap.pop_back();
-
-      // Each time we pop we emit the global, which means anything that only
-      // depended on it becomes available to be popped as well.
-      for (auto other : deps.dependedUpon[global]) {
-        assert(deps.dependsOn[other].count(global));
-        deps.dependsOn[other].erase(global);
-        if (deps.dependsOn[other].empty()) {
-          push(other);
+        // Sort by the counts. Higher counts come first.
+        auto aCount = counts[a];
+        auto bCount = counts[b];
+        if (aCount != bCount) {
+          return aCount > bCount;
         }
+
+        // Break ties using the original order, which means just using the
+        // indices we have.
+        return a < b;
+      });
+
+    // Now use that optimal order to create an ordered graph that includes the
+    // dependencies. The final order will be the minimum topological sort of
+    // this graph.
+    std::vector<std::pair<Index, std::vector<Index>>> graph;
+    graph.reserve(globals.size());
+    for (auto i : sortedGlobals) {
+      std::vector<Index> children;
+      if (auto it = deps.dependedUpon.find(i); it != deps.dependedUpon.end()) {
+        children = std::vector<Index>(it->second.begin(), it->second.end());
       }
+      graph.emplace_back(i, std::move(children));
     }
 
-    // All globals must have been handled. Cycles would prevent this, but they
-    // cannot exist in valid IR.
-    assert(numSortedindices == globals.size());
-
-    return sortedindices;
+    return *MinTopologicalSortOf<Index>(graph.begin(), graph.end());
   }
 
   // Given an indexing of the globals and the counts of how many times each is
   // used, estimate the size of relevant parts of the wasm binary (that is, of
   // LEBs in global.gets).
   double computeSize(IndexIndexMap& indices, IndexCountMap& counts) {
-    // |indices| maps each old index to its new position in the sort. We need
-    // the reverse map here, which at index 0 has the old index of the global
-    // that will be first, and so forth.
-    IndexIndexMap actualOrder(indices.size());
-    for (Index i = 0; i < indices.size(); i++) {
-      // Each global has a unique index, so we only replace 0's here, and they
-      // must be in bounds.
-      assert(indices[i] < indices.size());
-      assert(actualOrder[indices[i]] == 0);
-
-      actualOrder[indices[i]] = i;
-    }
-
     if (always) {
       // In this mode we gradually increase the cost of later globals, in an
       // unrealistic but smooth manner.
       double total = 0;
-      for (Index i = 0; i < actualOrder.size(); i++) {
+      for (Index i = 0; i < indices.size(); i++) {
         // Multiply the count for this global by a smoothed LEB factor, which
         // starts at 1 (for 1 byte) at index 0, and then increases linearly with
         // i, so that after 128 globals we reach 2 (which is the true index at
         // which the LEB size normally jumps from 1 to 2), and so forth.
-        total += counts[actualOrder[i]] * (1.0 + (i / 128.0));
+        total += counts[indices[i]] * (1.0 + (i / 128.0));
       }
       return total;
     }
@@ -401,7 +322,7 @@ struct ReorderGlobals : public Pass {
     // forth.
     size_t sizeInBits = 0;
     size_t nextSizeIncrease = 0;
-    for (Index i = 0; i < actualOrder.size(); i++) {
+    for (Index i = 0; i < indices.size(); i++) {
       if (i == nextSizeIncrease) {
         sizeInBits++;
         // At the current size we have 7 * sizeInBits bits to use.  For example,
@@ -410,7 +331,7 @@ struct ReorderGlobals : public Pass {
         // larger LEB.
         nextSizeIncrease = 1 << (7 * sizeInBits);
       }
-      total += counts[actualOrder[i]] * sizeInBits;
+      total += counts[indices[i]] * sizeInBits;
     }
     return total;
   }
author	Thomas Lively <tlively@google.com>	2024-08-29 15:08:00 -0700
committer	GitHub <noreply@github.com>	2024-08-29 15:08:00 -0700
commit	871ff0d4f910b565c15f82e8f3c9aa769b01d286 (patch)
tree	be231d3804086dbb4a335a2fe0334475a5937130 /src/passes/ReorderGlobals.cpp
parent	b63aeadb09a4450f55041dfb3fb7260807e91dfc (diff)
download	binaryen-871ff0d4f910b565c15f82e8f3c9aa769b01d286.tar.gz binaryen-871ff0d4f910b565c15f82e8f3c9aa769b01d286.tar.bz2 binaryen-871ff0d4f910b565c15f82e8f3c9aa769b01d286.zip