Use Tarjan's SCC alg. to find self-referential types (#3621)

Previously we computed the fixed point of the parent-child relation to identify self-referential HeapTypes in the TypeBuilder canonicalizer. That algorithm was O(|V|^3) in the worst case and took over five seconds to find the self-referential HeapTypes in an example program with just 1134 HeapTypes, probably due to high allocation traffic from the std::unordered_map and std::unordered_sets used to implement the parent-child graph's adjacency list. This PR replaces that algorithm with Tarjan's strongly connected component algorithm, which runs in O(|V|+|E|) and finds the self-referential HeapTypes in the mentioned example program in under 30 ms. All strongly connected components of more than one element in the HeapType parent-child graph correspond to sets of mutually recursive HeapTypes that are therefore self-referential. The only other self-referential HeapTypes are those that are not mutually recursive with any other HeapTypes, but these are trivial to find because they must be their own direct children.
author: Thomas Lively <7121787+tlively@users.noreply.github.com> 2021-02-26 11:03:10 -0800
committer: GitHub <noreply@github.com> 2021-02-26 11:03:10 -0800
commit: cafe85d6d8d5a7497ebf40b8474052be0885e58f (patch)
tree: c38932e394bff7b2ad87660a55e985a76c4ace0b /src
parent: b3f67918fe34e7de76d1f5565bd3f07f4f7eb12f (diff)
download: binaryen-cafe85d6d8d5a7497ebf40b8474052be0885e58f.tar.gz
binaryen-cafe85d6d8d5a7497ebf40b8474052be0885e58f.tar.bz2
binaryen-cafe85d6d8d5a7497ebf40b8474052be0885e58f.zip
1 files changed, 90 insertions, 30 deletions
diff --git a/src/wasm/wasm-type.cpp b/src/wasm/wasm-type.cpp
index a2132d09a..1000e8b92 100644
--- a/src/wasm/wasm-type.cpp
+++ b/src/wasm/wasm-type.cpp
@@ -1301,7 +1301,7 @@ struct Canonicalizer {
   std::unordered_map<HeapType, std::vector<HeapType*>> heapTypeLocations;
 
   // These heap types will not participate in canonicalization.
-  std::unordered_set<TypeID> selfReferentialHeapTypes;
+  std::unordered_set<HeapType> selfReferentialHeapTypes;
 
   // Maps Types and HeapTypes backed by the TypeBuilder's Stores to globally
   // canonical Types and HeapTypes.
@@ -1381,7 +1381,7 @@ Canonicalizer::Canonicalizer(TypeBuilder& builder) : builder(builder) {
   // self-referential HeapTypes to the global store so that they will outlive
   // the TypeBuilder without their IDs changing.
   for (auto& entry : builder.impl->entries) {
-    if (selfReferentialHeapTypes.count(entry.get().getID())) {
+    if (selfReferentialHeapTypes.count(entry.get())) {
       noncanonicalHeapTypes.emplace_back(std::move(entry.info));
     }
   }
@@ -1445,33 +1445,93 @@ void Canonicalizer::scanType(Type* type) {
 }
 
 void Canonicalizer::findSelfReferentialHeapTypes() {
-  // Calculate the fixed point of the reachability relation.
-  auto fixedPoint = children;
-  bool changed;
-  do {
-    changed = false;
-    for (auto& entry : fixedPoint) {
-      auto& succs = entry.second;
-      std::unordered_set<TypeID> newSuccs;
-      for (auto& other : succs) {
-        auto& otherSuccs = fixedPoint[other];
-        newSuccs.insert(otherSuccs.begin(), otherSuccs.end());
+  // Use Tarjan's strongly connected components algorithm on the parent-child
+  // graph to find self-referential types in O(|V|+|E|) time. Each HeapType in a
+  // strongly connected component with multiple elements must be
+  // self-referential because it is mutually recursive with all other HeapTypes
+  // in that strongly connected component. HeapTypes in strongly connected
+  // components of size one may also be self-referential, but it is trivial to
+  // find these because they must be their own direct children. See
+  // https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm.
+
+  // Get the HeapType children of a HeapType, skipping all intermediate Types.
+  auto getChildren = [&](HeapType type) {
+    std::unordered_set<HeapType> results;
+    std::function<void(TypeID, bool)> visit = [&](TypeID id, bool isRoot) {
+      if (isRoot || typeLocations.count(Type(id))) {
+        auto it = children.find(id);
+        if (it != children.end()) {
+          for (TypeID child : it->second) {
+            visit(child, false);
+          }
+        }
+      } else {
+        results.insert(HeapType(id));
+      }
+    };
+    visit(type.getID(), true);
+    return results;
+  };
+
+  // The Tarjan's algorithm stack. Similar to a DFS stack, but elements are only
+  // popped off once they are committed to a strongly connected component.
+  // HeapTypes stay on the stack after they are visited iff they have a back
+  // edge to a HeapType earlier in the stack.
+  std::vector<HeapType> stack;
+  std::unordered_set<HeapType> stackElems;
+  // Indices assigned to each HeapType in the order they are visited.
+  std::unordered_map<HeapType, size_t> indices;
+  // The smallest index of the HeapTypes reachable from each HeapType through
+  // its subtree.
+  std::unordered_map<HeapType, size_t> minReachable;
+
+  std::function<void(HeapType)> visit = [&](HeapType curr) {
+    size_t index = indices.size();
+    indices[curr] = index;
+    minReachable[curr] = index;
+    stack.push_back(curr);
+    stackElems.insert(curr);
+
+    for (HeapType child : getChildren(curr)) {
+      if (!indices.count(child)) {
+        // Child has not been visited yet; visit it.
+        visit(child);
+        minReachable[curr] = std::min(minReachable[curr], minReachable[child]);
+      } else if (stackElems.count(child)) {
+        // Child was already visited but not committed to a strongly connected
+        // component.
+        minReachable[curr] = std::min(minReachable[curr], indices[child]);
       }
-      size_t oldSize = succs.size();
-      succs.insert(newSuccs.begin(), newSuccs.end());
-      if (succs.size() != oldSize) {
-        changed = true;
+      // We cannot differentiate self-referential types with strongly connected
+      // component size one from non-self-referential types just by looking at
+      // the strongly connected components. If a type is directly
+      // self-referential, mark it here so we don't miss it later.
+      if (child == curr) {
+        selfReferentialHeapTypes.insert(child);
       }
     }
-  } while (changed);
 
-  // Find HeapTypes that reach themselves
-  for (auto& entry : builder.impl->entries) {
-    TypeID id = entry.get().getID();
-    auto it = fixedPoint.find(id);
-    if (it != fixedPoint.end() && it->second.count(id)) {
-      selfReferentialHeapTypes.insert(id);
+    if (minReachable[curr] == indices[curr]) {
+      // Curr doesn't have any back edges to an earlier element, so it is the
+      // root of the strongly connected component including itself and anything
+      // after it still on the stack. If this strongly connected component has
+      // more than one element, they are all self referential HeapTypes.
+      // Self-referential types with SCC size one were already accounted for.
+      if (stack.back() != curr) {
+        selfReferentialHeapTypes.insert(curr);
+        while (stack.back() != curr) {
+          selfReferentialHeapTypes.insert(stack.back());
+          stackElems.erase(stack.back());
+          stack.pop_back();
+        }
+      }
+      stack.pop_back();
+      stackElems.erase(curr);
     }
+  };
+
+  for (auto& entry : builder.impl->entries) {
+    visit(entry.get());
   }
 }
 
@@ -1497,19 +1557,19 @@ std::vector<Canonicalizer::Item> Canonicalizer::getOrderedItems() {
 
   // Remove self-referential HeapTypes to cut cycles.
   auto childrenDAG = children;
-  for (TypeID id : selfReferentialHeapTypes) {
-    childrenDAG.erase(id);
+  for (HeapType type : selfReferentialHeapTypes) {
+    childrenDAG.erase(type.getID());
     for (auto& kv : childrenDAG) {
-      kv.second.erase(id);
+      kv.second.erase(type.getID());
     }
   }
 
   // Collect the remaining types that will be sorted.
   std::unordered_set<TypeID> toSort;
   for (auto& entry : builder.impl->entries) {
-    TypeID id = entry.get().getID();
-    if (!selfReferentialHeapTypes.count(id)) {
-      toSort.insert(id);
+    HeapType curr = entry.get();
+    if (!selfReferentialHeapTypes.count(curr)) {
+      toSort.insert(curr.getID());
     }
   }
   for (auto& kv : childrenDAG) {
author	Thomas Lively <7121787+tlively@users.noreply.github.com>	2021-02-26 11:03:10 -0800
committer	GitHub <noreply@github.com>	2021-02-26 11:03:10 -0800
commit	cafe85d6d8d5a7497ebf40b8474052be0885e58f (patch)
tree	c38932e394bff7b2ad87660a55e985a76c4ace0b /src
parent	b3f67918fe34e7de76d1f5565bd3f07f4f7eb12f (diff)
download	binaryen-cafe85d6d8d5a7497ebf40b8474052be0885e58f.tar.gz binaryen-cafe85d6d8d5a7497ebf40b8474052be0885e58f.tar.bz2 binaryen-cafe85d6d8d5a7497ebf40b8474052be0885e58f.zip