diff options
author | Thomas Lively <7121787+tlively@users.noreply.github.com> | 2021-02-26 11:03:10 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-02-26 11:03:10 -0800 |
commit | cafe85d6d8d5a7497ebf40b8474052be0885e58f (patch) | |
tree | c38932e394bff7b2ad87660a55e985a76c4ace0b /src | |
parent | b3f67918fe34e7de76d1f5565bd3f07f4f7eb12f (diff) | |
download | binaryen-cafe85d6d8d5a7497ebf40b8474052be0885e58f.tar.gz binaryen-cafe85d6d8d5a7497ebf40b8474052be0885e58f.tar.bz2 binaryen-cafe85d6d8d5a7497ebf40b8474052be0885e58f.zip |
Use Tarjan's SCC alg. to find self-referential types (#3621)
Previously we computed the fixed point of the parent-child relation to identify
self-referential HeapTypes in the TypeBuilder canonicalizer. That algorithm was
O(|V|^3) in the worst case and took over five seconds to find the
self-referential HeapTypes in an example program with just 1134 HeapTypes,
probably due to high allocation traffic from the std::unordered_map and
std::unordered_sets used to implement the parent-child graph's adjacency list.
This PR replaces that algorithm with Tarjan's strongly connected component
algorithm, which runs in O(|V|+|E|) and finds the self-referential HeapTypes in
the mentioned example program in under 30 ms. All strongly connected components
of more than one element in the HeapType parent-child graph correspond to sets
of mutually recursive HeapTypes that are therefore self-referential. The only
other self-referential HeapTypes are those that are not mutually recursive with
any other HeapTypes, but these are trivial to find because they must be their
own direct children.
Diffstat (limited to 'src')
-rw-r--r-- | src/wasm/wasm-type.cpp | 120 |
1 files changed, 90 insertions, 30 deletions
diff --git a/src/wasm/wasm-type.cpp b/src/wasm/wasm-type.cpp index a2132d09a..1000e8b92 100644 --- a/src/wasm/wasm-type.cpp +++ b/src/wasm/wasm-type.cpp @@ -1301,7 +1301,7 @@ struct Canonicalizer { std::unordered_map<HeapType, std::vector<HeapType*>> heapTypeLocations; // These heap types will not participate in canonicalization. - std::unordered_set<TypeID> selfReferentialHeapTypes; + std::unordered_set<HeapType> selfReferentialHeapTypes; // Maps Types and HeapTypes backed by the TypeBuilder's Stores to globally // canonical Types and HeapTypes. @@ -1381,7 +1381,7 @@ Canonicalizer::Canonicalizer(TypeBuilder& builder) : builder(builder) { // self-referential HeapTypes to the global store so that they will outlive // the TypeBuilder without their IDs changing. for (auto& entry : builder.impl->entries) { - if (selfReferentialHeapTypes.count(entry.get().getID())) { + if (selfReferentialHeapTypes.count(entry.get())) { noncanonicalHeapTypes.emplace_back(std::move(entry.info)); } } @@ -1445,33 +1445,93 @@ void Canonicalizer::scanType(Type* type) { } void Canonicalizer::findSelfReferentialHeapTypes() { - // Calculate the fixed point of the reachability relation. - auto fixedPoint = children; - bool changed; - do { - changed = false; - for (auto& entry : fixedPoint) { - auto& succs = entry.second; - std::unordered_set<TypeID> newSuccs; - for (auto& other : succs) { - auto& otherSuccs = fixedPoint[other]; - newSuccs.insert(otherSuccs.begin(), otherSuccs.end()); + // Use Tarjan's strongly connected components algorithm on the parent-child + // graph to find self-referential types in O(|V|+|E|) time. Each HeapType in a + // strongly connected component with multiple elements must be + // self-referential because it is mutually recursive with all other HeapTypes + // in that strongly connected component. HeapTypes in strongly connected + // components of size one may also be self-referential, but it is trivial to + // find these because they must be their own direct children. See + // https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm. + + // Get the HeapType children of a HeapType, skipping all intermediate Types. + auto getChildren = [&](HeapType type) { + std::unordered_set<HeapType> results; + std::function<void(TypeID, bool)> visit = [&](TypeID id, bool isRoot) { + if (isRoot || typeLocations.count(Type(id))) { + auto it = children.find(id); + if (it != children.end()) { + for (TypeID child : it->second) { + visit(child, false); + } + } + } else { + results.insert(HeapType(id)); + } + }; + visit(type.getID(), true); + return results; + }; + + // The Tarjan's algorithm stack. Similar to a DFS stack, but elements are only + // popped off once they are committed to a strongly connected component. + // HeapTypes stay on the stack after they are visited iff they have a back + // edge to a HeapType earlier in the stack. + std::vector<HeapType> stack; + std::unordered_set<HeapType> stackElems; + // Indices assigned to each HeapType in the order they are visited. + std::unordered_map<HeapType, size_t> indices; + // The smallest index of the HeapTypes reachable from each HeapType through + // its subtree. + std::unordered_map<HeapType, size_t> minReachable; + + std::function<void(HeapType)> visit = [&](HeapType curr) { + size_t index = indices.size(); + indices[curr] = index; + minReachable[curr] = index; + stack.push_back(curr); + stackElems.insert(curr); + + for (HeapType child : getChildren(curr)) { + if (!indices.count(child)) { + // Child has not been visited yet; visit it. + visit(child); + minReachable[curr] = std::min(minReachable[curr], minReachable[child]); + } else if (stackElems.count(child)) { + // Child was already visited but not committed to a strongly connected + // component. + minReachable[curr] = std::min(minReachable[curr], indices[child]); } - size_t oldSize = succs.size(); - succs.insert(newSuccs.begin(), newSuccs.end()); - if (succs.size() != oldSize) { - changed = true; + // We cannot differentiate self-referential types with strongly connected + // component size one from non-self-referential types just by looking at + // the strongly connected components. If a type is directly + // self-referential, mark it here so we don't miss it later. + if (child == curr) { + selfReferentialHeapTypes.insert(child); } } - } while (changed); - // Find HeapTypes that reach themselves - for (auto& entry : builder.impl->entries) { - TypeID id = entry.get().getID(); - auto it = fixedPoint.find(id); - if (it != fixedPoint.end() && it->second.count(id)) { - selfReferentialHeapTypes.insert(id); + if (minReachable[curr] == indices[curr]) { + // Curr doesn't have any back edges to an earlier element, so it is the + // root of the strongly connected component including itself and anything + // after it still on the stack. If this strongly connected component has + // more than one element, they are all self referential HeapTypes. + // Self-referential types with SCC size one were already accounted for. + if (stack.back() != curr) { + selfReferentialHeapTypes.insert(curr); + while (stack.back() != curr) { + selfReferentialHeapTypes.insert(stack.back()); + stackElems.erase(stack.back()); + stack.pop_back(); + } + } + stack.pop_back(); + stackElems.erase(curr); } + }; + + for (auto& entry : builder.impl->entries) { + visit(entry.get()); } } @@ -1497,19 +1557,19 @@ std::vector<Canonicalizer::Item> Canonicalizer::getOrderedItems() { // Remove self-referential HeapTypes to cut cycles. auto childrenDAG = children; - for (TypeID id : selfReferentialHeapTypes) { - childrenDAG.erase(id); + for (HeapType type : selfReferentialHeapTypes) { + childrenDAG.erase(type.getID()); for (auto& kv : childrenDAG) { - kv.second.erase(id); + kv.second.erase(type.getID()); } } // Collect the remaining types that will be sorted. std::unordered_set<TypeID> toSort; for (auto& entry : builder.impl->entries) { - TypeID id = entry.get().getID(); - if (!selfReferentialHeapTypes.count(id)) { - toSort.insert(id); + HeapType curr = entry.get(); + if (!selfReferentialHeapTypes.count(curr)) { + toSort.insert(curr.getID()); } } for (auto& kv : childrenDAG) { |