diff options
author | Ashley Nelson <nashley@google.com> | 2023-07-17 19:48:29 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-18 02:48:29 +0000 |
commit | 159750d0fccd7ee453ac2f8569128e0ea94ba8a5 (patch) | |
tree | d2958b6099ef67bb755141a1d870b380ce5fa6a9 /src | |
parent | f96fcb0e0c15299045b828447e65754727eeab57 (diff) | |
download | binaryen-159750d0fccd7ee453ac2f8569128e0ea94ba8a5.tar.gz binaryen-159750d0fccd7ee453ac2f8569128e0ea94ba8a5.tar.bz2 binaryen-159750d0fccd7ee453ac2f8569128e0ea94ba8a5.zip |
[Outlining] LLVM Suffix Tree (#5821)
This PR adds LLVM's suffix tree data structure to Binaryen. This suffix tree is implemented using Ukkonen's algorithm for linear-time suffix tree construction, and is intended for fast substring queries.
Note: All of the .h and .cpp files included are from LLVM. These files were copied directly instead of imported into our existing LLVM integration (in third_party/) to avoid bumping the commit hash and avoid the potential for complications with upstream changes.
Diffstat (limited to 'src')
-rw-r--r-- | src/support/CMakeLists.txt | 15 | ||||
-rw-r--r-- | src/support/suffix_tree.cpp | 305 | ||||
-rw-r--r-- | src/support/suffix_tree.h | 211 | ||||
-rw-r--r-- | src/support/suffix_tree_node.cpp | 45 | ||||
-rw-r--r-- | src/support/suffix_tree_node.h | 179 |
5 files changed, 754 insertions, 1 deletions
diff --git a/src/support/CMakeLists.txt b/src/support/CMakeLists.txt index 2758ee198..87f835c7b 100644 --- a/src/support/CMakeLists.txt +++ b/src/support/CMakeLists.txt @@ -14,4 +14,17 @@ set(support_SOURCES utilities.cpp ${support_HEADERS} ) -add_library(support OBJECT ${support_SOURCES}) + +# The below condition is intended for removal once the suffix_tree and +# suffix_tree_node source files no longer depend on LLVM code in the +# third_party folder +if(EMSCRIPTEN) + add_library(support OBJECT ${support_SOURCES}) +else() + set(support_with_suffix_tree_SOURCES + suffix_tree.cpp + suffix_tree_node.cpp + ${support_SOURCES} + ) + add_library(support OBJECT ${support_with_suffix_tree_SOURCES}) +endif() diff --git a/src/support/suffix_tree.cpp b/src/support/suffix_tree.cpp new file mode 100644 index 000000000..a66bcde38 --- /dev/null +++ b/src/support/suffix_tree.cpp @@ -0,0 +1,305 @@ +// This file began as an import from LLVM, and so it has the same license as +// LLVM, copied below together with the code. + +//===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the Suffix Tree class. +// +//===----------------------------------------------------------------------===// + +#include "support/suffix_tree.h" +#include "support/suffix_tree_node.h" +#include "llvm/Support/Casting.h" + +using namespace llvm; + +namespace wasm { + +/// \returns the number of elements in the substring associated with \p N. +static size_t numElementsInSubstring(const SuffixTreeNode* N) { + assert(N && "Got a null node?"); + if (auto* Internal = dyn_cast<SuffixTreeInternalNode>(N)) { + if (Internal->isRoot()) { + return 0; + } + } + return N->getEndIdx() - N->getStartIdx() + 1; +} + +SuffixTree::SuffixTree(const std::vector<unsigned>& Str) : Str(Str) { + Root = insertRoot(); + Active.Node = Root; + + // Keep track of the number of suffixes we have to add of the current + // prefix. + unsigned SuffixesToAdd = 0; + + // Construct the suffix tree iteratively on each prefix of the string. + // PfxEndIdx is the end index of the current prefix. + // End is one past the last element in the string. + for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) { + SuffixesToAdd++; + LeafEndIdx = PfxEndIdx; // Extend each of the leaves. + SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd); + } + + // Set the suffix indices of each leaf. + assert(Root && "Root node can't be nullptr!"); + setSuffixIndices(); +} + +SuffixTreeNode* SuffixTree::insertLeaf(SuffixTreeInternalNode& Parent, + unsigned StartIdx, + unsigned Edge) { + assert(StartIdx <= LeafEndIdx && "String can't start after it ends!"); + auto* N = new (LeafNodeAllocator.Allocate()) + SuffixTreeLeafNode(StartIdx, &LeafEndIdx); + Parent.Children[Edge] = N; + return N; +} + +SuffixTreeInternalNode* +SuffixTree::insertInternalNode(SuffixTreeInternalNode* Parent, + unsigned StartIdx, + unsigned EndIdx, + unsigned Edge) { + assert(StartIdx <= EndIdx && "String can't start after it ends!"); + assert(!(!Parent && StartIdx != SuffixTreeNode::EmptyIdx) && + "Non-root internal nodes must have parents!"); + auto* N = new (InternalNodeAllocator.Allocate()) + SuffixTreeInternalNode(StartIdx, EndIdx, Root); + if (Parent) { + Parent->Children[Edge] = N; + } + return N; +} + +SuffixTreeInternalNode* SuffixTree::insertRoot() { + return insertInternalNode(/*Parent = */ nullptr, + SuffixTreeNode::EmptyIdx, + SuffixTreeNode::EmptyIdx, + /*Edge = */ 0); +} + +void SuffixTree::setSuffixIndices() { + // List of nodes we need to visit along with the current length of the + // string. + std::vector<std::pair<SuffixTreeNode*, unsigned>> ToVisit; + + // Current node being visited. + SuffixTreeNode* CurrNode = Root; + + // Sum of the lengths of the nodes down the path to the current one. + unsigned CurrNodeLen = 0; + ToVisit.push_back({CurrNode, CurrNodeLen}); + while (!ToVisit.empty()) { + std::tie(CurrNode, CurrNodeLen) = ToVisit.back(); + ToVisit.pop_back(); + // Length of the current node from the root down to here. + CurrNode->setConcatLen(CurrNodeLen); + if (auto* InternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode)) { + for (auto& ChildPair : InternalNode->Children) { + assert(ChildPair.second && "Node had a null child!"); + ToVisit.push_back( + {ChildPair.second, + CurrNodeLen + numElementsInSubstring(ChildPair.second)}); + } + } + // No children, so we are at the end of the string. + if (auto* LeafNode = dyn_cast<SuffixTreeLeafNode>(CurrNode)) { + LeafNode->setSuffixIdx(Str.size() - CurrNodeLen); + } + } +} + +unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) { + SuffixTreeInternalNode* NeedsLink = nullptr; + + while (SuffixesToAdd > 0) { + + // Are we waiting to add anything other than just the last character? + if (Active.Len == 0) { + // If not, then say the active index is the end index. + Active.Idx = EndIdx; + } + + assert(Active.Idx <= EndIdx && "Start index can't be after end index!"); + + // The first character in the current substring we're looking at. + unsigned FirstChar = Str[Active.Idx]; + + // Have we inserted anything starting with FirstChar at the current node? + if (Active.Node->Children.count(FirstChar) == 0) { + // If not, then we can just insert a leaf and move to the next step. + insertLeaf(*Active.Node, EndIdx, FirstChar); + + // The active node is an internal node, and we visited it, so it must + // need a link if it doesn't have one. + if (NeedsLink) { + NeedsLink->setLink(Active.Node); + NeedsLink = nullptr; + } + } else { + // There's a match with FirstChar, so look for the point in the tree to + // insert a new node. + SuffixTreeNode* NextNode = Active.Node->Children[FirstChar]; + + unsigned SubstringLen = numElementsInSubstring(NextNode); + + // Is the current suffix we're trying to insert longer than the size of + // the child we want to move to? + if (Active.Len >= SubstringLen) { + // If yes, then consume the characters we've seen and move to the next + // node. + // TODO: Enable the below assert + // assert(isa<SuffixTreeInternalNode>(NextNode) && + // "Expected an internal node?"); + Active.Idx += SubstringLen; + Active.Len -= SubstringLen; + Active.Node = cast<SuffixTreeInternalNode>(NextNode); + continue; + } + + // Otherwise, the suffix we're trying to insert must be contained in the + // next node we want to move to. + unsigned LastChar = Str[EndIdx]; + + // Is the string we're trying to insert a substring of the next node? + if (Str[NextNode->getStartIdx() + Active.Len] == LastChar) { + // If yes, then we're done for this step. Remember our insertion point + // and move to the next end index. At this point, we have an implicit + // suffix tree. + if (NeedsLink && !Active.Node->isRoot()) { + NeedsLink->setLink(Active.Node); + NeedsLink = nullptr; + } + + Active.Len++; + break; + } + + // The string we're trying to insert isn't a substring of the next node, + // but matches up to a point. Split the node. + // + // For example, say we ended our search at a node n and we're trying to + // insert ABD. Then we'll create a new node s for AB, reduce n to just + // representing C, and insert a new leaf node l to represent d. This + // allows us to ensure that if n was a leaf, it remains a leaf. + // + // | ABC ---split---> | AB + // n s + // C / \ D + // n l + + // The node s from the diagram + SuffixTreeInternalNode* SplitNode = + insertInternalNode(Active.Node, + NextNode->getStartIdx(), + NextNode->getStartIdx() + Active.Len - 1, + FirstChar); + + // Insert the new node representing the new substring into the tree as + // a child of the split node. This is the node l from the diagram. + insertLeaf(*SplitNode, EndIdx, LastChar); + + // Make the old node a child of the split node and update its start + // index. This is the node n from the diagram. + NextNode->incrementStartIdx(Active.Len); + SplitNode->Children[Str[NextNode->getStartIdx()]] = NextNode; + + // SplitNode is an internal node, update the suffix link. + if (NeedsLink) { + NeedsLink->setLink(SplitNode); + } + + NeedsLink = SplitNode; + } + + // We've added something new to the tree, so there's one less suffix to + // add. + SuffixesToAdd--; + + if (Active.Node->isRoot()) { + if (Active.Len > 0) { + Active.Len--; + Active.Idx = EndIdx - SuffixesToAdd + 1; + } + } else { + // Start the next phase at the next smallest suffix. + Active.Node = Active.Node->getLink(); + } + } + + return SuffixesToAdd; +} + +void SuffixTree::RepeatedSubstringIterator::advance() { + // Clear the current state. If we're at the end of the range, then this + // is the state we want to be in. + RS = RepeatedSubstring(); + N = nullptr; + + // Each leaf node represents a repeat of a string. + std::vector<unsigned> RepeatedSubstringStarts; + + // Continue visiting nodes until we find one which repeats more than once. + while (!InternalNodesToVisit.empty()) { + RepeatedSubstringStarts.clear(); + auto* Curr = InternalNodesToVisit.back(); + InternalNodesToVisit.pop_back(); + + // Keep track of the length of the string associated with the node. If + // it's too short, we'll quit. + unsigned Length = Curr->getConcatLen(); + + // Iterate over each child, saving internal nodes for visiting, and + // leaf nodes in LeafChildren. Internal nodes represent individual + // strings, which may repeat. + for (auto& ChildPair : Curr->Children) { + // Save all of this node's children for processing. + if (auto* InternalChild = + dyn_cast<SuffixTreeInternalNode>(ChildPair.second)) { + InternalNodesToVisit.push_back(InternalChild); + continue; + } + + if (Length < MinLength) { + continue; + } + + // Have an occurrence of a potentially repeated string. Save it. + auto* Leaf = cast<SuffixTreeLeafNode>(ChildPair.second); + RepeatedSubstringStarts.push_back(Leaf->getSuffixIdx()); + } + + // The root never represents a repeated substring. If we're looking at + // that, then skip it. + if (Curr->isRoot()) { + continue; + } + + // Do we have any repeated substrings? + if (RepeatedSubstringStarts.size() < 2) { + continue; + } + + // Yes. Update the state to reflect this, and then bail out. + N = Curr; + RS.Length = Length; + for (unsigned StartIdx : RepeatedSubstringStarts) { + RS.StartIndices.push_back(StartIdx); + } + break; + } + // At this point, either NewRS is an empty RepeatedSubstring, or it was + // set in the above loop. Similarly, N is either nullptr, or the node + // associated with NewRS. +} + +} // namespace wasm diff --git a/src/support/suffix_tree.h b/src/support/suffix_tree.h new file mode 100644 index 000000000..474984dcd --- /dev/null +++ b/src/support/suffix_tree.h @@ -0,0 +1,211 @@ +// This file began as an import from LLVM, and so it has the same license as +// LLVM, copied below together with the code. + +//===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// A data structure for fast substring queries. +// +// Suffix trees represent the suffixes of their input strings in their leaves. +// A suffix tree is a type of compressed trie structure where each node +// represents an entire substring rather than a single character. Each leaf +// of the tree is a suffix. +// +// A suffix tree can be seen as a type of state machine where each state is a +// substring of the full string. The tree is structured so that, for a string +// of length N, there are exactly N leaves in the tree. This structure allows +// us to quickly find repeated substrings of the input string. +// +// In this implementation, a "string" is a vector of unsigned integers. +// These integers may result from hashing some data type. A suffix tree can +// contain 1 or many strings, which can then be queried as one large string. +// +// The suffix tree is implemented using Ukkonen's algorithm for linear-time +// suffix tree construction. Ukkonen's algorithm is explained in more detail +// in the paper by Esko Ukkonen "On-line construction of suffix trees. The +// paper is available at +// +// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf +//===----------------------------------------------------------------------===// + +#ifndef wasm_support_suffix_tree_h +#define wasm_support_suffix_tree_h + +#include "llvm/Support/Allocator.h" +#include <cassert> +#include <cstddef> +#include <vector> + +#include "support/suffix_tree_node.h" + +using namespace llvm; + +namespace wasm { +class SuffixTree { +public: + /// Each element is an integer representing an instruction in the module. + // TODO: This was an ArrayRef originally + std::vector<unsigned> Str; + + /// A repeated substring in the tree. + struct RepeatedSubstring { + /// The length of the string. + unsigned Length; + + /// The start indices of each occurrence. + std::vector<unsigned> StartIndices; + }; + +private: + /// Maintains internal nodes in the tree. + SpecificBumpPtrAllocator<SuffixTreeInternalNode> InternalNodeAllocator; + /// Maintains leaf nodes in the tree. + SpecificBumpPtrAllocator<SuffixTreeLeafNode> LeafNodeAllocator; + + /// The root of the suffix tree. + /// + /// The root represents the empty string. It is maintained by the + /// \p NodeAllocator like every other node in the tree. + SuffixTreeInternalNode* Root = nullptr; + + /// The end index of each leaf in the tree. + unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx; + + /// Helper struct which keeps track of the next insertion point in + /// Ukkonen's algorithm. + struct ActiveState { + /// The next node to insert at. + SuffixTreeInternalNode* Node = nullptr; + + /// The index of the first character in the substring currently being added. + unsigned Idx = SuffixTreeNode::EmptyIdx; + + /// The length of the substring we have to add at the current step. + unsigned Len = 0; + }; + + /// The point the next insertion will take place at in the + /// construction algorithm. + ActiveState Active; + + /// Allocate a leaf node and add it to the tree. + /// + /// \param Parent The parent of this node. + /// \param StartIdx The start index of this node's associated string. + /// \param Edge The label on the edge leaving \p Parent to this node. + /// + /// \returns A pointer to the allocated leaf node. + SuffixTreeNode* + insertLeaf(SuffixTreeInternalNode& Parent, unsigned StartIdx, unsigned Edge); + + /// Allocate an internal node and add it to the tree. + /// + /// \param Parent The parent of this node. Only null when allocating the root. + /// \param StartIdx The start index of this node's associated string. + /// \param EndIdx The end index of this node's associated string. + /// \param Edge The label on the edge leaving \p Parent to this node. + /// + /// \returns A pointer to the allocated internal node. + SuffixTreeInternalNode* insertInternalNode(SuffixTreeInternalNode* Parent, + unsigned StartIdx, + unsigned EndIdx, + unsigned Edge); + + /// Allocate the root node and add it to the tree. + /// + /// \returns A pointer to the root. + SuffixTreeInternalNode* insertRoot(); + + /// Set the suffix indices of the leaves to the start indices of their + /// respective suffixes. + void setSuffixIndices(); + + /// Construct the suffix tree for the prefix of the input ending at + /// \p EndIdx. + /// + /// Used to construct the full suffix tree iteratively. At the end of each + /// step, the constructed suffix tree is either a valid suffix tree, or a + /// suffix tree with implicit suffixes. At the end of the final step, the + /// suffix tree is a valid tree. + /// + /// \param EndIdx The end index of the current prefix in the main string. + /// \param SuffixesToAdd The number of suffixes that must be added + /// to complete the suffix tree at the current phase. + /// + /// \returns The number of suffixes that have not been added at the end of + /// this step. + unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd); + +public: + /// Construct a suffix tree from a sequence of unsigned integers. + /// + /// \param Str The string to construct the suffix tree for. + SuffixTree(const std::vector<unsigned>& Str); + + /// Iterator for finding all repeated substrings in the suffix tree. + struct RepeatedSubstringIterator { + private: + /// The current node we're visiting. + SuffixTreeNode* N = nullptr; + + /// The repeated substring associated with this node. + RepeatedSubstring RS; + + /// The nodes left to visit. + std::vector<SuffixTreeInternalNode*> InternalNodesToVisit; + + /// The minimum length of a repeated substring to find. + /// Since we're outlining, we want at least two instructions in the range. + /// FIXME: This may not be true for targets like X86 which support many + /// instruction lengths. + const unsigned MinLength = 2; + + /// Move the iterator to the next repeated substring. + void advance(); + + public: + /// Return the current repeated substring. + RepeatedSubstring& operator*() { return RS; } + + RepeatedSubstringIterator& operator++() { + advance(); + return *this; + } + + RepeatedSubstringIterator operator++(int I) { + RepeatedSubstringIterator It(*this); + advance(); + return It; + } + + bool operator==(const RepeatedSubstringIterator& Other) const { + return N == Other.N; + } + bool operator!=(const RepeatedSubstringIterator& Other) const { + return !(*this == Other); + } + + RepeatedSubstringIterator(SuffixTreeInternalNode* N) : N(N) { + // Do we have a non-null node? + if (!N) { + return; + } + // Yes. At the first step, we need to visit all of N's children. + // Note: This means that we visit N last. + InternalNodesToVisit.push_back(N); + advance(); + } + }; + + typedef RepeatedSubstringIterator iterator; + iterator begin() { return iterator(Root); } + iterator end() { return iterator(nullptr); } +}; + +} // namespace wasm + +#endif // wasm_support_suffix_tree_h diff --git a/src/support/suffix_tree_node.cpp b/src/support/suffix_tree_node.cpp new file mode 100644 index 000000000..a5bf2348e --- /dev/null +++ b/src/support/suffix_tree_node.cpp @@ -0,0 +1,45 @@ +// This file began as an import from LLVM, and so it has the same license as +// LLVM, copied below together with the code. + +//===- llvm/ADT/SuffixTreeNode.cpp - Nodes for SuffixTrees --------*- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines nodes for use within a SuffixTree. +// +//===----------------------------------------------------------------------===// + +#include "support/suffix_tree_node.h" +#include <cassert> + +namespace wasm { + +unsigned SuffixTreeNode::getStartIdx() const { return StartIdx; } +void SuffixTreeNode::incrementStartIdx(unsigned Inc) { StartIdx += Inc; } +void SuffixTreeNode::setConcatLen(unsigned Len) { ConcatLen = Len; } +unsigned SuffixTreeNode::getConcatLen() const { return ConcatLen; } + +bool SuffixTreeInternalNode::isRoot() const { + return getStartIdx() == EmptyIdx; +} +unsigned SuffixTreeInternalNode::getEndIdx() const { return EndIdx; } +void SuffixTreeInternalNode::setLink(SuffixTreeInternalNode* L) { + assert(L && "Cannot set a null link?"); + Link = L; +} +SuffixTreeInternalNode* SuffixTreeInternalNode::getLink() const { return Link; } + +unsigned SuffixTreeLeafNode::getEndIdx() const { + assert(EndIdx && "EndIdx is empty?"); + return *EndIdx; +} + +unsigned SuffixTreeLeafNode::getSuffixIdx() const { return SuffixIdx; } +void SuffixTreeLeafNode::setSuffixIdx(unsigned Idx) { SuffixIdx = Idx; } + +} // namespace wasm diff --git a/src/support/suffix_tree_node.h b/src/support/suffix_tree_node.h new file mode 100644 index 000000000..8b34fb21b --- /dev/null +++ b/src/support/suffix_tree_node.h @@ -0,0 +1,179 @@ +// This file began as an import from LLVM, and so it has the same license as +// LLVM, copied below together with the code. + +//===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines nodes for use within a SuffixTree. +// +// Each node has either no children or at least two children, with the root +// being a exception in the empty tree. +// +// Children are represented as a map between unsigned integers and nodes. If +// a node N has a child M on unsigned integer k, then the mapping represented +// by N is a proper prefix of the mapping represented by M. Note that this, +// although similar to a trie is somewhat different: each node stores a full +// substring of the full mapping rather than a single character state. +// +// Each internal node contains a pointer to the internal node representing +// the same string, but with the first character chopped off. This is stored +// in \p Link. Each leaf node stores the start index of its respective +// suffix in \p SuffixIdx. +//===----------------------------------------------------------------------===// + +// clang-format off + +#ifndef wasm_support_suffix_tree_node_h +#define wasm_support_suffix_tree_node_h + +#include <unordered_map> + +namespace wasm { + +/// A node in a suffix tree which represents a substring or suffix. +struct SuffixTreeNode { +public: + /// Represents an undefined index in the suffix tree. + static const unsigned EmptyIdx = -1; + enum class NodeKind { ST_Leaf, ST_Internal }; + +private: + const NodeKind Kind; + + /// The start index of this node's substring in the main string. + unsigned StartIdx = EmptyIdx; + + /// The length of the string formed by concatenating the edge labels from + /// the root to this node. + unsigned ConcatLen = 0; + +public: + // LLVM RTTI boilerplate. + NodeKind getKind() const { return Kind; } + + /// \return the start index of this node's substring in the entire string. + unsigned getStartIdx() const; + + /// \returns the end index of this node. + virtual unsigned getEndIdx() const = 0; + + /// Advance this node's StartIdx by \p Inc. + void incrementStartIdx(unsigned Inc); + + /// Set the length of the string from the root to this node to \p Len. + void setConcatLen(unsigned Len); + + /// \returns the length of the string from the root to this node. + unsigned getConcatLen() const; + + SuffixTreeNode(NodeKind Kind, unsigned StartIdx) + : Kind(Kind), StartIdx(StartIdx) {} + virtual ~SuffixTreeNode() = default; +}; + +// A node with two or more children, or the root. +struct SuffixTreeInternalNode : SuffixTreeNode { +private: + /// The end index of this node's substring in the main string. + /// + /// Every leaf node must have its \p EndIdx incremented at the end of every + /// step in the construction algorithm. To avoid having to update O(N) + /// nodes individually at the end of every step, the end index is stored + /// as a pointer. + unsigned EndIdx = EmptyIdx; + + /// A pointer to the internal node representing the same sequence with the + /// first character chopped off. + /// + /// This acts as a shortcut in Ukkonen's algorithm. One of the things that + /// Ukkonen's algorithm does to achieve linear-time construction is + /// keep track of which node the next insert should be at. This makes each + /// insert O(1), and there are a total of O(N) inserts. The suffix link + /// helps with inserting children of internal nodes. + /// + /// Say we add a child to an internal node with associated mapping S. The + /// next insertion must be at the node representing S - its first character. + /// This is given by the way that we iteratively build the tree in Ukkonen's + /// algorithm. The main idea is to look at the suffixes of each prefix in the + /// string, starting with the longest suffix of the prefix, and ending with + /// the shortest. Therefore, if we keep pointers between such nodes, we can + /// move to the next insertion point in O(1) time. If we don't, then we'd + /// have to query from the root, which takes O(N) time. This would make the + /// construction algorithm O(N^2) rather than O(N). + SuffixTreeInternalNode* Link = nullptr; + +public: + // LLVM RTTI boilerplate. + static bool classof(const SuffixTreeNode* N) { + return N->getKind() == NodeKind::ST_Internal; + } + + /// \returns true if this node is the root of its owning \p SuffixTree. + bool isRoot() const; + + /// \returns the end index of this node's substring in the entire string. + unsigned getEndIdx() const override; + + /// Sets \p Link to \p L. Assumes \p L is not null. + void setLink(SuffixTreeInternalNode* L); + + /// \returns the pointer to the Link node. + SuffixTreeInternalNode* getLink() const; + + /// The children of this node. + /// + /// A child existing on an unsigned integer implies that from the mapping + /// represented by the current node, there is a way to reach another + /// mapping by tacking that character on the end of the current string. + std::unordered_map<unsigned, SuffixTreeNode*> Children; + + SuffixTreeInternalNode(unsigned StartIdx, + unsigned EndIdx, + SuffixTreeInternalNode* Link) + : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx), + Link(Link) {} + + virtual ~SuffixTreeInternalNode() = default; +}; + +// A node representing a suffix. +struct SuffixTreeLeafNode : SuffixTreeNode { +private: + /// The start index of the suffix represented by this leaf. + unsigned SuffixIdx = EmptyIdx; + + /// The end index of this node's substring in the main string. + /// + /// Every leaf node must have its \p EndIdx incremented at the end of every + /// step in the construction algorithm. To avoid having to update O(N) + /// nodes individually at the end of every step, the end index is stored + /// as a pointer. + unsigned* EndIdx = nullptr; + +public: + // LLVM RTTI boilerplate. + static bool classof(const SuffixTreeNode* N) { + return N->getKind() == NodeKind::ST_Leaf; + } + + /// \returns the end index of this node's substring in the entire string. + unsigned getEndIdx() const override; + + /// \returns the start index of the suffix represented by this leaf. + unsigned getSuffixIdx() const; + + /// Sets the start index of the suffix represented by this leaf to \p Idx. + void setSuffixIdx(unsigned Idx); + SuffixTreeLeafNode(unsigned StartIdx, unsigned* EndIdx) + : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {} + + virtual ~SuffixTreeLeafNode() = default; +}; +} // namespace wasm + +#endif // wasm_support_suffix_tree_node_h |