summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAshley Nelson <nashley@google.com>2023-07-17 19:48:29 -0700
committerGitHub <noreply@github.com>2023-07-18 02:48:29 +0000
commit159750d0fccd7ee453ac2f8569128e0ea94ba8a5 (patch)
treed2958b6099ef67bb755141a1d870b380ce5fa6a9 /src
parentf96fcb0e0c15299045b828447e65754727eeab57 (diff)
downloadbinaryen-159750d0fccd7ee453ac2f8569128e0ea94ba8a5.tar.gz
binaryen-159750d0fccd7ee453ac2f8569128e0ea94ba8a5.tar.bz2
binaryen-159750d0fccd7ee453ac2f8569128e0ea94ba8a5.zip
[Outlining] LLVM Suffix Tree (#5821)
This PR adds LLVM's suffix tree data structure to Binaryen. This suffix tree is implemented using Ukkonen's algorithm for linear-time suffix tree construction, and is intended for fast substring queries. Note: All of the .h and .cpp files included are from LLVM. These files were copied directly instead of imported into our existing LLVM integration (in third_party/) to avoid bumping the commit hash and avoid the potential for complications with upstream changes.
Diffstat (limited to 'src')
-rw-r--r--src/support/CMakeLists.txt15
-rw-r--r--src/support/suffix_tree.cpp305
-rw-r--r--src/support/suffix_tree.h211
-rw-r--r--src/support/suffix_tree_node.cpp45
-rw-r--r--src/support/suffix_tree_node.h179
5 files changed, 754 insertions, 1 deletions
diff --git a/src/support/CMakeLists.txt b/src/support/CMakeLists.txt
index 2758ee198..87f835c7b 100644
--- a/src/support/CMakeLists.txt
+++ b/src/support/CMakeLists.txt
@@ -14,4 +14,17 @@ set(support_SOURCES
utilities.cpp
${support_HEADERS}
)
-add_library(support OBJECT ${support_SOURCES})
+
+# The below condition is intended for removal once the suffix_tree and
+# suffix_tree_node source files no longer depend on LLVM code in the
+# third_party folder
+if(EMSCRIPTEN)
+ add_library(support OBJECT ${support_SOURCES})
+else()
+ set(support_with_suffix_tree_SOURCES
+ suffix_tree.cpp
+ suffix_tree_node.cpp
+ ${support_SOURCES}
+ )
+ add_library(support OBJECT ${support_with_suffix_tree_SOURCES})
+endif()
diff --git a/src/support/suffix_tree.cpp b/src/support/suffix_tree.cpp
new file mode 100644
index 000000000..a66bcde38
--- /dev/null
+++ b/src/support/suffix_tree.cpp
@@ -0,0 +1,305 @@
+// This file began as an import from LLVM, and so it has the same license as
+// LLVM, copied below together with the code.
+
+//===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Suffix Tree class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "support/suffix_tree.h"
+#include "support/suffix_tree_node.h"
+#include "llvm/Support/Casting.h"
+
+using namespace llvm;
+
+namespace wasm {
+
+/// \returns the number of elements in the substring associated with \p N.
+static size_t numElementsInSubstring(const SuffixTreeNode* N) {
+ assert(N && "Got a null node?");
+ if (auto* Internal = dyn_cast<SuffixTreeInternalNode>(N)) {
+ if (Internal->isRoot()) {
+ return 0;
+ }
+ }
+ return N->getEndIdx() - N->getStartIdx() + 1;
+}
+
+SuffixTree::SuffixTree(const std::vector<unsigned>& Str) : Str(Str) {
+ Root = insertRoot();
+ Active.Node = Root;
+
+ // Keep track of the number of suffixes we have to add of the current
+ // prefix.
+ unsigned SuffixesToAdd = 0;
+
+ // Construct the suffix tree iteratively on each prefix of the string.
+ // PfxEndIdx is the end index of the current prefix.
+ // End is one past the last element in the string.
+ for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
+ SuffixesToAdd++;
+ LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
+ SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
+ }
+
+ // Set the suffix indices of each leaf.
+ assert(Root && "Root node can't be nullptr!");
+ setSuffixIndices();
+}
+
+SuffixTreeNode* SuffixTree::insertLeaf(SuffixTreeInternalNode& Parent,
+ unsigned StartIdx,
+ unsigned Edge) {
+ assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
+ auto* N = new (LeafNodeAllocator.Allocate())
+ SuffixTreeLeafNode(StartIdx, &LeafEndIdx);
+ Parent.Children[Edge] = N;
+ return N;
+}
+
+SuffixTreeInternalNode*
+SuffixTree::insertInternalNode(SuffixTreeInternalNode* Parent,
+ unsigned StartIdx,
+ unsigned EndIdx,
+ unsigned Edge) {
+ assert(StartIdx <= EndIdx && "String can't start after it ends!");
+ assert(!(!Parent && StartIdx != SuffixTreeNode::EmptyIdx) &&
+ "Non-root internal nodes must have parents!");
+ auto* N = new (InternalNodeAllocator.Allocate())
+ SuffixTreeInternalNode(StartIdx, EndIdx, Root);
+ if (Parent) {
+ Parent->Children[Edge] = N;
+ }
+ return N;
+}
+
+SuffixTreeInternalNode* SuffixTree::insertRoot() {
+ return insertInternalNode(/*Parent = */ nullptr,
+ SuffixTreeNode::EmptyIdx,
+ SuffixTreeNode::EmptyIdx,
+ /*Edge = */ 0);
+}
+
+void SuffixTree::setSuffixIndices() {
+ // List of nodes we need to visit along with the current length of the
+ // string.
+ std::vector<std::pair<SuffixTreeNode*, unsigned>> ToVisit;
+
+ // Current node being visited.
+ SuffixTreeNode* CurrNode = Root;
+
+ // Sum of the lengths of the nodes down the path to the current one.
+ unsigned CurrNodeLen = 0;
+ ToVisit.push_back({CurrNode, CurrNodeLen});
+ while (!ToVisit.empty()) {
+ std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
+ ToVisit.pop_back();
+ // Length of the current node from the root down to here.
+ CurrNode->setConcatLen(CurrNodeLen);
+ if (auto* InternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode)) {
+ for (auto& ChildPair : InternalNode->Children) {
+ assert(ChildPair.second && "Node had a null child!");
+ ToVisit.push_back(
+ {ChildPair.second,
+ CurrNodeLen + numElementsInSubstring(ChildPair.second)});
+ }
+ }
+ // No children, so we are at the end of the string.
+ if (auto* LeafNode = dyn_cast<SuffixTreeLeafNode>(CurrNode)) {
+ LeafNode->setSuffixIdx(Str.size() - CurrNodeLen);
+ }
+ }
+}
+
+unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
+ SuffixTreeInternalNode* NeedsLink = nullptr;
+
+ while (SuffixesToAdd > 0) {
+
+ // Are we waiting to add anything other than just the last character?
+ if (Active.Len == 0) {
+ // If not, then say the active index is the end index.
+ Active.Idx = EndIdx;
+ }
+
+ assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
+
+ // The first character in the current substring we're looking at.
+ unsigned FirstChar = Str[Active.Idx];
+
+ // Have we inserted anything starting with FirstChar at the current node?
+ if (Active.Node->Children.count(FirstChar) == 0) {
+ // If not, then we can just insert a leaf and move to the next step.
+ insertLeaf(*Active.Node, EndIdx, FirstChar);
+
+ // The active node is an internal node, and we visited it, so it must
+ // need a link if it doesn't have one.
+ if (NeedsLink) {
+ NeedsLink->setLink(Active.Node);
+ NeedsLink = nullptr;
+ }
+ } else {
+ // There's a match with FirstChar, so look for the point in the tree to
+ // insert a new node.
+ SuffixTreeNode* NextNode = Active.Node->Children[FirstChar];
+
+ unsigned SubstringLen = numElementsInSubstring(NextNode);
+
+ // Is the current suffix we're trying to insert longer than the size of
+ // the child we want to move to?
+ if (Active.Len >= SubstringLen) {
+ // If yes, then consume the characters we've seen and move to the next
+ // node.
+ // TODO: Enable the below assert
+ // assert(isa<SuffixTreeInternalNode>(NextNode) &&
+ // "Expected an internal node?");
+ Active.Idx += SubstringLen;
+ Active.Len -= SubstringLen;
+ Active.Node = cast<SuffixTreeInternalNode>(NextNode);
+ continue;
+ }
+
+ // Otherwise, the suffix we're trying to insert must be contained in the
+ // next node we want to move to.
+ unsigned LastChar = Str[EndIdx];
+
+ // Is the string we're trying to insert a substring of the next node?
+ if (Str[NextNode->getStartIdx() + Active.Len] == LastChar) {
+ // If yes, then we're done for this step. Remember our insertion point
+ // and move to the next end index. At this point, we have an implicit
+ // suffix tree.
+ if (NeedsLink && !Active.Node->isRoot()) {
+ NeedsLink->setLink(Active.Node);
+ NeedsLink = nullptr;
+ }
+
+ Active.Len++;
+ break;
+ }
+
+ // The string we're trying to insert isn't a substring of the next node,
+ // but matches up to a point. Split the node.
+ //
+ // For example, say we ended our search at a node n and we're trying to
+ // insert ABD. Then we'll create a new node s for AB, reduce n to just
+ // representing C, and insert a new leaf node l to represent d. This
+ // allows us to ensure that if n was a leaf, it remains a leaf.
+ //
+ // | ABC ---split---> | AB
+ // n s
+ // C / \ D
+ // n l
+
+ // The node s from the diagram
+ SuffixTreeInternalNode* SplitNode =
+ insertInternalNode(Active.Node,
+ NextNode->getStartIdx(),
+ NextNode->getStartIdx() + Active.Len - 1,
+ FirstChar);
+
+ // Insert the new node representing the new substring into the tree as
+ // a child of the split node. This is the node l from the diagram.
+ insertLeaf(*SplitNode, EndIdx, LastChar);
+
+ // Make the old node a child of the split node and update its start
+ // index. This is the node n from the diagram.
+ NextNode->incrementStartIdx(Active.Len);
+ SplitNode->Children[Str[NextNode->getStartIdx()]] = NextNode;
+
+ // SplitNode is an internal node, update the suffix link.
+ if (NeedsLink) {
+ NeedsLink->setLink(SplitNode);
+ }
+
+ NeedsLink = SplitNode;
+ }
+
+ // We've added something new to the tree, so there's one less suffix to
+ // add.
+ SuffixesToAdd--;
+
+ if (Active.Node->isRoot()) {
+ if (Active.Len > 0) {
+ Active.Len--;
+ Active.Idx = EndIdx - SuffixesToAdd + 1;
+ }
+ } else {
+ // Start the next phase at the next smallest suffix.
+ Active.Node = Active.Node->getLink();
+ }
+ }
+
+ return SuffixesToAdd;
+}
+
+void SuffixTree::RepeatedSubstringIterator::advance() {
+ // Clear the current state. If we're at the end of the range, then this
+ // is the state we want to be in.
+ RS = RepeatedSubstring();
+ N = nullptr;
+
+ // Each leaf node represents a repeat of a string.
+ std::vector<unsigned> RepeatedSubstringStarts;
+
+ // Continue visiting nodes until we find one which repeats more than once.
+ while (!InternalNodesToVisit.empty()) {
+ RepeatedSubstringStarts.clear();
+ auto* Curr = InternalNodesToVisit.back();
+ InternalNodesToVisit.pop_back();
+
+ // Keep track of the length of the string associated with the node. If
+ // it's too short, we'll quit.
+ unsigned Length = Curr->getConcatLen();
+
+ // Iterate over each child, saving internal nodes for visiting, and
+ // leaf nodes in LeafChildren. Internal nodes represent individual
+ // strings, which may repeat.
+ for (auto& ChildPair : Curr->Children) {
+ // Save all of this node's children for processing.
+ if (auto* InternalChild =
+ dyn_cast<SuffixTreeInternalNode>(ChildPair.second)) {
+ InternalNodesToVisit.push_back(InternalChild);
+ continue;
+ }
+
+ if (Length < MinLength) {
+ continue;
+ }
+
+ // Have an occurrence of a potentially repeated string. Save it.
+ auto* Leaf = cast<SuffixTreeLeafNode>(ChildPair.second);
+ RepeatedSubstringStarts.push_back(Leaf->getSuffixIdx());
+ }
+
+ // The root never represents a repeated substring. If we're looking at
+ // that, then skip it.
+ if (Curr->isRoot()) {
+ continue;
+ }
+
+ // Do we have any repeated substrings?
+ if (RepeatedSubstringStarts.size() < 2) {
+ continue;
+ }
+
+ // Yes. Update the state to reflect this, and then bail out.
+ N = Curr;
+ RS.Length = Length;
+ for (unsigned StartIdx : RepeatedSubstringStarts) {
+ RS.StartIndices.push_back(StartIdx);
+ }
+ break;
+ }
+ // At this point, either NewRS is an empty RepeatedSubstring, or it was
+ // set in the above loop. Similarly, N is either nullptr, or the node
+ // associated with NewRS.
+}
+
+} // namespace wasm
diff --git a/src/support/suffix_tree.h b/src/support/suffix_tree.h
new file mode 100644
index 000000000..474984dcd
--- /dev/null
+++ b/src/support/suffix_tree.h
@@ -0,0 +1,211 @@
+// This file began as an import from LLVM, and so it has the same license as
+// LLVM, copied below together with the code.
+
+//===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// A data structure for fast substring queries.
+//
+// Suffix trees represent the suffixes of their input strings in their leaves.
+// A suffix tree is a type of compressed trie structure where each node
+// represents an entire substring rather than a single character. Each leaf
+// of the tree is a suffix.
+//
+// A suffix tree can be seen as a type of state machine where each state is a
+// substring of the full string. The tree is structured so that, for a string
+// of length N, there are exactly N leaves in the tree. This structure allows
+// us to quickly find repeated substrings of the input string.
+//
+// In this implementation, a "string" is a vector of unsigned integers.
+// These integers may result from hashing some data type. A suffix tree can
+// contain 1 or many strings, which can then be queried as one large string.
+//
+// The suffix tree is implemented using Ukkonen's algorithm for linear-time
+// suffix tree construction. Ukkonen's algorithm is explained in more detail
+// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
+// paper is available at
+//
+// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
+//===----------------------------------------------------------------------===//
+
+#ifndef wasm_support_suffix_tree_h
+#define wasm_support_suffix_tree_h
+
+#include "llvm/Support/Allocator.h"
+#include <cassert>
+#include <cstddef>
+#include <vector>
+
+#include "support/suffix_tree_node.h"
+
+using namespace llvm;
+
+namespace wasm {
+class SuffixTree {
+public:
+ /// Each element is an integer representing an instruction in the module.
+ // TODO: This was an ArrayRef originally
+ std::vector<unsigned> Str;
+
+ /// A repeated substring in the tree.
+ struct RepeatedSubstring {
+ /// The length of the string.
+ unsigned Length;
+
+ /// The start indices of each occurrence.
+ std::vector<unsigned> StartIndices;
+ };
+
+private:
+ /// Maintains internal nodes in the tree.
+ SpecificBumpPtrAllocator<SuffixTreeInternalNode> InternalNodeAllocator;
+ /// Maintains leaf nodes in the tree.
+ SpecificBumpPtrAllocator<SuffixTreeLeafNode> LeafNodeAllocator;
+
+ /// The root of the suffix tree.
+ ///
+ /// The root represents the empty string. It is maintained by the
+ /// \p NodeAllocator like every other node in the tree.
+ SuffixTreeInternalNode* Root = nullptr;
+
+ /// The end index of each leaf in the tree.
+ unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx;
+
+ /// Helper struct which keeps track of the next insertion point in
+ /// Ukkonen's algorithm.
+ struct ActiveState {
+ /// The next node to insert at.
+ SuffixTreeInternalNode* Node = nullptr;
+
+ /// The index of the first character in the substring currently being added.
+ unsigned Idx = SuffixTreeNode::EmptyIdx;
+
+ /// The length of the substring we have to add at the current step.
+ unsigned Len = 0;
+ };
+
+ /// The point the next insertion will take place at in the
+ /// construction algorithm.
+ ActiveState Active;
+
+ /// Allocate a leaf node and add it to the tree.
+ ///
+ /// \param Parent The parent of this node.
+ /// \param StartIdx The start index of this node's associated string.
+ /// \param Edge The label on the edge leaving \p Parent to this node.
+ ///
+ /// \returns A pointer to the allocated leaf node.
+ SuffixTreeNode*
+ insertLeaf(SuffixTreeInternalNode& Parent, unsigned StartIdx, unsigned Edge);
+
+ /// Allocate an internal node and add it to the tree.
+ ///
+ /// \param Parent The parent of this node. Only null when allocating the root.
+ /// \param StartIdx The start index of this node's associated string.
+ /// \param EndIdx The end index of this node's associated string.
+ /// \param Edge The label on the edge leaving \p Parent to this node.
+ ///
+ /// \returns A pointer to the allocated internal node.
+ SuffixTreeInternalNode* insertInternalNode(SuffixTreeInternalNode* Parent,
+ unsigned StartIdx,
+ unsigned EndIdx,
+ unsigned Edge);
+
+ /// Allocate the root node and add it to the tree.
+ ///
+ /// \returns A pointer to the root.
+ SuffixTreeInternalNode* insertRoot();
+
+ /// Set the suffix indices of the leaves to the start indices of their
+ /// respective suffixes.
+ void setSuffixIndices();
+
+ /// Construct the suffix tree for the prefix of the input ending at
+ /// \p EndIdx.
+ ///
+ /// Used to construct the full suffix tree iteratively. At the end of each
+ /// step, the constructed suffix tree is either a valid suffix tree, or a
+ /// suffix tree with implicit suffixes. At the end of the final step, the
+ /// suffix tree is a valid tree.
+ ///
+ /// \param EndIdx The end index of the current prefix in the main string.
+ /// \param SuffixesToAdd The number of suffixes that must be added
+ /// to complete the suffix tree at the current phase.
+ ///
+ /// \returns The number of suffixes that have not been added at the end of
+ /// this step.
+ unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
+
+public:
+ /// Construct a suffix tree from a sequence of unsigned integers.
+ ///
+ /// \param Str The string to construct the suffix tree for.
+ SuffixTree(const std::vector<unsigned>& Str);
+
+ /// Iterator for finding all repeated substrings in the suffix tree.
+ struct RepeatedSubstringIterator {
+ private:
+ /// The current node we're visiting.
+ SuffixTreeNode* N = nullptr;
+
+ /// The repeated substring associated with this node.
+ RepeatedSubstring RS;
+
+ /// The nodes left to visit.
+ std::vector<SuffixTreeInternalNode*> InternalNodesToVisit;
+
+ /// The minimum length of a repeated substring to find.
+ /// Since we're outlining, we want at least two instructions in the range.
+ /// FIXME: This may not be true for targets like X86 which support many
+ /// instruction lengths.
+ const unsigned MinLength = 2;
+
+ /// Move the iterator to the next repeated substring.
+ void advance();
+
+ public:
+ /// Return the current repeated substring.
+ RepeatedSubstring& operator*() { return RS; }
+
+ RepeatedSubstringIterator& operator++() {
+ advance();
+ return *this;
+ }
+
+ RepeatedSubstringIterator operator++(int I) {
+ RepeatedSubstringIterator It(*this);
+ advance();
+ return It;
+ }
+
+ bool operator==(const RepeatedSubstringIterator& Other) const {
+ return N == Other.N;
+ }
+ bool operator!=(const RepeatedSubstringIterator& Other) const {
+ return !(*this == Other);
+ }
+
+ RepeatedSubstringIterator(SuffixTreeInternalNode* N) : N(N) {
+ // Do we have a non-null node?
+ if (!N) {
+ return;
+ }
+ // Yes. At the first step, we need to visit all of N's children.
+ // Note: This means that we visit N last.
+ InternalNodesToVisit.push_back(N);
+ advance();
+ }
+ };
+
+ typedef RepeatedSubstringIterator iterator;
+ iterator begin() { return iterator(Root); }
+ iterator end() { return iterator(nullptr); }
+};
+
+} // namespace wasm
+
+#endif // wasm_support_suffix_tree_h
diff --git a/src/support/suffix_tree_node.cpp b/src/support/suffix_tree_node.cpp
new file mode 100644
index 000000000..a5bf2348e
--- /dev/null
+++ b/src/support/suffix_tree_node.cpp
@@ -0,0 +1,45 @@
+// This file began as an import from LLVM, and so it has the same license as
+// LLVM, copied below together with the code.
+
+//===- llvm/ADT/SuffixTreeNode.cpp - Nodes for SuffixTrees --------*- C++
+//-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines nodes for use within a SuffixTree.
+//
+//===----------------------------------------------------------------------===//
+
+#include "support/suffix_tree_node.h"
+#include <cassert>
+
+namespace wasm {
+
+unsigned SuffixTreeNode::getStartIdx() const { return StartIdx; }
+void SuffixTreeNode::incrementStartIdx(unsigned Inc) { StartIdx += Inc; }
+void SuffixTreeNode::setConcatLen(unsigned Len) { ConcatLen = Len; }
+unsigned SuffixTreeNode::getConcatLen() const { return ConcatLen; }
+
+bool SuffixTreeInternalNode::isRoot() const {
+ return getStartIdx() == EmptyIdx;
+}
+unsigned SuffixTreeInternalNode::getEndIdx() const { return EndIdx; }
+void SuffixTreeInternalNode::setLink(SuffixTreeInternalNode* L) {
+ assert(L && "Cannot set a null link?");
+ Link = L;
+}
+SuffixTreeInternalNode* SuffixTreeInternalNode::getLink() const { return Link; }
+
+unsigned SuffixTreeLeafNode::getEndIdx() const {
+ assert(EndIdx && "EndIdx is empty?");
+ return *EndIdx;
+}
+
+unsigned SuffixTreeLeafNode::getSuffixIdx() const { return SuffixIdx; }
+void SuffixTreeLeafNode::setSuffixIdx(unsigned Idx) { SuffixIdx = Idx; }
+
+} // namespace wasm
diff --git a/src/support/suffix_tree_node.h b/src/support/suffix_tree_node.h
new file mode 100644
index 000000000..8b34fb21b
--- /dev/null
+++ b/src/support/suffix_tree_node.h
@@ -0,0 +1,179 @@
+// This file began as an import from LLVM, and so it has the same license as
+// LLVM, copied below together with the code.
+
+//===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines nodes for use within a SuffixTree.
+//
+// Each node has either no children or at least two children, with the root
+// being a exception in the empty tree.
+//
+// Children are represented as a map between unsigned integers and nodes. If
+// a node N has a child M on unsigned integer k, then the mapping represented
+// by N is a proper prefix of the mapping represented by M. Note that this,
+// although similar to a trie is somewhat different: each node stores a full
+// substring of the full mapping rather than a single character state.
+//
+// Each internal node contains a pointer to the internal node representing
+// the same string, but with the first character chopped off. This is stored
+// in \p Link. Each leaf node stores the start index of its respective
+// suffix in \p SuffixIdx.
+//===----------------------------------------------------------------------===//
+
+// clang-format off
+
+#ifndef wasm_support_suffix_tree_node_h
+#define wasm_support_suffix_tree_node_h
+
+#include <unordered_map>
+
+namespace wasm {
+
+/// A node in a suffix tree which represents a substring or suffix.
+struct SuffixTreeNode {
+public:
+ /// Represents an undefined index in the suffix tree.
+ static const unsigned EmptyIdx = -1;
+ enum class NodeKind { ST_Leaf, ST_Internal };
+
+private:
+ const NodeKind Kind;
+
+ /// The start index of this node's substring in the main string.
+ unsigned StartIdx = EmptyIdx;
+
+ /// The length of the string formed by concatenating the edge labels from
+ /// the root to this node.
+ unsigned ConcatLen = 0;
+
+public:
+ // LLVM RTTI boilerplate.
+ NodeKind getKind() const { return Kind; }
+
+ /// \return the start index of this node's substring in the entire string.
+ unsigned getStartIdx() const;
+
+ /// \returns the end index of this node.
+ virtual unsigned getEndIdx() const = 0;
+
+ /// Advance this node's StartIdx by \p Inc.
+ void incrementStartIdx(unsigned Inc);
+
+ /// Set the length of the string from the root to this node to \p Len.
+ void setConcatLen(unsigned Len);
+
+ /// \returns the length of the string from the root to this node.
+ unsigned getConcatLen() const;
+
+ SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
+ : Kind(Kind), StartIdx(StartIdx) {}
+ virtual ~SuffixTreeNode() = default;
+};
+
+// A node with two or more children, or the root.
+struct SuffixTreeInternalNode : SuffixTreeNode {
+private:
+ /// The end index of this node's substring in the main string.
+ ///
+ /// Every leaf node must have its \p EndIdx incremented at the end of every
+ /// step in the construction algorithm. To avoid having to update O(N)
+ /// nodes individually at the end of every step, the end index is stored
+ /// as a pointer.
+ unsigned EndIdx = EmptyIdx;
+
+ /// A pointer to the internal node representing the same sequence with the
+ /// first character chopped off.
+ ///
+ /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
+ /// Ukkonen's algorithm does to achieve linear-time construction is
+ /// keep track of which node the next insert should be at. This makes each
+ /// insert O(1), and there are a total of O(N) inserts. The suffix link
+ /// helps with inserting children of internal nodes.
+ ///
+ /// Say we add a child to an internal node with associated mapping S. The
+ /// next insertion must be at the node representing S - its first character.
+ /// This is given by the way that we iteratively build the tree in Ukkonen's
+ /// algorithm. The main idea is to look at the suffixes of each prefix in the
+ /// string, starting with the longest suffix of the prefix, and ending with
+ /// the shortest. Therefore, if we keep pointers between such nodes, we can
+ /// move to the next insertion point in O(1) time. If we don't, then we'd
+ /// have to query from the root, which takes O(N) time. This would make the
+ /// construction algorithm O(N^2) rather than O(N).
+ SuffixTreeInternalNode* Link = nullptr;
+
+public:
+ // LLVM RTTI boilerplate.
+ static bool classof(const SuffixTreeNode* N) {
+ return N->getKind() == NodeKind::ST_Internal;
+ }
+
+ /// \returns true if this node is the root of its owning \p SuffixTree.
+ bool isRoot() const;
+
+ /// \returns the end index of this node's substring in the entire string.
+ unsigned getEndIdx() const override;
+
+ /// Sets \p Link to \p L. Assumes \p L is not null.
+ void setLink(SuffixTreeInternalNode* L);
+
+ /// \returns the pointer to the Link node.
+ SuffixTreeInternalNode* getLink() const;
+
+ /// The children of this node.
+ ///
+ /// A child existing on an unsigned integer implies that from the mapping
+ /// represented by the current node, there is a way to reach another
+ /// mapping by tacking that character on the end of the current string.
+ std::unordered_map<unsigned, SuffixTreeNode*> Children;
+
+ SuffixTreeInternalNode(unsigned StartIdx,
+ unsigned EndIdx,
+ SuffixTreeInternalNode* Link)
+ : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
+ Link(Link) {}
+
+ virtual ~SuffixTreeInternalNode() = default;
+};
+
+// A node representing a suffix.
+struct SuffixTreeLeafNode : SuffixTreeNode {
+private:
+ /// The start index of the suffix represented by this leaf.
+ unsigned SuffixIdx = EmptyIdx;
+
+ /// The end index of this node's substring in the main string.
+ ///
+ /// Every leaf node must have its \p EndIdx incremented at the end of every
+ /// step in the construction algorithm. To avoid having to update O(N)
+ /// nodes individually at the end of every step, the end index is stored
+ /// as a pointer.
+ unsigned* EndIdx = nullptr;
+
+public:
+ // LLVM RTTI boilerplate.
+ static bool classof(const SuffixTreeNode* N) {
+ return N->getKind() == NodeKind::ST_Leaf;
+ }
+
+ /// \returns the end index of this node's substring in the entire string.
+ unsigned getEndIdx() const override;
+
+ /// \returns the start index of the suffix represented by this leaf.
+ unsigned getSuffixIdx() const;
+
+ /// Sets the start index of the suffix represented by this leaf to \p Idx.
+ void setSuffixIdx(unsigned Idx);
+ SuffixTreeLeafNode(unsigned StartIdx, unsigned* EndIdx)
+ : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}
+
+ virtual ~SuffixTreeLeafNode() = default;
+};
+} // namespace wasm
+
+#endif // wasm_support_suffix_tree_node_h