diff options
Diffstat (limited to 'src/support/suffix_tree_node.h')
-rw-r--r-- | src/support/suffix_tree_node.h | 179 |
1 files changed, 179 insertions, 0 deletions
diff --git a/src/support/suffix_tree_node.h b/src/support/suffix_tree_node.h new file mode 100644 index 000000000..8b34fb21b --- /dev/null +++ b/src/support/suffix_tree_node.h @@ -0,0 +1,179 @@ +// This file began as an import from LLVM, and so it has the same license as +// LLVM, copied below together with the code. + +//===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines nodes for use within a SuffixTree. +// +// Each node has either no children or at least two children, with the root +// being a exception in the empty tree. +// +// Children are represented as a map between unsigned integers and nodes. If +// a node N has a child M on unsigned integer k, then the mapping represented +// by N is a proper prefix of the mapping represented by M. Note that this, +// although similar to a trie is somewhat different: each node stores a full +// substring of the full mapping rather than a single character state. +// +// Each internal node contains a pointer to the internal node representing +// the same string, but with the first character chopped off. This is stored +// in \p Link. Each leaf node stores the start index of its respective +// suffix in \p SuffixIdx. +//===----------------------------------------------------------------------===// + +// clang-format off + +#ifndef wasm_support_suffix_tree_node_h +#define wasm_support_suffix_tree_node_h + +#include <unordered_map> + +namespace wasm { + +/// A node in a suffix tree which represents a substring or suffix. +struct SuffixTreeNode { +public: + /// Represents an undefined index in the suffix tree. + static const unsigned EmptyIdx = -1; + enum class NodeKind { ST_Leaf, ST_Internal }; + +private: + const NodeKind Kind; + + /// The start index of this node's substring in the main string. + unsigned StartIdx = EmptyIdx; + + /// The length of the string formed by concatenating the edge labels from + /// the root to this node. + unsigned ConcatLen = 0; + +public: + // LLVM RTTI boilerplate. + NodeKind getKind() const { return Kind; } + + /// \return the start index of this node's substring in the entire string. + unsigned getStartIdx() const; + + /// \returns the end index of this node. + virtual unsigned getEndIdx() const = 0; + + /// Advance this node's StartIdx by \p Inc. + void incrementStartIdx(unsigned Inc); + + /// Set the length of the string from the root to this node to \p Len. + void setConcatLen(unsigned Len); + + /// \returns the length of the string from the root to this node. + unsigned getConcatLen() const; + + SuffixTreeNode(NodeKind Kind, unsigned StartIdx) + : Kind(Kind), StartIdx(StartIdx) {} + virtual ~SuffixTreeNode() = default; +}; + +// A node with two or more children, or the root. +struct SuffixTreeInternalNode : SuffixTreeNode { +private: + /// The end index of this node's substring in the main string. + /// + /// Every leaf node must have its \p EndIdx incremented at the end of every + /// step in the construction algorithm. To avoid having to update O(N) + /// nodes individually at the end of every step, the end index is stored + /// as a pointer. + unsigned EndIdx = EmptyIdx; + + /// A pointer to the internal node representing the same sequence with the + /// first character chopped off. + /// + /// This acts as a shortcut in Ukkonen's algorithm. One of the things that + /// Ukkonen's algorithm does to achieve linear-time construction is + /// keep track of which node the next insert should be at. This makes each + /// insert O(1), and there are a total of O(N) inserts. The suffix link + /// helps with inserting children of internal nodes. + /// + /// Say we add a child to an internal node with associated mapping S. The + /// next insertion must be at the node representing S - its first character. + /// This is given by the way that we iteratively build the tree in Ukkonen's + /// algorithm. The main idea is to look at the suffixes of each prefix in the + /// string, starting with the longest suffix of the prefix, and ending with + /// the shortest. Therefore, if we keep pointers between such nodes, we can + /// move to the next insertion point in O(1) time. If we don't, then we'd + /// have to query from the root, which takes O(N) time. This would make the + /// construction algorithm O(N^2) rather than O(N). + SuffixTreeInternalNode* Link = nullptr; + +public: + // LLVM RTTI boilerplate. + static bool classof(const SuffixTreeNode* N) { + return N->getKind() == NodeKind::ST_Internal; + } + + /// \returns true if this node is the root of its owning \p SuffixTree. + bool isRoot() const; + + /// \returns the end index of this node's substring in the entire string. + unsigned getEndIdx() const override; + + /// Sets \p Link to \p L. Assumes \p L is not null. + void setLink(SuffixTreeInternalNode* L); + + /// \returns the pointer to the Link node. + SuffixTreeInternalNode* getLink() const; + + /// The children of this node. + /// + /// A child existing on an unsigned integer implies that from the mapping + /// represented by the current node, there is a way to reach another + /// mapping by tacking that character on the end of the current string. + std::unordered_map<unsigned, SuffixTreeNode*> Children; + + SuffixTreeInternalNode(unsigned StartIdx, + unsigned EndIdx, + SuffixTreeInternalNode* Link) + : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx), + Link(Link) {} + + virtual ~SuffixTreeInternalNode() = default; +}; + +// A node representing a suffix. +struct SuffixTreeLeafNode : SuffixTreeNode { +private: + /// The start index of the suffix represented by this leaf. + unsigned SuffixIdx = EmptyIdx; + + /// The end index of this node's substring in the main string. + /// + /// Every leaf node must have its \p EndIdx incremented at the end of every + /// step in the construction algorithm. To avoid having to update O(N) + /// nodes individually at the end of every step, the end index is stored + /// as a pointer. + unsigned* EndIdx = nullptr; + +public: + // LLVM RTTI boilerplate. + static bool classof(const SuffixTreeNode* N) { + return N->getKind() == NodeKind::ST_Leaf; + } + + /// \returns the end index of this node's substring in the entire string. + unsigned getEndIdx() const override; + + /// \returns the start index of the suffix represented by this leaf. + unsigned getSuffixIdx() const; + + /// Sets the start index of the suffix represented by this leaf to \p Idx. + void setSuffixIdx(unsigned Idx); + SuffixTreeLeafNode(unsigned StartIdx, unsigned* EndIdx) + : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {} + + virtual ~SuffixTreeLeafNode() = default; +}; +} // namespace wasm + +#endif // wasm_support_suffix_tree_node_h |