summaryrefslogtreecommitdiff
path: root/src/support/suffix_tree_node.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/support/suffix_tree_node.h')
-rw-r--r--src/support/suffix_tree_node.h179
1 files changed, 179 insertions, 0 deletions
diff --git a/src/support/suffix_tree_node.h b/src/support/suffix_tree_node.h
new file mode 100644
index 000000000..8b34fb21b
--- /dev/null
+++ b/src/support/suffix_tree_node.h
@@ -0,0 +1,179 @@
+// This file began as an import from LLVM, and so it has the same license as
+// LLVM, copied below together with the code.
+
+//===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines nodes for use within a SuffixTree.
+//
+// Each node has either no children or at least two children, with the root
+// being a exception in the empty tree.
+//
+// Children are represented as a map between unsigned integers and nodes. If
+// a node N has a child M on unsigned integer k, then the mapping represented
+// by N is a proper prefix of the mapping represented by M. Note that this,
+// although similar to a trie is somewhat different: each node stores a full
+// substring of the full mapping rather than a single character state.
+//
+// Each internal node contains a pointer to the internal node representing
+// the same string, but with the first character chopped off. This is stored
+// in \p Link. Each leaf node stores the start index of its respective
+// suffix in \p SuffixIdx.
+//===----------------------------------------------------------------------===//
+
+// clang-format off
+
+#ifndef wasm_support_suffix_tree_node_h
+#define wasm_support_suffix_tree_node_h
+
+#include <unordered_map>
+
+namespace wasm {
+
+/// A node in a suffix tree which represents a substring or suffix.
+struct SuffixTreeNode {
+public:
+ /// Represents an undefined index in the suffix tree.
+ static const unsigned EmptyIdx = -1;
+ enum class NodeKind { ST_Leaf, ST_Internal };
+
+private:
+ const NodeKind Kind;
+
+ /// The start index of this node's substring in the main string.
+ unsigned StartIdx = EmptyIdx;
+
+ /// The length of the string formed by concatenating the edge labels from
+ /// the root to this node.
+ unsigned ConcatLen = 0;
+
+public:
+ // LLVM RTTI boilerplate.
+ NodeKind getKind() const { return Kind; }
+
+ /// \return the start index of this node's substring in the entire string.
+ unsigned getStartIdx() const;
+
+ /// \returns the end index of this node.
+ virtual unsigned getEndIdx() const = 0;
+
+ /// Advance this node's StartIdx by \p Inc.
+ void incrementStartIdx(unsigned Inc);
+
+ /// Set the length of the string from the root to this node to \p Len.
+ void setConcatLen(unsigned Len);
+
+ /// \returns the length of the string from the root to this node.
+ unsigned getConcatLen() const;
+
+ SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
+ : Kind(Kind), StartIdx(StartIdx) {}
+ virtual ~SuffixTreeNode() = default;
+};
+
+// A node with two or more children, or the root.
+struct SuffixTreeInternalNode : SuffixTreeNode {
+private:
+ /// The end index of this node's substring in the main string.
+ ///
+ /// Every leaf node must have its \p EndIdx incremented at the end of every
+ /// step in the construction algorithm. To avoid having to update O(N)
+ /// nodes individually at the end of every step, the end index is stored
+ /// as a pointer.
+ unsigned EndIdx = EmptyIdx;
+
+ /// A pointer to the internal node representing the same sequence with the
+ /// first character chopped off.
+ ///
+ /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
+ /// Ukkonen's algorithm does to achieve linear-time construction is
+ /// keep track of which node the next insert should be at. This makes each
+ /// insert O(1), and there are a total of O(N) inserts. The suffix link
+ /// helps with inserting children of internal nodes.
+ ///
+ /// Say we add a child to an internal node with associated mapping S. The
+ /// next insertion must be at the node representing S - its first character.
+ /// This is given by the way that we iteratively build the tree in Ukkonen's
+ /// algorithm. The main idea is to look at the suffixes of each prefix in the
+ /// string, starting with the longest suffix of the prefix, and ending with
+ /// the shortest. Therefore, if we keep pointers between such nodes, we can
+ /// move to the next insertion point in O(1) time. If we don't, then we'd
+ /// have to query from the root, which takes O(N) time. This would make the
+ /// construction algorithm O(N^2) rather than O(N).
+ SuffixTreeInternalNode* Link = nullptr;
+
+public:
+ // LLVM RTTI boilerplate.
+ static bool classof(const SuffixTreeNode* N) {
+ return N->getKind() == NodeKind::ST_Internal;
+ }
+
+ /// \returns true if this node is the root of its owning \p SuffixTree.
+ bool isRoot() const;
+
+ /// \returns the end index of this node's substring in the entire string.
+ unsigned getEndIdx() const override;
+
+ /// Sets \p Link to \p L. Assumes \p L is not null.
+ void setLink(SuffixTreeInternalNode* L);
+
+ /// \returns the pointer to the Link node.
+ SuffixTreeInternalNode* getLink() const;
+
+ /// The children of this node.
+ ///
+ /// A child existing on an unsigned integer implies that from the mapping
+ /// represented by the current node, there is a way to reach another
+ /// mapping by tacking that character on the end of the current string.
+ std::unordered_map<unsigned, SuffixTreeNode*> Children;
+
+ SuffixTreeInternalNode(unsigned StartIdx,
+ unsigned EndIdx,
+ SuffixTreeInternalNode* Link)
+ : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
+ Link(Link) {}
+
+ virtual ~SuffixTreeInternalNode() = default;
+};
+
+// A node representing a suffix.
+struct SuffixTreeLeafNode : SuffixTreeNode {
+private:
+ /// The start index of the suffix represented by this leaf.
+ unsigned SuffixIdx = EmptyIdx;
+
+ /// The end index of this node's substring in the main string.
+ ///
+ /// Every leaf node must have its \p EndIdx incremented at the end of every
+ /// step in the construction algorithm. To avoid having to update O(N)
+ /// nodes individually at the end of every step, the end index is stored
+ /// as a pointer.
+ unsigned* EndIdx = nullptr;
+
+public:
+ // LLVM RTTI boilerplate.
+ static bool classof(const SuffixTreeNode* N) {
+ return N->getKind() == NodeKind::ST_Leaf;
+ }
+
+ /// \returns the end index of this node's substring in the entire string.
+ unsigned getEndIdx() const override;
+
+ /// \returns the start index of the suffix represented by this leaf.
+ unsigned getSuffixIdx() const;
+
+ /// Sets the start index of the suffix represented by this leaf to \p Idx.
+ void setSuffixIdx(unsigned Idx);
+ SuffixTreeLeafNode(unsigned StartIdx, unsigned* EndIdx)
+ : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}
+
+ virtual ~SuffixTreeLeafNode() = default;
+};
+} // namespace wasm
+
+#endif // wasm_support_suffix_tree_node_h