1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
|
// This file began as an import from LLVM, and so it has the same license as
// LLVM, copied below together with the code.
//===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines nodes for use within a SuffixTree.
//
// Each node has either no children or at least two children, with the root
// being a exception in the empty tree.
//
// Children are represented as a map between unsigned integers and nodes. If
// a node N has a child M on unsigned integer k, then the mapping represented
// by N is a proper prefix of the mapping represented by M. Note that this,
// although similar to a trie is somewhat different: each node stores a full
// substring of the full mapping rather than a single character state.
//
// Each internal node contains a pointer to the internal node representing
// the same string, but with the first character chopped off. This is stored
// in \p Link. Each leaf node stores the start index of its respective
// suffix in \p SuffixIdx.
//===----------------------------------------------------------------------===//
// clang-format off
#ifndef wasm_support_suffix_tree_node_h
#define wasm_support_suffix_tree_node_h
#include <unordered_map>
namespace wasm {
/// A node in a suffix tree which represents a substring or suffix.
struct SuffixTreeNode {
public:
/// Represents an undefined index in the suffix tree.
static const unsigned EmptyIdx = -1;
enum class NodeKind { ST_Leaf, ST_Internal };
private:
const NodeKind Kind;
/// The start index of this node's substring in the main string.
unsigned StartIdx = EmptyIdx;
/// The length of the string formed by concatenating the edge labels from
/// the root to this node.
unsigned ConcatLen = 0;
public:
// LLVM RTTI boilerplate.
NodeKind getKind() const { return Kind; }
/// \return the start index of this node's substring in the entire string.
unsigned getStartIdx() const;
/// \returns the end index of this node.
virtual unsigned getEndIdx() const = 0;
/// Advance this node's StartIdx by \p Inc.
void incrementStartIdx(unsigned Inc);
/// Set the length of the string from the root to this node to \p Len.
void setConcatLen(unsigned Len);
/// \returns the length of the string from the root to this node.
unsigned getConcatLen() const;
SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
: Kind(Kind), StartIdx(StartIdx) {}
virtual ~SuffixTreeNode() = default;
};
// A node with two or more children, or the root.
struct SuffixTreeInternalNode : SuffixTreeNode {
private:
/// The end index of this node's substring in the main string.
///
/// Every leaf node must have its \p EndIdx incremented at the end of every
/// step in the construction algorithm. To avoid having to update O(N)
/// nodes individually at the end of every step, the end index is stored
/// as a pointer.
unsigned EndIdx = EmptyIdx;
/// A pointer to the internal node representing the same sequence with the
/// first character chopped off.
///
/// This acts as a shortcut in Ukkonen's algorithm. One of the things that
/// Ukkonen's algorithm does to achieve linear-time construction is
/// keep track of which node the next insert should be at. This makes each
/// insert O(1), and there are a total of O(N) inserts. The suffix link
/// helps with inserting children of internal nodes.
///
/// Say we add a child to an internal node with associated mapping S. The
/// next insertion must be at the node representing S - its first character.
/// This is given by the way that we iteratively build the tree in Ukkonen's
/// algorithm. The main idea is to look at the suffixes of each prefix in the
/// string, starting with the longest suffix of the prefix, and ending with
/// the shortest. Therefore, if we keep pointers between such nodes, we can
/// move to the next insertion point in O(1) time. If we don't, then we'd
/// have to query from the root, which takes O(N) time. This would make the
/// construction algorithm O(N^2) rather than O(N).
SuffixTreeInternalNode* Link = nullptr;
public:
// LLVM RTTI boilerplate.
static bool classof(const SuffixTreeNode* N) {
return N->getKind() == NodeKind::ST_Internal;
}
/// \returns true if this node is the root of its owning \p SuffixTree.
bool isRoot() const;
/// \returns the end index of this node's substring in the entire string.
unsigned getEndIdx() const override;
/// Sets \p Link to \p L. Assumes \p L is not null.
void setLink(SuffixTreeInternalNode* L);
/// \returns the pointer to the Link node.
SuffixTreeInternalNode* getLink() const;
/// The children of this node.
///
/// A child existing on an unsigned integer implies that from the mapping
/// represented by the current node, there is a way to reach another
/// mapping by tacking that character on the end of the current string.
std::unordered_map<unsigned, SuffixTreeNode*> Children;
SuffixTreeInternalNode(unsigned StartIdx,
unsigned EndIdx,
SuffixTreeInternalNode* Link)
: SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
Link(Link) {}
virtual ~SuffixTreeInternalNode() = default;
};
// A node representing a suffix.
struct SuffixTreeLeafNode : SuffixTreeNode {
private:
/// The start index of the suffix represented by this leaf.
unsigned SuffixIdx = EmptyIdx;
/// The end index of this node's substring in the main string.
///
/// Every leaf node must have its \p EndIdx incremented at the end of every
/// step in the construction algorithm. To avoid having to update O(N)
/// nodes individually at the end of every step, the end index is stored
/// as a pointer.
unsigned* EndIdx = nullptr;
public:
// LLVM RTTI boilerplate.
static bool classof(const SuffixTreeNode* N) {
return N->getKind() == NodeKind::ST_Leaf;
}
/// \returns the end index of this node's substring in the entire string.
unsigned getEndIdx() const override;
/// \returns the start index of the suffix represented by this leaf.
unsigned getSuffixIdx() const;
/// Sets the start index of the suffix represented by this leaf to \p Idx.
void setSuffixIdx(unsigned Idx);
SuffixTreeLeafNode(unsigned StartIdx, unsigned* EndIdx)
: SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}
virtual ~SuffixTreeLeafNode() = default;
};
} // namespace wasm
#endif // wasm_support_suffix_tree_node_h
|