summaryrefslogtreecommitdiff
path: root/src/support/suffix_tree.h
blob: 6aa9c2f203b8a5884bcf2cecd5949fe0eab4a720 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
// This file began as an import from LLVM, and so it has the same license as
// LLVM, copied below together with the code.

//===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// A data structure for fast substring queries.
//
// Suffix trees represent the suffixes of their input strings in their leaves.
// A suffix tree is a type of compressed trie structure where each node
// represents an entire substring rather than a single character. Each leaf
// of the tree is a suffix.
//
// A suffix tree can be seen as a type of state machine where each state is a
// substring of the full string. The tree is structured so that, for a string
// of length N, there are exactly N leaves in the tree. This structure allows
// us to quickly find repeated substrings of the input string.
//
// In this implementation, a "string" is a vector of unsigned integers.
// These integers may result from hashing some data type. A suffix tree can
// contain 1 or many strings, which can then be queried as one large string.
//
// The suffix tree is implemented using Ukkonen's algorithm for linear-time
// suffix tree construction. Ukkonen's algorithm is explained in more detail
// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
// paper is available at
//
// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
//===----------------------------------------------------------------------===//

#ifndef wasm_support_suffix_tree_h
#define wasm_support_suffix_tree_h

#include "llvm/Support/Allocator.h"
#include <cassert>
#include <cstddef>
#include <ostream>
#include <vector>

#include "support/suffix_tree_node.h"

using namespace llvm;

namespace wasm {
class SuffixTree {
public:
  /// Each element is an integer representing an instruction in the module.
  // TODO: This was an ArrayRef originally
  std::vector<unsigned> Str;

  /// A repeated substring in the tree.
  struct RepeatedSubstring {
    /// The length of the string.
    unsigned Length;

    /// The start indices of each occurrence.
    std::vector<unsigned> StartIndices;

    bool operator==(const RepeatedSubstring& other) const {
      return Length == other.Length && StartIndices == other.StartIndices;
    }
  };

  friend std::ostream& operator<<(std::ostream& os,
                                  RepeatedSubstring substring) {
    os << "SuffixTree::RepeatedSubstring{" << substring.Length
       << "u, (std::vector<unsigned>{";
    for (unsigned idx = 0; idx < substring.StartIndices.size(); idx++) {
      os << substring.StartIndices[idx];
      if (idx != substring.StartIndices.size() - 1) {
        os << ", ";
      }
    }
    return os << "})}";
  }

private:
  /// Maintains internal nodes in the tree.
  SpecificBumpPtrAllocator<SuffixTreeInternalNode> InternalNodeAllocator;
  /// Maintains leaf nodes in the tree.
  SpecificBumpPtrAllocator<SuffixTreeLeafNode> LeafNodeAllocator;

  /// The root of the suffix tree.
  ///
  /// The root represents the empty string. It is maintained by the
  /// \p NodeAllocator like every other node in the tree.
  SuffixTreeInternalNode* Root = nullptr;

  /// The end index of each leaf in the tree.
  unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx;

  /// Helper struct which keeps track of the next insertion point in
  /// Ukkonen's algorithm.
  struct ActiveState {
    /// The next node to insert at.
    SuffixTreeInternalNode* Node = nullptr;

    /// The index of the first character in the substring currently being added.
    unsigned Idx = SuffixTreeNode::EmptyIdx;

    /// The length of the substring we have to add at the current step.
    unsigned Len = 0;
  };

  /// The point the next insertion will take place at in the
  /// construction algorithm.
  ActiveState Active;

  /// Allocate a leaf node and add it to the tree.
  ///
  /// \param Parent The parent of this node.
  /// \param StartIdx The start index of this node's associated string.
  /// \param Edge The label on the edge leaving \p Parent to this node.
  ///
  /// \returns A pointer to the allocated leaf node.
  SuffixTreeNode*
  insertLeaf(SuffixTreeInternalNode& Parent, unsigned StartIdx, unsigned Edge);

  /// Allocate an internal node and add it to the tree.
  ///
  /// \param Parent The parent of this node. Only null when allocating the root.
  /// \param StartIdx The start index of this node's associated string.
  /// \param EndIdx The end index of this node's associated string.
  /// \param Edge The label on the edge leaving \p Parent to this node.
  ///
  /// \returns A pointer to the allocated internal node.
  SuffixTreeInternalNode* insertInternalNode(SuffixTreeInternalNode* Parent,
                                             unsigned StartIdx,
                                             unsigned EndIdx,
                                             unsigned Edge);

  /// Allocate the root node and add it to the tree.
  ///
  /// \returns A pointer to the root.
  SuffixTreeInternalNode* insertRoot();

  /// Set the suffix indices of the leaves to the start indices of their
  /// respective suffixes.
  void setSuffixIndices();

  /// Construct the suffix tree for the prefix of the input ending at
  /// \p EndIdx.
  ///
  /// Used to construct the full suffix tree iteratively. At the end of each
  /// step, the constructed suffix tree is either a valid suffix tree, or a
  /// suffix tree with implicit suffixes. At the end of the final step, the
  /// suffix tree is a valid tree.
  ///
  /// \param EndIdx The end index of the current prefix in the main string.
  /// \param SuffixesToAdd The number of suffixes that must be added
  /// to complete the suffix tree at the current phase.
  ///
  /// \returns The number of suffixes that have not been added at the end of
  /// this step.
  unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);

public:
  /// Construct a suffix tree from a sequence of unsigned integers.
  ///
  /// \param Str The string to construct the suffix tree for.
  SuffixTree(const std::vector<unsigned>& Str);

  /// Iterator for finding all repeated substrings in the suffix tree.
  struct RepeatedSubstringIterator {
  private:
    /// The current node we're visiting.
    SuffixTreeNode* N = nullptr;

    /// The repeated substring associated with this node.
    RepeatedSubstring RS;

    /// The nodes left to visit.
    std::vector<SuffixTreeInternalNode*> InternalNodesToVisit;

    /// The minimum length of a repeated substring to find.
    /// Since we're outlining, we want at least two instructions in the range.
    /// FIXME: This may not be true for targets like X86 which support many
    /// instruction lengths.
    const unsigned MinLength = 2;

    /// Move the iterator to the next repeated substring.
    void advance();

  public:
    using iterator_category = std::input_iterator_tag;
    using value_type = RepeatedSubstring;
    using difference_type = std::ptrdiff_t;
    using pointer = const RepeatedSubstring*;
    using reference = const RepeatedSubstring&;

    /// Return the current repeated substring.
    RepeatedSubstring& operator*() { return RS; }
    RepeatedSubstring* operator->() { return &RS; }

    RepeatedSubstringIterator& operator++() {
      advance();
      return *this;
    }

    RepeatedSubstringIterator operator++(int I) {
      RepeatedSubstringIterator It(*this);
      advance();
      return It;
    }

    bool operator==(const RepeatedSubstringIterator& Other) const {
      return N == Other.N;
    }
    bool operator!=(const RepeatedSubstringIterator& Other) const {
      return !(*this == Other);
    }

    RepeatedSubstringIterator(SuffixTreeInternalNode* N) : N(N) {
      // Do we have a non-null node?
      if (!N) {
        return;
      }
      // Yes. At the first step, we need to visit all of N's children.
      // Note: This means that we visit N last.
      InternalNodesToVisit.push_back(N);
      advance();
    }
  };

  typedef RepeatedSubstringIterator iterator;
  iterator begin() { return iterator(Root); }
  iterator end() { return iterator(nullptr); }
};

} // namespace wasm

#endif // wasm_support_suffix_tree_h