diff options
Diffstat (limited to 'src/support/suffix_tree.h')
-rw-r--r-- | src/support/suffix_tree.h | 211 |
1 files changed, 211 insertions, 0 deletions
diff --git a/src/support/suffix_tree.h b/src/support/suffix_tree.h new file mode 100644 index 000000000..474984dcd --- /dev/null +++ b/src/support/suffix_tree.h @@ -0,0 +1,211 @@ +// This file began as an import from LLVM, and so it has the same license as +// LLVM, copied below together with the code. + +//===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// A data structure for fast substring queries. +// +// Suffix trees represent the suffixes of their input strings in their leaves. +// A suffix tree is a type of compressed trie structure where each node +// represents an entire substring rather than a single character. Each leaf +// of the tree is a suffix. +// +// A suffix tree can be seen as a type of state machine where each state is a +// substring of the full string. The tree is structured so that, for a string +// of length N, there are exactly N leaves in the tree. This structure allows +// us to quickly find repeated substrings of the input string. +// +// In this implementation, a "string" is a vector of unsigned integers. +// These integers may result from hashing some data type. A suffix tree can +// contain 1 or many strings, which can then be queried as one large string. +// +// The suffix tree is implemented using Ukkonen's algorithm for linear-time +// suffix tree construction. Ukkonen's algorithm is explained in more detail +// in the paper by Esko Ukkonen "On-line construction of suffix trees. The +// paper is available at +// +// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf +//===----------------------------------------------------------------------===// + +#ifndef wasm_support_suffix_tree_h +#define wasm_support_suffix_tree_h + +#include "llvm/Support/Allocator.h" +#include <cassert> +#include <cstddef> +#include <vector> + +#include "support/suffix_tree_node.h" + +using namespace llvm; + +namespace wasm { +class SuffixTree { +public: + /// Each element is an integer representing an instruction in the module. + // TODO: This was an ArrayRef originally + std::vector<unsigned> Str; + + /// A repeated substring in the tree. + struct RepeatedSubstring { + /// The length of the string. + unsigned Length; + + /// The start indices of each occurrence. + std::vector<unsigned> StartIndices; + }; + +private: + /// Maintains internal nodes in the tree. + SpecificBumpPtrAllocator<SuffixTreeInternalNode> InternalNodeAllocator; + /// Maintains leaf nodes in the tree. + SpecificBumpPtrAllocator<SuffixTreeLeafNode> LeafNodeAllocator; + + /// The root of the suffix tree. + /// + /// The root represents the empty string. It is maintained by the + /// \p NodeAllocator like every other node in the tree. + SuffixTreeInternalNode* Root = nullptr; + + /// The end index of each leaf in the tree. + unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx; + + /// Helper struct which keeps track of the next insertion point in + /// Ukkonen's algorithm. + struct ActiveState { + /// The next node to insert at. + SuffixTreeInternalNode* Node = nullptr; + + /// The index of the first character in the substring currently being added. + unsigned Idx = SuffixTreeNode::EmptyIdx; + + /// The length of the substring we have to add at the current step. + unsigned Len = 0; + }; + + /// The point the next insertion will take place at in the + /// construction algorithm. + ActiveState Active; + + /// Allocate a leaf node and add it to the tree. + /// + /// \param Parent The parent of this node. + /// \param StartIdx The start index of this node's associated string. + /// \param Edge The label on the edge leaving \p Parent to this node. + /// + /// \returns A pointer to the allocated leaf node. + SuffixTreeNode* + insertLeaf(SuffixTreeInternalNode& Parent, unsigned StartIdx, unsigned Edge); + + /// Allocate an internal node and add it to the tree. + /// + /// \param Parent The parent of this node. Only null when allocating the root. + /// \param StartIdx The start index of this node's associated string. + /// \param EndIdx The end index of this node's associated string. + /// \param Edge The label on the edge leaving \p Parent to this node. + /// + /// \returns A pointer to the allocated internal node. + SuffixTreeInternalNode* insertInternalNode(SuffixTreeInternalNode* Parent, + unsigned StartIdx, + unsigned EndIdx, + unsigned Edge); + + /// Allocate the root node and add it to the tree. + /// + /// \returns A pointer to the root. + SuffixTreeInternalNode* insertRoot(); + + /// Set the suffix indices of the leaves to the start indices of their + /// respective suffixes. + void setSuffixIndices(); + + /// Construct the suffix tree for the prefix of the input ending at + /// \p EndIdx. + /// + /// Used to construct the full suffix tree iteratively. At the end of each + /// step, the constructed suffix tree is either a valid suffix tree, or a + /// suffix tree with implicit suffixes. At the end of the final step, the + /// suffix tree is a valid tree. + /// + /// \param EndIdx The end index of the current prefix in the main string. + /// \param SuffixesToAdd The number of suffixes that must be added + /// to complete the suffix tree at the current phase. + /// + /// \returns The number of suffixes that have not been added at the end of + /// this step. + unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd); + +public: + /// Construct a suffix tree from a sequence of unsigned integers. + /// + /// \param Str The string to construct the suffix tree for. + SuffixTree(const std::vector<unsigned>& Str); + + /// Iterator for finding all repeated substrings in the suffix tree. + struct RepeatedSubstringIterator { + private: + /// The current node we're visiting. + SuffixTreeNode* N = nullptr; + + /// The repeated substring associated with this node. + RepeatedSubstring RS; + + /// The nodes left to visit. + std::vector<SuffixTreeInternalNode*> InternalNodesToVisit; + + /// The minimum length of a repeated substring to find. + /// Since we're outlining, we want at least two instructions in the range. + /// FIXME: This may not be true for targets like X86 which support many + /// instruction lengths. + const unsigned MinLength = 2; + + /// Move the iterator to the next repeated substring. + void advance(); + + public: + /// Return the current repeated substring. + RepeatedSubstring& operator*() { return RS; } + + RepeatedSubstringIterator& operator++() { + advance(); + return *this; + } + + RepeatedSubstringIterator operator++(int I) { + RepeatedSubstringIterator It(*this); + advance(); + return It; + } + + bool operator==(const RepeatedSubstringIterator& Other) const { + return N == Other.N; + } + bool operator!=(const RepeatedSubstringIterator& Other) const { + return !(*this == Other); + } + + RepeatedSubstringIterator(SuffixTreeInternalNode* N) : N(N) { + // Do we have a non-null node? + if (!N) { + return; + } + // Yes. At the first step, we need to visit all of N's children. + // Note: This means that we visit N last. + InternalNodesToVisit.push_back(N); + advance(); + } + }; + + typedef RepeatedSubstringIterator iterator; + iterator begin() { return iterator(Root); } + iterator end() { return iterator(nullptr); } +}; + +} // namespace wasm + +#endif // wasm_support_suffix_tree_h |