[Outlining] Adds SuffixTree::RepeatSubstring dedupe test (#5972)

This PR adds a StringProcessor struct intended to hold functions that filter vectors of SuffixTree::RepeatedSubstring, and a test of its first functionality, removing overlapping repeated substrings.
author: Ashley Nelson <nashley@google.com> 2023-10-04 01:58:48 -0700
committer: GitHub <noreply@github.com> 2023-10-04 08:58:48 +0000
commit: 2abc1a80d42ccd3d8e85543d104b5a6dee127248 (patch)
tree: f5b807fdacc07d38db0353ee0becf321f77c2f33 /src/passes/hash-stringify-walker.cpp
parent: 2ed773f3981a6014c55d039ad88b8e9889ce9eca (diff)
download: binaryen-2abc1a80d42ccd3d8e85543d104b5a6dee127248.tar.gz
binaryen-2abc1a80d42ccd3d8e85543d104b5a6dee127248.tar.bz2
binaryen-2abc1a80d42ccd3d8e85543d104b5a6dee127248.zip
1 files changed, 40 insertions, 0 deletions
diff --git a/src/passes/hash-stringify-walker.cpp b/src/passes/hash-stringify-walker.cpp
index 3df01b290..9a7cee08f 100644
--- a/src/passes/hash-stringify-walker.cpp
+++ b/src/passes/hash-stringify-walker.cpp
@@ -68,4 +68,44 @@ void HashStringifyWalker::visitExpression(Expression* curr) {
   }
 }
 
+// Deduplicate substrings by iterating through the list of substrings, keeping
+// only those whose list of end indices is disjoint from the set of end indices
+// for all substrings kept so far. Substrings that are contained within other
+// substrings will always share an end index with those other substrings. Note
+// that this deduplication may be over-aggressive, since it will remove
+// substrings that are contained within any previous substring, even if they
+// have many other occurrences that are not inside other substrings. Part of the
+// reason dedupe can be so aggressive is an assumption 1) that the input
+// substrings have been sorted so that the longest substrings with the most
+// repeats come first and 2) these are more worthwhile to keep than subsequent
+// substrings of substrings, even if they appear more times.
+std::vector<SuffixTree::RepeatedSubstring> StringifyProcessor::dedupe(
+  const std::vector<SuffixTree::RepeatedSubstring> substrings) {
+  std::unordered_set<uint32_t> seen;
+  std::vector<SuffixTree::RepeatedSubstring> result;
+  for (auto substring : substrings) {
+    std::vector<uint32_t> idxToInsert;
+    bool seenEndIdx = false;
+    for (auto startIdx : substring.StartIndices) {
+      // We are using the end index to ensure that each repeated substring
+      // reported by the SuffixTree is unique. This is because LLVM's SuffixTree
+      // reports back repeat sequences that are substrings of longer repeat
+      // sequences with the same endIdx, and we generally prefer to outline
+      // longer repeat sequences.
+      uint32_t endIdx = substring.Length + startIdx;
+      if (seen.count(endIdx)) {
+        seenEndIdx = true;
+        break;
+      }
+      idxToInsert.push_back(endIdx);
+    }
+    if (!seenEndIdx) {
+      seen.insert(idxToInsert.begin(), idxToInsert.end());
+      result.push_back(substring);
+    }
+  }
+
+  return result;
+}
+
 } // namespace wasm
author	Ashley Nelson <nashley@google.com>	2023-10-04 01:58:48 -0700
committer	GitHub <noreply@github.com>	2023-10-04 08:58:48 +0000
commit	2abc1a80d42ccd3d8e85543d104b5a6dee127248 (patch)
tree	f5b807fdacc07d38db0353ee0becf321f77c2f33 /src/passes/hash-stringify-walker.cpp
parent	2ed773f3981a6014c55d039ad88b8e9889ce9eca (diff)
download	binaryen-2abc1a80d42ccd3d8e85543d104b5a6dee127248.tar.gz binaryen-2abc1a80d42ccd3d8e85543d104b5a6dee127248.tar.bz2 binaryen-2abc1a80d42ccd3d8e85543d104b5a6dee127248.zip