summaryrefslogtreecommitdiff
path: root/src/passes/hash-stringify-walker.cpp
diff options
context:
space:
mode:
authorAshley Nelson <nashley@google.com>2023-10-04 01:58:48 -0700
committerGitHub <noreply@github.com>2023-10-04 08:58:48 +0000
commit2abc1a80d42ccd3d8e85543d104b5a6dee127248 (patch)
treef5b807fdacc07d38db0353ee0becf321f77c2f33 /src/passes/hash-stringify-walker.cpp
parent2ed773f3981a6014c55d039ad88b8e9889ce9eca (diff)
downloadbinaryen-2abc1a80d42ccd3d8e85543d104b5a6dee127248.tar.gz
binaryen-2abc1a80d42ccd3d8e85543d104b5a6dee127248.tar.bz2
binaryen-2abc1a80d42ccd3d8e85543d104b5a6dee127248.zip
[Outlining] Adds SuffixTree::RepeatSubstring dedupe test (#5972)
This PR adds a StringProcessor struct intended to hold functions that filter vectors of SuffixTree::RepeatedSubstring, and a test of its first functionality, removing overlapping repeated substrings.
Diffstat (limited to 'src/passes/hash-stringify-walker.cpp')
-rw-r--r--src/passes/hash-stringify-walker.cpp40
1 files changed, 40 insertions, 0 deletions
diff --git a/src/passes/hash-stringify-walker.cpp b/src/passes/hash-stringify-walker.cpp
index 3df01b290..9a7cee08f 100644
--- a/src/passes/hash-stringify-walker.cpp
+++ b/src/passes/hash-stringify-walker.cpp
@@ -68,4 +68,44 @@ void HashStringifyWalker::visitExpression(Expression* curr) {
}
}
+// Deduplicate substrings by iterating through the list of substrings, keeping
+// only those whose list of end indices is disjoint from the set of end indices
+// for all substrings kept so far. Substrings that are contained within other
+// substrings will always share an end index with those other substrings. Note
+// that this deduplication may be over-aggressive, since it will remove
+// substrings that are contained within any previous substring, even if they
+// have many other occurrences that are not inside other substrings. Part of the
+// reason dedupe can be so aggressive is an assumption 1) that the input
+// substrings have been sorted so that the longest substrings with the most
+// repeats come first and 2) these are more worthwhile to keep than subsequent
+// substrings of substrings, even if they appear more times.
+std::vector<SuffixTree::RepeatedSubstring> StringifyProcessor::dedupe(
+ const std::vector<SuffixTree::RepeatedSubstring> substrings) {
+ std::unordered_set<uint32_t> seen;
+ std::vector<SuffixTree::RepeatedSubstring> result;
+ for (auto substring : substrings) {
+ std::vector<uint32_t> idxToInsert;
+ bool seenEndIdx = false;
+ for (auto startIdx : substring.StartIndices) {
+ // We are using the end index to ensure that each repeated substring
+ // reported by the SuffixTree is unique. This is because LLVM's SuffixTree
+ // reports back repeat sequences that are substrings of longer repeat
+ // sequences with the same endIdx, and we generally prefer to outline
+ // longer repeat sequences.
+ uint32_t endIdx = substring.Length + startIdx;
+ if (seen.count(endIdx)) {
+ seenEndIdx = true;
+ break;
+ }
+ idxToInsert.push_back(endIdx);
+ }
+ if (!seenEndIdx) {
+ seen.insert(idxToInsert.begin(), idxToInsert.end());
+ result.push_back(substring);
+ }
+ }
+
+ return result;
+}
+
} // namespace wasm