diff options
author | Ashley Nelson <nashley@google.com> | 2023-10-04 01:58:48 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-04 08:58:48 +0000 |
commit | 2abc1a80d42ccd3d8e85543d104b5a6dee127248 (patch) | |
tree | f5b807fdacc07d38db0353ee0becf321f77c2f33 /src/passes/hash-stringify-walker.cpp | |
parent | 2ed773f3981a6014c55d039ad88b8e9889ce9eca (diff) | |
download | binaryen-2abc1a80d42ccd3d8e85543d104b5a6dee127248.tar.gz binaryen-2abc1a80d42ccd3d8e85543d104b5a6dee127248.tar.bz2 binaryen-2abc1a80d42ccd3d8e85543d104b5a6dee127248.zip |
[Outlining] Adds SuffixTree::RepeatSubstring dedupe test (#5972)
This PR adds a StringProcessor struct intended to hold functions that filter vectors of SuffixTree::RepeatedSubstring, and a test of its first functionality, removing overlapping repeated substrings.
Diffstat (limited to 'src/passes/hash-stringify-walker.cpp')
-rw-r--r-- | src/passes/hash-stringify-walker.cpp | 40 |
1 files changed, 40 insertions, 0 deletions
diff --git a/src/passes/hash-stringify-walker.cpp b/src/passes/hash-stringify-walker.cpp index 3df01b290..9a7cee08f 100644 --- a/src/passes/hash-stringify-walker.cpp +++ b/src/passes/hash-stringify-walker.cpp @@ -68,4 +68,44 @@ void HashStringifyWalker::visitExpression(Expression* curr) { } } +// Deduplicate substrings by iterating through the list of substrings, keeping +// only those whose list of end indices is disjoint from the set of end indices +// for all substrings kept so far. Substrings that are contained within other +// substrings will always share an end index with those other substrings. Note +// that this deduplication may be over-aggressive, since it will remove +// substrings that are contained within any previous substring, even if they +// have many other occurrences that are not inside other substrings. Part of the +// reason dedupe can be so aggressive is an assumption 1) that the input +// substrings have been sorted so that the longest substrings with the most +// repeats come first and 2) these are more worthwhile to keep than subsequent +// substrings of substrings, even if they appear more times. +std::vector<SuffixTree::RepeatedSubstring> StringifyProcessor::dedupe( + const std::vector<SuffixTree::RepeatedSubstring> substrings) { + std::unordered_set<uint32_t> seen; + std::vector<SuffixTree::RepeatedSubstring> result; + for (auto substring : substrings) { + std::vector<uint32_t> idxToInsert; + bool seenEndIdx = false; + for (auto startIdx : substring.StartIndices) { + // We are using the end index to ensure that each repeated substring + // reported by the SuffixTree is unique. This is because LLVM's SuffixTree + // reports back repeat sequences that are substrings of longer repeat + // sequences with the same endIdx, and we generally prefer to outline + // longer repeat sequences. + uint32_t endIdx = substring.Length + startIdx; + if (seen.count(endIdx)) { + seenEndIdx = true; + break; + } + idxToInsert.push_back(endIdx); + } + if (!seenEndIdx) { + seen.insert(idxToInsert.begin(), idxToInsert.end()); + result.push_back(substring); + } + } + + return result; +} + } // namespace wasm |