diff options
author | Ashley Nelson <nashley@google.com> | 2023-10-04 01:58:48 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-04 08:58:48 +0000 |
commit | 2abc1a80d42ccd3d8e85543d104b5a6dee127248 (patch) | |
tree | f5b807fdacc07d38db0353ee0becf321f77c2f33 /src | |
parent | 2ed773f3981a6014c55d039ad88b8e9889ce9eca (diff) | |
download | binaryen-2abc1a80d42ccd3d8e85543d104b5a6dee127248.tar.gz binaryen-2abc1a80d42ccd3d8e85543d104b5a6dee127248.tar.bz2 binaryen-2abc1a80d42ccd3d8e85543d104b5a6dee127248.zip |
[Outlining] Adds SuffixTree::RepeatSubstring dedupe test (#5972)
This PR adds a StringProcessor struct intended to hold functions that filter vectors of SuffixTree::RepeatedSubstring, and a test of its first functionality, removing overlapping repeated substrings.
Diffstat (limited to 'src')
-rw-r--r-- | src/passes/CMakeLists.txt | 9 | ||||
-rw-r--r-- | src/passes/hash-stringify-walker.cpp | 40 | ||||
-rw-r--r-- | src/passes/stringify-walker.h | 7 |
3 files changed, 56 insertions, 0 deletions
diff --git a/src/passes/CMakeLists.txt b/src/passes/CMakeLists.txt index bd29bfa0b..2d04d9a6f 100644 --- a/src/passes/CMakeLists.txt +++ b/src/passes/CMakeLists.txt @@ -12,6 +12,7 @@ string(APPEND WASM_INTRINSICS_EMBED "0x00") configure_file(WasmIntrinsics.cpp.in WasmIntrinsics.cpp @ONLY) FILE(GLOB passes_HEADERS *.h) + set(passes_SOURCES param-utils.cpp pass.cpp @@ -120,4 +121,12 @@ set(passes_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/WasmIntrinsics.cpp ${passes_HEADERS} ) +# The below condition is intended for removal once the suffix_tree and +# suffix_tree_node source files no longer depend on LLVM code in the +# third_party folder +if(EMSCRIPTEN) + list(REMOVE_ITEM passes_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/stringify-walker.h) + list(REMOVE_ITEM passes_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/stringify-walker-impl.h) + list(REMOVE_ITEM passes_SOURCES "hash-stringify-walker.cpp") +endif() add_library(passes OBJECT ${passes_SOURCES}) diff --git a/src/passes/hash-stringify-walker.cpp b/src/passes/hash-stringify-walker.cpp index 3df01b290..9a7cee08f 100644 --- a/src/passes/hash-stringify-walker.cpp +++ b/src/passes/hash-stringify-walker.cpp @@ -68,4 +68,44 @@ void HashStringifyWalker::visitExpression(Expression* curr) { } } +// Deduplicate substrings by iterating through the list of substrings, keeping +// only those whose list of end indices is disjoint from the set of end indices +// for all substrings kept so far. Substrings that are contained within other +// substrings will always share an end index with those other substrings. Note +// that this deduplication may be over-aggressive, since it will remove +// substrings that are contained within any previous substring, even if they +// have many other occurrences that are not inside other substrings. Part of the +// reason dedupe can be so aggressive is an assumption 1) that the input +// substrings have been sorted so that the longest substrings with the most +// repeats come first and 2) these are more worthwhile to keep than subsequent +// substrings of substrings, even if they appear more times. +std::vector<SuffixTree::RepeatedSubstring> StringifyProcessor::dedupe( + const std::vector<SuffixTree::RepeatedSubstring> substrings) { + std::unordered_set<uint32_t> seen; + std::vector<SuffixTree::RepeatedSubstring> result; + for (auto substring : substrings) { + std::vector<uint32_t> idxToInsert; + bool seenEndIdx = false; + for (auto startIdx : substring.StartIndices) { + // We are using the end index to ensure that each repeated substring + // reported by the SuffixTree is unique. This is because LLVM's SuffixTree + // reports back repeat sequences that are substrings of longer repeat + // sequences with the same endIdx, and we generally prefer to outline + // longer repeat sequences. + uint32_t endIdx = substring.Length + startIdx; + if (seen.count(endIdx)) { + seenEndIdx = true; + break; + } + idxToInsert.push_back(endIdx); + } + if (!seenEndIdx) { + seen.insert(idxToInsert.begin(), idxToInsert.end()); + result.push_back(substring); + } + } + + return result; +} + } // namespace wasm diff --git a/src/passes/stringify-walker.h b/src/passes/stringify-walker.h index 7e18d3d9e..c20d250ec 100644 --- a/src/passes/stringify-walker.h +++ b/src/passes/stringify-walker.h @@ -20,6 +20,7 @@ #include "ir/iteration.h" #include "ir/module-utils.h" #include "ir/utils.h" +#include "support/suffix_tree.h" #include "wasm-traversal.h" #include <queue> @@ -133,6 +134,12 @@ struct HashStringifyWalker : public StringifyWalker<HashStringifyWalker> { void visitExpression(Expression* curr); }; +// Functions that filter vectors of SuffixTree::RepeatedSubstring +struct StringifyProcessor { + static std::vector<SuffixTree::RepeatedSubstring> + dedupe(const std::vector<SuffixTree::RepeatedSubstring>); +}; + } // namespace wasm #endif // wasm_passes_stringify_walker_h |