diff options
-rw-r--r-- | src/support/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/support/dfa_minimization.cpp | 362 | ||||
-rw-r--r-- | src/support/dfa_minimization.h | 103 | ||||
-rw-r--r-- | test/gtest/CMakeLists.txt | 1 | ||||
-rw-r--r-- | test/gtest/dfa_minimization.cpp | 135 |
5 files changed, 602 insertions, 0 deletions
diff --git a/src/support/CMakeLists.txt b/src/support/CMakeLists.txt index a02c1f447..2758ee198 100644 --- a/src/support/CMakeLists.txt +++ b/src/support/CMakeLists.txt @@ -5,6 +5,7 @@ set(support_SOURCES colors.cpp command-line.cpp debug.cpp + dfa_minimization.cpp file.cpp istring.cpp path.cpp diff --git a/src/support/dfa_minimization.cpp b/src/support/dfa_minimization.cpp new file mode 100644 index 000000000..5df67c585 --- /dev/null +++ b/src/support/dfa_minimization.cpp @@ -0,0 +1,362 @@ +/* + * Copyright 2023 WebAssembly Community Group participants + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <map> + +#include "dfa_minimization.h" + +namespace wasm::DFA { + +namespace { + +// A vector of initial partitions, each of which is a vector of elements, each +// of which is a vector of successor indices. +using InputGraph = std::vector<std::vector<std::vector<size_t>>>; + +// The Refined Partitions data structure used in Valmari-Lehtinen DFA +// minimization. The translation from terms used in the Valmari-Lehtinen paper +// to the more expanded terms used here is: +// +// Block => Set +// elems => elements +// loc => elementIndices +// sidx => setIndices +// first => beginnings +// end => endings +// mid => pivots +// +struct Partitions { + // The number of sets. + size_t sets = 0; + + // The partitioned elements. Elements in the same set are next to each other. + // Within each set, "marked" elements come first followed by "unmarked" + // elements. + std::vector<size_t> elements; + + // Maps elements to their indices in `elements`. + std::vector<size_t> elementIndices; + + // Maps elements to their sets, identified by an index. + std::vector<size_t> setIndices; + + // Maps sets to the indices of their first elements in `elements`. + std::vector<size_t> beginnings; + + // Maps sets to (one past) the indices of their ends in `elements`. + std::vector<size_t> endings; + + // Maps sets to the indices of their first unmarked elements in `elements`. + std::vector<size_t> pivots; + + Partitions() = default; + + // Allocate space up front so we never need to re-allocate. The actual + // contents of all the vectors will need to be externally initialized, + // though. + Partitions(size_t size) + : elements(size), elementIndices(size), setIndices(size), beginnings(size), + endings(size), pivots(size) {} + + struct Set { + using Iterator = std::vector<size_t>::iterator; + + Partitions& partitions; + size_t index; + + Set(Partitions& partitions, size_t index) + : partitions(partitions), index(index) {} + + Iterator begin() { + return partitions.elements.begin() + partitions.beginnings[index]; + } + Iterator end() { + return partitions.elements.begin() + partitions.endings[index]; + } + size_t size() { + return partitions.endings[index] - partitions.beginnings[index]; + } + + bool hasMarks() { + return partitions.pivots[index] != partitions.beginnings[index]; + } + + // Split the set between marked and unmarked elements if there are both + // marked and unmarked elements. Unmark all elements of this set regardless. + // Return the index of the new partition or 0 if there was no split. + size_t split() { + size_t begin = partitions.beginnings[index]; + size_t end = partitions.endings[index]; + size_t pivot = partitions.pivots[index]; + if (pivot == begin) { + // No elements marked, so there is nothing to do. + return 0; + } + if (pivot == end) { + // All elements were marked, so just unmark them. + partitions.pivots[index] = begin; + return 0; + } + // Create a new set covering the marked region. + size_t newIndex = partitions.sets++; + partitions.beginnings[newIndex] = begin; + partitions.pivots[newIndex] = begin; + partitions.endings[newIndex] = pivot; + for (size_t i = begin; i < pivot; ++i) { + partitions.setIndices[partitions.elements[i]] = newIndex; + } + // Update the old set. The end and pivot are already correct. + partitions.beginnings[index] = pivot; + return newIndex; + } + }; + + Set getSet(size_t index) { return {*this, index}; } + + // Returns the set containing an element, which can be iterated upon. The set + // may be invalidated by calls to `mark` or `Set::split`. + Set getSetForElem(size_t element) { return getSet(setIndices[element]); } + + void mark(size_t element) { + size_t index = elementIndices[element]; + size_t set = setIndices[element]; + size_t pivot = pivots[set]; + if (index >= pivot) { + // Move the pivot element into the location of the newly marked element. + elements[index] = elements[pivot]; + elementIndices[elements[index]] = index; + // Move the newly marked element into the pivot location. + elements[pivot] = element; + elementIndices[element] = pivot; + // Update the pivot index to mark the element. + ++pivots[set]; + } + } +}; + +Partitions initializeStatePartitions(const InputGraph& inputGraph, + size_t numElements) { + Partitions partitions(numElements); + size_t elementIndex = 0; + for (const auto& partition : inputGraph) { + size_t set = partitions.sets++; + partitions.beginnings[set] = elementIndex; + partitions.pivots[set] = elementIndex; + for (size_t i = 0; i < partition.size(); ++i) { + partitions.elements[elementIndex] = elementIndex; + partitions.elementIndices[elementIndex] = elementIndex; + partitions.setIndices[elementIndex] = set; + ++elementIndex; + } + partitions.endings[set] = elementIndex; + } + return partitions; +} + +// A DFA transition into a state. +struct Transition { + size_t pred; + size_t label; +}; + +void initializeTransitions(const InputGraph& inputGraph, + size_t numElements, + size_t numTransitions, + std::vector<Transition>& transitions, + std::vector<size_t>& transitionIndices) { + // Find the transitions into each state. Map destinations to input + // transitions. + std::map<size_t, std::vector<Transition>> transitionMap; + size_t elementIndex = 0; + for (const auto& partition : inputGraph) { + for (const auto& elem : partition) { + size_t label = 0; + for (const auto& succ : elem) { + transitionMap[succ].push_back({elementIndex, label++}); + } + ++elementIndex; + } + } + + // Populate `transitions` and `transitionIndices`. + transitions.reserve(numTransitions); + transitionIndices.reserve(numElements + 1); + for (size_t dest = 0; dest < numElements; ++dest) { + // Record the first index of transitions leading to `dest`. + transitionIndices.push_back(transitions.size()); + if (auto it = transitionMap.find(dest); it != transitionMap.end()) { + transitions.insert( + transitions.end(), it->second.begin(), it->second.end()); + } + } + // Record one-past the end of the transitions leading to the final `dest`. + transitionIndices.push_back(transitions.size()); +} + +Partitions +initializeSplitterPartitions(Partitions& partitions, + const std::vector<Transition>& transitions, + const std::vector<size_t>& transitionIndices) { + // The initial sets of splitters are partitioned by destination state + // partition and transition label. + Partitions splitters(transitions.size()); + size_t elementIndex = 0; + for (size_t statePartition = 0; statePartition < partitions.sets; + ++statePartition) { + // The in-transitions leading to states in the current partition, organized + // by transition label. + std::map<size_t, std::vector<size_t>> currTransitions; + for (size_t state : partitions.getSet(statePartition)) { + for (size_t transition = transitionIndices[state], + end = transitionIndices[state + 1]; + transition < end; + ++transition) { + currTransitions[transitions[transition].label].push_back(transition); + } + } + // Create a splitter partition for each in-transition label leading to the + // current state partition. + for (auto& pair : currTransitions) { + size_t set = splitters.sets++; + splitters.beginnings[set] = elementIndex; + splitters.pivots[set] = elementIndex; + for (size_t transition : pair.second) { + splitters.elements[elementIndex] = transition; + splitters.elementIndices[transition] = elementIndex; + splitters.setIndices[transition] = set; + ++elementIndex; + } + splitters.endings[set] = elementIndex; + } + } + return splitters; +} + +} // anonymous namespace + +namespace Internal { + +std::vector<std::vector<size_t>> +refinePartitionsImpl(const InputGraph& inputGraph) { + // Find the number of states and transitions. + size_t numElements = 0; + size_t numTransitions = 0; + for (const auto& partition : inputGraph) { + numElements += partition.size(); + for (const auto& elem : partition) { + numTransitions += elem.size(); + } + } + + // The partitions of DFA states. + Partitions partitions = initializeStatePartitions(inputGraph, numElements); + + // The transitions arranged such that the transitions leading to state `q` are + // `transitions[transitionIndices[q] : transitionIndices[q+1]]`. + std::vector<Transition> transitions; + std::vector<size_t> transitionIndices; + initializeTransitions( + inputGraph, numElements, numTransitions, transitions, transitionIndices); + + // The splitters, which are partitions of the input transitions. + Partitions splitters = + initializeSplitterPartitions(partitions, transitions, transitionIndices); + + // The list of splitter partitions that might be able to split states in some + // state partition. Starts out containing all splitter partitions. + std::vector<size_t> potentialSplitters; + potentialSplitters.reserve(splitters.sets); + for (size_t i = 0; i < splitters.sets; ++i) { + potentialSplitters.push_back(i); + } + + while (!potentialSplitters.empty()) { + size_t potentialSplitter = potentialSplitters.back(); + potentialSplitters.pop_back(); + + // The partitions that may be able to be split. + std::vector<size_t> markedPartitions; + + // Mark states that are predecessors via this splitter partition. + for (size_t transition : splitters.getSet(potentialSplitter)) { + size_t state = transitions[transition].pred; + auto partition = partitions.getSetForElem(state); + if (!partition.hasMarks()) { + markedPartitions.push_back(partition.index); + } + partitions.mark(state); + } + + // Try to split each partition with marked states. + for (size_t partition : markedPartitions) { + size_t newPartition = partitions.getSet(partition).split(); + if (!newPartition) { + // There was nothing to split. + continue; + } + + // We only want to keep using the smaller of the two split partitions. + if (partitions.getSet(newPartition).size() < + partitions.getSet(partition).size()) { + newPartition = partition; + } + + // The splitter partitions that may need to be split to match the new + // split of the state partitions. + std::vector<size_t> markedSplitters; + + // Mark transitions that lead to the newly split off states. + for (size_t state : partitions.getSet(newPartition)) { + for (size_t t = transitionIndices[state], + end = transitionIndices[state + 1]; + t < end; + ++t) { + auto splitter = splitters.getSetForElem(t); + if (!splitter.hasMarks()) { + markedSplitters.push_back(splitter.index); + } + splitters.mark(t); + } + } + + // Split the splitters and update `potentialSplitters`. + for (size_t splitter : markedSplitters) { + size_t newSplitter = splitters.getSet(splitter).split(); + if (newSplitter) { + potentialSplitters.push_back(newSplitter); + } + } + } + } + + // Return the refined partitions. + std::vector<std::vector<size_t>> resultPartitions; + resultPartitions.reserve(partitions.sets); + for (size_t p = 0; p < partitions.sets; ++p) { + auto partition = partitions.getSet(p); + std::vector<size_t> resultPartition; + resultPartition.reserve(partition.size()); + for (size_t elem : partition) { + resultPartition.push_back(elem); + } + resultPartitions.emplace_back(std::move(resultPartition)); + } + return resultPartitions; +} + +} // namespace Internal + +} // namespace wasm::DFA diff --git a/src/support/dfa_minimization.h b/src/support/dfa_minimization.h new file mode 100644 index 000000000..a641022a7 --- /dev/null +++ b/src/support/dfa_minimization.h @@ -0,0 +1,103 @@ +/* + * Copyright 2023 WebAssembly Community Group participants + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Use the Valmari-Lehtinen DFA minimization algorithm +// (https://arxiv.org/pdf/0802.2826.pdf) to find equivalent elements in a +// user-provided DFA. + +#ifndef wasm_support_dfa_minimization_h +#define wasm_support_dfa_minimization_h + +#include <cassert> +#include <cstddef> +#include <unordered_map> +#include <vector> + +namespace wasm::DFA { + +namespace Internal { + +std::vector<std::vector<size_t>> +refinePartitionsImpl(const std::vector<std::vector<std::vector<size_t>>>&); + +} // namespace Internal + +// A DFA state with an ordered list of successor states. +template<typename T> struct State { + T val; + std::vector<T> succs; +}; + +// Given a vector of initial state partitions in which states known to be +// different are in different partitions, return a vector of refined partitions, +// each containing a different equivalence class of states. No value should +// appear more than once in the input. All successor values should appear in +// some top-level partition. +template<typename T> +std::vector<std::vector<T>> +refinePartitions(const std::vector<std::vector<State<T>>>& partitions) { + // Map values to indices and vice versa. + std::unordered_map<T, size_t> indices; + std::vector<T> values; + for (const auto& partition : partitions) { + for (const auto& state : partition) { + [[maybe_unused]] bool inserted = + indices.insert({state.val, indices.size()}).second; + assert(inserted && "unexpected repeated value"); + values.push_back(state.val); + } + } + + // Create a copy of the DFA that uses indices instead of the original values. + std::vector<std::vector<std::vector<size_t>>> indexPartitions; + indexPartitions.reserve(partitions.size()); + for (const auto& partition : partitions) { + std::vector<std::vector<size_t>> indexPartition; + indexPartition.reserve(partition.size()); + for (const auto& state : partition) { + std::vector<size_t> succIndices; + succIndices.reserve(state.succs.size()); + for (const auto& succ : state.succs) { + auto it = indices.find(succ); + assert(it != indices.end() && "unknown successor value"); + succIndices.push_back(it->second); + } + indexPartition.emplace_back(std::move(succIndices)); + } + indexPartitions.emplace_back(std::move(indexPartition)); + } + + // Refine the partitions. + auto indexResults = Internal::refinePartitionsImpl(indexPartitions); + + // Map the refined partitions of indices back to values. + std::vector<std::vector<T>> results; + results.reserve(indexResults.size()); + for (const auto& indexPartition : indexResults) { + std::vector<T> partition; + partition.reserve(indexPartition.size()); + for (size_t index : indexPartition) { + partition.push_back(values[index]); + } + results.emplace_back(std::move(partition)); + } + + return results; +} + +} // namespace wasm::DFA + +#endif // wasm_support_dfa_minimization_h diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index 548f01ff6..21b7ad4ad 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -2,6 +2,7 @@ include_directories(../../third_party/googletest/googletest/include) include_directories(../../src/wasm) set(unittest_SOURCES + dfa_minimization.cpp possible-contents.cpp type-builder.cpp wat-lexer.cpp diff --git a/test/gtest/dfa_minimization.cpp b/test/gtest/dfa_minimization.cpp new file mode 100644 index 000000000..a63eefde1 --- /dev/null +++ b/test/gtest/dfa_minimization.cpp @@ -0,0 +1,135 @@ +/* + * Copyright 2023 WebAssembly Community Group participants + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <set> +#include <string> +#include <vector> + +#include "support/dfa_minimization.h" +#include "gtest/gtest.h" + +using namespace wasm; + +using State = DFA::State<std::string>; +using Graph = std::vector<std::vector<State>>; +using Results = std::vector<std::vector<std::string>>; +using SetResults = std::set<std::set<std::string>>; + +// We don't care about order between or within the output partitions. +SetResults settify(const Results& vecs) { + SetResults sets; + for (const auto& vec : vecs) { + sets.emplace(vec.begin(), vec.end()); + } + return sets; +} + +// Check that the same values appear in the input and output and that each value +// appears once. +void validate(const Graph& graph, const Results& results) { + std::set<std::string> inputVals; + for (const auto& partition : graph) { + for (const auto& state : partition) { + bool inserted = inputVals.insert(state.val).second; + ASSERT_TRUE(inserted); + } + } + + std::set<std::string> outputVals; + for (const auto& partition : results) { + for (const auto& val : partition) { + ASSERT_NE(inputVals.find(val), inputVals.end()); + ASSERT_TRUE(outputVals.insert(val).second); + } + } + + ASSERT_EQ(outputVals.size(), inputVals.size()); +} + +TEST(DFATest, Empty) { + Graph graph = {}; + auto results = DFA::refinePartitions(graph); + validate(graph, results); + EXPECT_EQ(settify(results), SetResults{}); +} + +TEST(DFATest, Singleton) { + Graph graph = {{{"A", {}}}}; + auto results = DFA::refinePartitions(graph); + validate(graph, results); + EXPECT_EQ(settify(results), SetResults{{"A"}}); +} + +TEST(DFATest, CyclePartitioned) { + // The partitions are already maximally refined, so shouldn't change. + State a{"A", {"B"}}; + State b{"B", {"C"}}; + State c{"C", {"A"}}; + Graph graph = {{a}, {b}, {c}}; + auto results = DFA::refinePartitions(graph); + validate(graph, results); + EXPECT_EQ(settify(results), (SetResults{{"A"}, {"B"}, {"C"}})); +} + +TEST(DFATest, CycleGrouped) { + // All the states are identical, so the the partition shouldn't change. + State a{"A", {"B"}}; + State b{"B", {"C"}}; + State c{"C", {"A"}}; + Graph graph = {{a, b, c}}; + auto results = DFA::refinePartitions(graph); + validate(graph, results); + EXPECT_EQ(settify(results), (SetResults{{"A", "B", "C"}})); +} + +TEST(DFATest, CycleRefined) { + // A and B are distinguishable, so the partitions should be refined. + State a{"A", {"B"}}; + State b{"B", {"C"}}; + State c{"C", {"A"}}; + Graph graph = {{a, b}, {c}}; + auto results = DFA::refinePartitions(graph); + validate(graph, results); + EXPECT_EQ(settify(results), (SetResults{{"A"}, {"B"}, {"C"}})); +} + +TEST(DFATest, ChainZip) { + // Chain A->B->C->D is identical to chain W->X->Y->Z. + State a{"A", {"B"}}, w{"W", {"X"}}; + State b{"B", {"C"}}, x{"X", {"Y"}}; + State c{"C", {"D"}}, y{"Y", {"Z"}}; + State d{"D", {}}, z{"Z", {}}; + Graph graph = {{a, b, c, d, w, x, y, z}}; + auto results = DFA::refinePartitions(graph); + validate(graph, results); + EXPECT_EQ(settify(results), + (SetResults{{"A", "W"}, {"B", "X"}, {"C", "Y"}, {"D", "Z"}})); +} + +TEST(DFATest, ChainUnzip) { + // Chain A->B->C->D looks a lot like chain W->X->Y->Z, but since Z is + // distinguishable, the full chains end up being separated. + State a{"A", {"B"}}, w{"W", {"X"}}; + State b{"B", {"C"}}, x{"X", {"Y"}}; + State c{"C", {"D"}}, y{"Y", {"Z"}}; + State d{"D", {}}, z{"Z", {}}; + Graph graph = {{a, b, c, d, w, x, y}, {z}}; + auto results = DFA::refinePartitions(graph); + validate(graph, results); + EXPECT_EQ( + settify(results), + (SetResults{{"A"}, {"W"}, {"B"}, {"X"}, {"C"}, {"Y"}, {"D"}, {"Z"}})); +} |