5 files changed, 602 insertions, 0 deletions
diff --git a/src/support/CMakeLists.txt b/src/support/CMakeLists.txt
index a02c1f447..2758ee198 100644
--- a/src/support/CMakeLists.txt
+++ b/src/support/CMakeLists.txt
@@ -5,6 +5,7 @@ set(support_SOURCES
   colors.cpp
   command-line.cpp
   debug.cpp
+  dfa_minimization.cpp
   file.cpp
   istring.cpp
   path.cpp
diff --git a/src/support/dfa_minimization.cpp b/src/support/dfa_minimization.cpp
new file mode 100644
index 000000000..5df67c585
--- /dev/null
+++ b/src/support/dfa_minimization.cpp
@@ -0,0 +1,362 @@
+/*
+ * Copyright 2023 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <map>
+
+#include "dfa_minimization.h"
+
+namespace wasm::DFA {
+
+namespace {
+
+// A vector of initial partitions, each of which is a vector of elements, each
+// of which is a vector of successor indices.
+using InputGraph = std::vector<std::vector<std::vector<size_t>>>;
+
+// The Refined Partitions data structure used in Valmari-Lehtinen DFA
+// minimization. The translation from terms used in the Valmari-Lehtinen paper
+// to the more expanded terms used here is:
+//
+//   Block => Set
+//   elems => elements
+//   loc => elementIndices
+//   sidx => setIndices
+//   first => beginnings
+//   end => endings
+//   mid => pivots
+//
+struct Partitions {
+  // The number of sets.
+  size_t sets = 0;
+
+  // The partitioned elements. Elements in the same set are next to each other.
+  // Within each set, "marked" elements come first followed by "unmarked"
+  // elements.
+  std::vector<size_t> elements;
+
+  // Maps elements to their indices in `elements`.
+  std::vector<size_t> elementIndices;
+
+  // Maps elements to their sets, identified by an index.
+  std::vector<size_t> setIndices;
+
+  // Maps sets to the indices of their first elements in `elements`.
+  std::vector<size_t> beginnings;
+
+  // Maps sets to (one past) the indices of their ends in `elements`.
+  std::vector<size_t> endings;
+
+  // Maps sets to the indices of their first unmarked elements in `elements`.
+  std::vector<size_t> pivots;
+
+  Partitions() = default;
+
+  // Allocate space up front so we never need to re-allocate. The actual
+  // contents of all the vectors will need to be externally initialized,
+  // though.
+  Partitions(size_t size)
+    : elements(size), elementIndices(size), setIndices(size), beginnings(size),
+      endings(size), pivots(size) {}
+
+  struct Set {
+    using Iterator = std::vector<size_t>::iterator;
+
+    Partitions& partitions;
+    size_t index;
+
+    Set(Partitions& partitions, size_t index)
+      : partitions(partitions), index(index) {}
+
+    Iterator begin() {
+      return partitions.elements.begin() + partitions.beginnings[index];
+    }
+    Iterator end() {
+      return partitions.elements.begin() + partitions.endings[index];
+    }
+    size_t size() {
+      return partitions.endings[index] - partitions.beginnings[index];
+    }
+
+    bool hasMarks() {
+      return partitions.pivots[index] != partitions.beginnings[index];
+    }
+
+    // Split the set between marked and unmarked elements if there are both
+    // marked and unmarked elements. Unmark all elements of this set regardless.
+    // Return the index of the new partition or 0 if there was no split.
+    size_t split() {
+      size_t begin = partitions.beginnings[index];
+      size_t end = partitions.endings[index];
+      size_t pivot = partitions.pivots[index];
+      if (pivot == begin) {
+        // No elements marked, so there is nothing to do.
+        return 0;
+      }
+      if (pivot == end) {
+        // All elements were marked, so just unmark them.
+        partitions.pivots[index] = begin;
+        return 0;
+      }
+      // Create a new set covering the marked region.
+      size_t newIndex = partitions.sets++;
+      partitions.beginnings[newIndex] = begin;
+      partitions.pivots[newIndex] = begin;
+      partitions.endings[newIndex] = pivot;
+      for (size_t i = begin; i < pivot; ++i) {
+        partitions.setIndices[partitions.elements[i]] = newIndex;
+      }
+      // Update the old set. The end and pivot are already correct.
+      partitions.beginnings[index] = pivot;
+      return newIndex;
+    }
+  };
+
+  Set getSet(size_t index) { return {*this, index}; }
+
+  // Returns the set containing an element, which can be iterated upon. The set
+  // may be invalidated by calls to `mark` or `Set::split`.
+  Set getSetForElem(size_t element) { return getSet(setIndices[element]); }
+
+  void mark(size_t element) {
+    size_t index = elementIndices[element];
+    size_t set = setIndices[element];
+    size_t pivot = pivots[set];
+    if (index >= pivot) {
+      // Move the pivot element into the location of the newly marked element.
+      elements[index] = elements[pivot];
+      elementIndices[elements[index]] = index;
+      // Move the newly marked element into the pivot location.
+      elements[pivot] = element;
+      elementIndices[element] = pivot;
+      // Update the pivot index to mark the element.
+      ++pivots[set];
+    }
+  }
+};
+
+Partitions initializeStatePartitions(const InputGraph& inputGraph,
+                                     size_t numElements) {
+  Partitions partitions(numElements);
+  size_t elementIndex = 0;
+  for (const auto& partition : inputGraph) {
+    size_t set = partitions.sets++;
+    partitions.beginnings[set] = elementIndex;
+    partitions.pivots[set] = elementIndex;
+    for (size_t i = 0; i < partition.size(); ++i) {
+      partitions.elements[elementIndex] = elementIndex;
+      partitions.elementIndices[elementIndex] = elementIndex;
+      partitions.setIndices[elementIndex] = set;
+      ++elementIndex;
+    }
+    partitions.endings[set] = elementIndex;
+  }
+  return partitions;
+}
+
+// A DFA transition into a state.
+struct Transition {
+  size_t pred;
+  size_t label;
+};
+
+void initializeTransitions(const InputGraph& inputGraph,
+                           size_t numElements,
+                           size_t numTransitions,
+                           std::vector<Transition>& transitions,
+                           std::vector<size_t>& transitionIndices) {
+  // Find the transitions into each state. Map destinations to input
+  // transitions.
+  std::map<size_t, std::vector<Transition>> transitionMap;
+  size_t elementIndex = 0;
+  for (const auto& partition : inputGraph) {
+    for (const auto& elem : partition) {
+      size_t label = 0;
+      for (const auto& succ : elem) {
+        transitionMap[succ].push_back({elementIndex, label++});
+      }
+      ++elementIndex;
+    }
+  }
+
+  // Populate `transitions` and `transitionIndices`.
+  transitions.reserve(numTransitions);
+  transitionIndices.reserve(numElements + 1);
+  for (size_t dest = 0; dest < numElements; ++dest) {
+    // Record the first index of transitions leading to `dest`.
+    transitionIndices.push_back(transitions.size());
+    if (auto it = transitionMap.find(dest); it != transitionMap.end()) {
+      transitions.insert(
+        transitions.end(), it->second.begin(), it->second.end());
+    }
+  }
+  // Record one-past the end of the transitions leading to the final `dest`.
+  transitionIndices.push_back(transitions.size());
+}
+
+Partitions
+initializeSplitterPartitions(Partitions& partitions,
+                             const std::vector<Transition>& transitions,
+                             const std::vector<size_t>& transitionIndices) {
+  // The initial sets of splitters are partitioned by destination state
+  // partition and transition label.
+  Partitions splitters(transitions.size());
+  size_t elementIndex = 0;
+  for (size_t statePartition = 0; statePartition < partitions.sets;
+       ++statePartition) {
+    // The in-transitions leading to states in the current partition, organized
+    // by transition label.
+    std::map<size_t, std::vector<size_t>> currTransitions;
+    for (size_t state : partitions.getSet(statePartition)) {
+      for (size_t transition = transitionIndices[state],
+                  end = transitionIndices[state + 1];
+           transition < end;
+           ++transition) {
+        currTransitions[transitions[transition].label].push_back(transition);
+      }
+    }
+    // Create a splitter partition for each in-transition label leading to the
+    // current state partition.
+    for (auto& pair : currTransitions) {
+      size_t set = splitters.sets++;
+      splitters.beginnings[set] = elementIndex;
+      splitters.pivots[set] = elementIndex;
+      for (size_t transition : pair.second) {
+        splitters.elements[elementIndex] = transition;
+        splitters.elementIndices[transition] = elementIndex;
+        splitters.setIndices[transition] = set;
+        ++elementIndex;
+      }
+      splitters.endings[set] = elementIndex;
+    }
+  }
+  return splitters;
+}
+
+} // anonymous namespace
+
+namespace Internal {
+
+std::vector<std::vector<size_t>>
+refinePartitionsImpl(const InputGraph& inputGraph) {
+  // Find the number of states and transitions.
+  size_t numElements = 0;
+  size_t numTransitions = 0;
+  for (const auto& partition : inputGraph) {
+    numElements += partition.size();
+    for (const auto& elem : partition) {
+      numTransitions += elem.size();
+    }
+  }
+
+  // The partitions of DFA states.
+  Partitions partitions = initializeStatePartitions(inputGraph, numElements);
+
+  // The transitions arranged such that the transitions leading to state `q` are
+  // `transitions[transitionIndices[q] : transitionIndices[q+1]]`.
+  std::vector<Transition> transitions;
+  std::vector<size_t> transitionIndices;
+  initializeTransitions(
+    inputGraph, numElements, numTransitions, transitions, transitionIndices);
+
+  // The splitters, which are partitions of the input transitions.
+  Partitions splitters =
+    initializeSplitterPartitions(partitions, transitions, transitionIndices);
+
+  // The list of splitter partitions that might be able to split states in some
+  // state partition. Starts out containing all splitter partitions.
+  std::vector<size_t> potentialSplitters;
+  potentialSplitters.reserve(splitters.sets);
+  for (size_t i = 0; i < splitters.sets; ++i) {
+    potentialSplitters.push_back(i);
+  }
+
+  while (!potentialSplitters.empty()) {
+    size_t potentialSplitter = potentialSplitters.back();
+    potentialSplitters.pop_back();
+
+    // The partitions that may be able to be split.
+    std::vector<size_t> markedPartitions;
+
+    // Mark states that are predecessors via this splitter partition.
+    for (size_t transition : splitters.getSet(potentialSplitter)) {
+      size_t state = transitions[transition].pred;
+      auto partition = partitions.getSetForElem(state);
+      if (!partition.hasMarks()) {
+        markedPartitions.push_back(partition.index);
+      }
+      partitions.mark(state);
+    }
+
+    // Try to split each partition with marked states.
+    for (size_t partition : markedPartitions) {
+      size_t newPartition = partitions.getSet(partition).split();
+      if (!newPartition) {
+        // There was nothing to split.
+        continue;
+      }
+
+      // We only want to keep using the smaller of the two split partitions.
+      if (partitions.getSet(newPartition).size() <
+          partitions.getSet(partition).size()) {
+        newPartition = partition;
+      }
+
+      // The splitter partitions that may need to be split to match the new
+      // split of the state partitions.
+      std::vector<size_t> markedSplitters;
+
+      // Mark transitions that lead to the newly split off states.
+      for (size_t state : partitions.getSet(newPartition)) {
+        for (size_t t = transitionIndices[state],
+                    end = transitionIndices[state + 1];
+             t < end;
+             ++t) {
+          auto splitter = splitters.getSetForElem(t);
+          if (!splitter.hasMarks()) {
+            markedSplitters.push_back(splitter.index);
+          }
+          splitters.mark(t);
+        }
+      }
+
+      // Split the splitters and update `potentialSplitters`.
+      for (size_t splitter : markedSplitters) {
+        size_t newSplitter = splitters.getSet(splitter).split();
+        if (newSplitter) {
+          potentialSplitters.push_back(newSplitter);
+        }
+      }
+    }
+  }
+
+  // Return the refined partitions.
+  std::vector<std::vector<size_t>> resultPartitions;
+  resultPartitions.reserve(partitions.sets);
+  for (size_t p = 0; p < partitions.sets; ++p) {
+    auto partition = partitions.getSet(p);
+    std::vector<size_t> resultPartition;
+    resultPartition.reserve(partition.size());
+    for (size_t elem : partition) {
+      resultPartition.push_back(elem);
+    }
+    resultPartitions.emplace_back(std::move(resultPartition));
+  }
+  return resultPartitions;
+}
+
+} // namespace Internal
+
+} // namespace wasm::DFA
diff --git a/src/support/dfa_minimization.h b/src/support/dfa_minimization.h
new file mode 100644
index 000000000..a641022a7
--- /dev/null
+++ b/src/support/dfa_minimization.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright 2023 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Use the Valmari-Lehtinen DFA minimization algorithm
+// (https://arxiv.org/pdf/0802.2826.pdf) to find equivalent elements in a
+// user-provided DFA.
+
+#ifndef wasm_support_dfa_minimization_h
+#define wasm_support_dfa_minimization_h
+
+#include <cassert>
+#include <cstddef>
+#include <unordered_map>
+#include <vector>
+
+namespace wasm::DFA {
+
+namespace Internal {
+
+std::vector<std::vector<size_t>>
+refinePartitionsImpl(const std::vector<std::vector<std::vector<size_t>>>&);
+
+} // namespace Internal
+
+// A DFA state with an ordered list of successor states.
+template<typename T> struct State {
+  T val;
+  std::vector<T> succs;
+};
+
+// Given a vector of initial state partitions in which states known to be
+// different are in different partitions, return a vector of refined partitions,
+// each containing a different equivalence class of states. No value should
+// appear more than once in the input. All successor values should appear in
+// some top-level partition.
+template<typename T>
+std::vector<std::vector<T>>
+refinePartitions(const std::vector<std::vector<State<T>>>& partitions) {
+  // Map values to indices and vice versa.
+  std::unordered_map<T, size_t> indices;
+  std::vector<T> values;
+  for (const auto& partition : partitions) {
+    for (const auto& state : partition) {
+      [[maybe_unused]] bool inserted =
+        indices.insert({state.val, indices.size()}).second;
+      assert(inserted && "unexpected repeated value");
+      values.push_back(state.val);
+    }
+  }
+
+  // Create a copy of the DFA that uses indices instead of the original values.
+  std::vector<std::vector<std::vector<size_t>>> indexPartitions;
+  indexPartitions.reserve(partitions.size());
+  for (const auto& partition : partitions) {
+    std::vector<std::vector<size_t>> indexPartition;
+    indexPartition.reserve(partition.size());
+    for (const auto& state : partition) {
+      std::vector<size_t> succIndices;
+      succIndices.reserve(state.succs.size());
+      for (const auto& succ : state.succs) {
+        auto it = indices.find(succ);
+        assert(it != indices.end() && "unknown successor value");
+        succIndices.push_back(it->second);
+      }
+      indexPartition.emplace_back(std::move(succIndices));
+    }
+    indexPartitions.emplace_back(std::move(indexPartition));
+  }
+
+  // Refine the partitions.
+  auto indexResults = Internal::refinePartitionsImpl(indexPartitions);
+
+  // Map the refined partitions of indices back to values.
+  std::vector<std::vector<T>> results;
+  results.reserve(indexResults.size());
+  for (const auto& indexPartition : indexResults) {
+    std::vector<T> partition;
+    partition.reserve(indexPartition.size());
+    for (size_t index : indexPartition) {
+      partition.push_back(values[index]);
+    }
+    results.emplace_back(std::move(partition));
+  }
+
+  return results;
+}
+
+} // namespace wasm::DFA
+
+#endif // wasm_support_dfa_minimization_h
diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt
index 548f01ff6..21b7ad4ad 100644
--- a/test/gtest/CMakeLists.txt
+++ b/test/gtest/CMakeLists.txt
@@ -2,6 +2,7 @@ include_directories(../../third_party/googletest/googletest/include)
 include_directories(../../src/wasm)
 
 set(unittest_SOURCES
+  dfa_minimization.cpp
   possible-contents.cpp
   type-builder.cpp
   wat-lexer.cpp
diff --git a/test/gtest/dfa_minimization.cpp b/test/gtest/dfa_minimization.cpp
new file mode 100644
index 000000000..a63eefde1
--- /dev/null
+++ b/test/gtest/dfa_minimization.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2023 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <set>
+#include <string>
+#include <vector>
+
+#include "support/dfa_minimization.h"
+#include "gtest/gtest.h"
+
+using namespace wasm;
+
+using State = DFA::State<std::string>;
+using Graph = std::vector<std::vector<State>>;
+using Results = std::vector<std::vector<std::string>>;
+using SetResults = std::set<std::set<std::string>>;
+
+// We don't care about order between or within the output partitions.
+SetResults settify(const Results& vecs) {
+  SetResults sets;
+  for (const auto& vec : vecs) {
+    sets.emplace(vec.begin(), vec.end());
+  }
+  return sets;
+}
+
+// Check that the same values appear in the input and output and that each value
+// appears once.
+void validate(const Graph& graph, const Results& results) {
+  std::set<std::string> inputVals;
+  for (const auto& partition : graph) {
+    for (const auto& state : partition) {
+      bool inserted = inputVals.insert(state.val).second;
+      ASSERT_TRUE(inserted);
+    }
+  }
+
+  std::set<std::string> outputVals;
+  for (const auto& partition : results) {
+    for (const auto& val : partition) {
+      ASSERT_NE(inputVals.find(val), inputVals.end());
+      ASSERT_TRUE(outputVals.insert(val).second);
+    }
+  }
+
+  ASSERT_EQ(outputVals.size(), inputVals.size());
+}
+
+TEST(DFATest, Empty) {
+  Graph graph = {};
+  auto results = DFA::refinePartitions(graph);
+  validate(graph, results);
+  EXPECT_EQ(settify(results), SetResults{});
+}
+
+TEST(DFATest, Singleton) {
+  Graph graph = {{{"A", {}}}};
+  auto results = DFA::refinePartitions(graph);
+  validate(graph, results);
+  EXPECT_EQ(settify(results), SetResults{{"A"}});
+}
+
+TEST(DFATest, CyclePartitioned) {
+  // The partitions are already maximally refined, so shouldn't change.
+  State a{"A", {"B"}};
+  State b{"B", {"C"}};
+  State c{"C", {"A"}};
+  Graph graph = {{a}, {b}, {c}};
+  auto results = DFA::refinePartitions(graph);
+  validate(graph, results);
+  EXPECT_EQ(settify(results), (SetResults{{"A"}, {"B"}, {"C"}}));
+}
+
+TEST(DFATest, CycleGrouped) {
+  // All the states are identical, so the the partition shouldn't change.
+  State a{"A", {"B"}};
+  State b{"B", {"C"}};
+  State c{"C", {"A"}};
+  Graph graph = {{a, b, c}};
+  auto results = DFA::refinePartitions(graph);
+  validate(graph, results);
+  EXPECT_EQ(settify(results), (SetResults{{"A", "B", "C"}}));
+}
+
+TEST(DFATest, CycleRefined) {
+  // A and B are distinguishable, so the partitions should be refined.
+  State a{"A", {"B"}};
+  State b{"B", {"C"}};
+  State c{"C", {"A"}};
+  Graph graph = {{a, b}, {c}};
+  auto results = DFA::refinePartitions(graph);
+  validate(graph, results);
+  EXPECT_EQ(settify(results), (SetResults{{"A"}, {"B"}, {"C"}}));
+}
+
+TEST(DFATest, ChainZip) {
+  // Chain A->B->C->D is identical to chain W->X->Y->Z.
+  State a{"A", {"B"}}, w{"W", {"X"}};
+  State b{"B", {"C"}}, x{"X", {"Y"}};
+  State c{"C", {"D"}}, y{"Y", {"Z"}};
+  State d{"D", {}}, z{"Z", {}};
+  Graph graph = {{a, b, c, d, w, x, y, z}};
+  auto results = DFA::refinePartitions(graph);
+  validate(graph, results);
+  EXPECT_EQ(settify(results),
+            (SetResults{{"A", "W"}, {"B", "X"}, {"C", "Y"}, {"D", "Z"}}));
+}
+
+TEST(DFATest, ChainUnzip) {
+  // Chain A->B->C->D looks a lot like chain W->X->Y->Z, but since Z is
+  // distinguishable, the full chains end up being separated.
+  State a{"A", {"B"}}, w{"W", {"X"}};
+  State b{"B", {"C"}}, x{"X", {"Y"}};
+  State c{"C", {"D"}}, y{"Y", {"Z"}};
+  State d{"D", {}}, z{"Z", {}};
+  Graph graph = {{a, b, c, d, w, x, y}, {z}};
+  auto results = DFA::refinePartitions(graph);
+  validate(graph, results);
+  EXPECT_EQ(
+    settify(results),
+    (SetResults{{"A"}, {"W"}, {"B"}, {"X"}, {"C"}, {"Y"}, {"D"}, {"Z"}}));
+}