1 files changed, 362 insertions, 0 deletions
diff --git a/src/support/dfa_minimization.cpp b/src/support/dfa_minimization.cpp
new file mode 100644
index 000000000..5df67c585
--- /dev/null
+++ b/src/support/dfa_minimization.cpp
@@ -0,0 +1,362 @@
+/*
+ * Copyright 2023 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <map>
+
+#include "dfa_minimization.h"
+
+namespace wasm::DFA {
+
+namespace {
+
+// A vector of initial partitions, each of which is a vector of elements, each
+// of which is a vector of successor indices.
+using InputGraph = std::vector<std::vector<std::vector<size_t>>>;
+
+// The Refined Partitions data structure used in Valmari-Lehtinen DFA
+// minimization. The translation from terms used in the Valmari-Lehtinen paper
+// to the more expanded terms used here is:
+//
+//   Block => Set
+//   elems => elements
+//   loc => elementIndices
+//   sidx => setIndices
+//   first => beginnings
+//   end => endings
+//   mid => pivots
+//
+struct Partitions {
+  // The number of sets.
+  size_t sets = 0;
+
+  // The partitioned elements. Elements in the same set are next to each other.
+  // Within each set, "marked" elements come first followed by "unmarked"
+  // elements.
+  std::vector<size_t> elements;
+
+  // Maps elements to their indices in `elements`.
+  std::vector<size_t> elementIndices;
+
+  // Maps elements to their sets, identified by an index.
+  std::vector<size_t> setIndices;
+
+  // Maps sets to the indices of their first elements in `elements`.
+  std::vector<size_t> beginnings;
+
+  // Maps sets to (one past) the indices of their ends in `elements`.
+  std::vector<size_t> endings;
+
+  // Maps sets to the indices of their first unmarked elements in `elements`.
+  std::vector<size_t> pivots;
+
+  Partitions() = default;
+
+  // Allocate space up front so we never need to re-allocate. The actual
+  // contents of all the vectors will need to be externally initialized,
+  // though.
+  Partitions(size_t size)
+    : elements(size), elementIndices(size), setIndices(size), beginnings(size),
+      endings(size), pivots(size) {}
+
+  struct Set {
+    using Iterator = std::vector<size_t>::iterator;
+
+    Partitions& partitions;
+    size_t index;
+
+    Set(Partitions& partitions, size_t index)
+      : partitions(partitions), index(index) {}
+
+    Iterator begin() {
+      return partitions.elements.begin() + partitions.beginnings[index];
+    }
+    Iterator end() {
+      return partitions.elements.begin() + partitions.endings[index];
+    }
+    size_t size() {
+      return partitions.endings[index] - partitions.beginnings[index];
+    }
+
+    bool hasMarks() {
+      return partitions.pivots[index] != partitions.beginnings[index];
+    }
+
+    // Split the set between marked and unmarked elements if there are both
+    // marked and unmarked elements. Unmark all elements of this set regardless.
+    // Return the index of the new partition or 0 if there was no split.
+    size_t split() {
+      size_t begin = partitions.beginnings[index];
+      size_t end = partitions.endings[index];
+      size_t pivot = partitions.pivots[index];
+      if (pivot == begin) {
+        // No elements marked, so there is nothing to do.
+        return 0;
+      }
+      if (pivot == end) {
+        // All elements were marked, so just unmark them.
+        partitions.pivots[index] = begin;
+        return 0;
+      }
+      // Create a new set covering the marked region.
+      size_t newIndex = partitions.sets++;
+      partitions.beginnings[newIndex] = begin;
+      partitions.pivots[newIndex] = begin;
+      partitions.endings[newIndex] = pivot;
+      for (size_t i = begin; i < pivot; ++i) {
+        partitions.setIndices[partitions.elements[i]] = newIndex;
+      }
+      // Update the old set. The end and pivot are already correct.
+      partitions.beginnings[index] = pivot;
+      return newIndex;
+    }
+  };
+
+  Set getSet(size_t index) { return {*this, index}; }
+
+  // Returns the set containing an element, which can be iterated upon. The set
+  // may be invalidated by calls to `mark` or `Set::split`.
+  Set getSetForElem(size_t element) { return getSet(setIndices[element]); }
+
+  void mark(size_t element) {
+    size_t index = elementIndices[element];
+    size_t set = setIndices[element];
+    size_t pivot = pivots[set];
+    if (index >= pivot) {
+      // Move the pivot element into the location of the newly marked element.
+      elements[index] = elements[pivot];
+      elementIndices[elements[index]] = index;
+      // Move the newly marked element into the pivot location.
+      elements[pivot] = element;
+      elementIndices[element] = pivot;
+      // Update the pivot index to mark the element.
+      ++pivots[set];
+    }
+  }
+};
+
+Partitions initializeStatePartitions(const InputGraph& inputGraph,
+                                     size_t numElements) {
+  Partitions partitions(numElements);
+  size_t elementIndex = 0;
+  for (const auto& partition : inputGraph) {
+    size_t set = partitions.sets++;
+    partitions.beginnings[set] = elementIndex;
+    partitions.pivots[set] = elementIndex;
+    for (size_t i = 0; i < partition.size(); ++i) {
+      partitions.elements[elementIndex] = elementIndex;
+      partitions.elementIndices[elementIndex] = elementIndex;
+      partitions.setIndices[elementIndex] = set;
+      ++elementIndex;
+    }
+    partitions.endings[set] = elementIndex;
+  }
+  return partitions;
+}
+
+// A DFA transition into a state.
+struct Transition {
+  size_t pred;
+  size_t label;
+};
+
+void initializeTransitions(const InputGraph& inputGraph,
+                           size_t numElements,
+                           size_t numTransitions,
+                           std::vector<Transition>& transitions,
+                           std::vector<size_t>& transitionIndices) {
+  // Find the transitions into each state. Map destinations to input
+  // transitions.
+  std::map<size_t, std::vector<Transition>> transitionMap;
+  size_t elementIndex = 0;
+  for (const auto& partition : inputGraph) {
+    for (const auto& elem : partition) {
+      size_t label = 0;
+      for (const auto& succ : elem) {
+        transitionMap[succ].push_back({elementIndex, label++});
+      }
+      ++elementIndex;
+    }
+  }
+
+  // Populate `transitions` and `transitionIndices`.
+  transitions.reserve(numTransitions);
+  transitionIndices.reserve(numElements + 1);
+  for (size_t dest = 0; dest < numElements; ++dest) {
+    // Record the first index of transitions leading to `dest`.
+    transitionIndices.push_back(transitions.size());
+    if (auto it = transitionMap.find(dest); it != transitionMap.end()) {
+      transitions.insert(
+        transitions.end(), it->second.begin(), it->second.end());
+    }
+  }
+  // Record one-past the end of the transitions leading to the final `dest`.
+  transitionIndices.push_back(transitions.size());
+}
+
+Partitions
+initializeSplitterPartitions(Partitions& partitions,
+                             const std::vector<Transition>& transitions,
+                             const std::vector<size_t>& transitionIndices) {
+  // The initial sets of splitters are partitioned by destination state
+  // partition and transition label.
+  Partitions splitters(transitions.size());
+  size_t elementIndex = 0;
+  for (size_t statePartition = 0; statePartition < partitions.sets;
+       ++statePartition) {
+    // The in-transitions leading to states in the current partition, organized
+    // by transition label.
+    std::map<size_t, std::vector<size_t>> currTransitions;
+    for (size_t state : partitions.getSet(statePartition)) {
+      for (size_t transition = transitionIndices[state],
+                  end = transitionIndices[state + 1];
+           transition < end;
+           ++transition) {
+        currTransitions[transitions[transition].label].push_back(transition);
+      }
+    }
+    // Create a splitter partition for each in-transition label leading to the
+    // current state partition.
+    for (auto& pair : currTransitions) {
+      size_t set = splitters.sets++;
+      splitters.beginnings[set] = elementIndex;
+      splitters.pivots[set] = elementIndex;
+      for (size_t transition : pair.second) {
+        splitters.elements[elementIndex] = transition;
+        splitters.elementIndices[transition] = elementIndex;
+        splitters.setIndices[transition] = set;
+        ++elementIndex;
+      }
+      splitters.endings[set] = elementIndex;
+    }
+  }
+  return splitters;
+}
+
+} // anonymous namespace
+
+namespace Internal {
+
+std::vector<std::vector<size_t>>
+refinePartitionsImpl(const InputGraph& inputGraph) {
+  // Find the number of states and transitions.
+  size_t numElements = 0;
+  size_t numTransitions = 0;
+  for (const auto& partition : inputGraph) {
+    numElements += partition.size();
+    for (const auto& elem : partition) {
+      numTransitions += elem.size();
+    }
+  }
+
+  // The partitions of DFA states.
+  Partitions partitions = initializeStatePartitions(inputGraph, numElements);
+
+  // The transitions arranged such that the transitions leading to state `q` are
+  // `transitions[transitionIndices[q] : transitionIndices[q+1]]`.
+  std::vector<Transition> transitions;
+  std::vector<size_t> transitionIndices;
+  initializeTransitions(
+    inputGraph, numElements, numTransitions, transitions, transitionIndices);
+
+  // The splitters, which are partitions of the input transitions.
+  Partitions splitters =
+    initializeSplitterPartitions(partitions, transitions, transitionIndices);
+
+  // The list of splitter partitions that might be able to split states in some
+  // state partition. Starts out containing all splitter partitions.
+  std::vector<size_t> potentialSplitters;
+  potentialSplitters.reserve(splitters.sets);
+  for (size_t i = 0; i < splitters.sets; ++i) {
+    potentialSplitters.push_back(i);
+  }
+
+  while (!potentialSplitters.empty()) {
+    size_t potentialSplitter = potentialSplitters.back();
+    potentialSplitters.pop_back();
+
+    // The partitions that may be able to be split.
+    std::vector<size_t> markedPartitions;
+
+    // Mark states that are predecessors via this splitter partition.
+    for (size_t transition : splitters.getSet(potentialSplitter)) {
+      size_t state = transitions[transition].pred;
+      auto partition = partitions.getSetForElem(state);
+      if (!partition.hasMarks()) {
+        markedPartitions.push_back(partition.index);
+      }
+      partitions.mark(state);
+    }
+
+    // Try to split each partition with marked states.
+    for (size_t partition : markedPartitions) {
+      size_t newPartition = partitions.getSet(partition).split();
+      if (!newPartition) {
+        // There was nothing to split.
+        continue;
+      }
+
+      // We only want to keep using the smaller of the two split partitions.
+      if (partitions.getSet(newPartition).size() <
+          partitions.getSet(partition).size()) {
+        newPartition = partition;
+      }
+
+      // The splitter partitions that may need to be split to match the new
+      // split of the state partitions.
+      std::vector<size_t> markedSplitters;
+
+      // Mark transitions that lead to the newly split off states.
+      for (size_t state : partitions.getSet(newPartition)) {
+        for (size_t t = transitionIndices[state],
+                    end = transitionIndices[state + 1];
+             t < end;
+             ++t) {
+          auto splitter = splitters.getSetForElem(t);
+          if (!splitter.hasMarks()) {
+            markedSplitters.push_back(splitter.index);
+          }
+          splitters.mark(t);
+        }
+      }
+
+      // Split the splitters and update `potentialSplitters`.
+      for (size_t splitter : markedSplitters) {
+        size_t newSplitter = splitters.getSet(splitter).split();
+        if (newSplitter) {
+          potentialSplitters.push_back(newSplitter);
+        }
+      }
+    }
+  }
+
+  // Return the refined partitions.
+  std::vector<std::vector<size_t>> resultPartitions;
+  resultPartitions.reserve(partitions.sets);
+  for (size_t p = 0; p < partitions.sets; ++p) {
+    auto partition = partitions.getSet(p);
+    std::vector<size_t> resultPartition;
+    resultPartition.reserve(partition.size());
+    for (size_t elem : partition) {
+      resultPartition.push_back(elem);
+    }
+    resultPartitions.emplace_back(std::move(resultPartition));
+  }
+  return resultPartitions;
+}
+
+} // namespace Internal
+
+} // namespace wasm::DFA