Lift the restriction in liveness-traversal.h that supported max 65535 locals in a function. (#4567)

* Lift the restriction in liveness-traversal.h that supported max 65535 locals in a function. * Lint * Fix typo * Fix static * Lint * Lint * Lint * Add needed canRun function * lint * Use either a sparse or a dense matrix for tracking liveness copies, depending on the locals count. * Lint * Fix lint * Lint * Implement sparse_square_matrix class and use that as a backing. * Lint * Lint * Lint #includes * Lint * Lint includes * Remove unnecessary code * Fix canonical accesses to copies matrix * Lint * Add missing variable update * Remove canRun() function * Address review * Update expected test results * Update test name * Add asserts to sparse_square_matrix set and get functions that they are not out of bound. * Lint includes * Update test expectation * Use .clear() + .resize() to reset totalCopies vector
author: juj <jujjyl@gmail.com> 2022-04-28 21:31:08 +0300
committer: GitHub <noreply@github.com> 2022-04-28 18:31:08 +0000
commit: ba64ac3ed47cb35a477aaaf185679ae9d0b79862 (patch)
tree: a35322a8d2162e07352c7272b97ffbd195d876a3
parent: 408b2eb7df01f42157e24c2c58f01c4fa5c6b9d6 (diff)
download: binaryen-ba64ac3ed47cb35a477aaaf185679ae9d0b79862.tar.gz
binaryen-ba64ac3ed47cb35a477aaaf185679ae9d0b79862.tar.bz2
binaryen-ba64ac3ed47cb35a477aaaf185679ae9d0b79862.zip
9 files changed, 170 insertions, 60 deletions
diff --git a/src/cfg/liveness-traversal.h b/src/cfg/liveness-traversal.h
index ee56a9925..3193cbd1f 100644
--- a/src/cfg/liveness-traversal.h
+++ b/src/cfg/liveness-traversal.h
@@ -24,6 +24,7 @@
 #include "cfg-traversal.h"
 #include "ir/utils.h"
 #include "support/sorted_vector.h"
+#include "support/sparse_square_matrix.h"
 #include "wasm-builder.h"
 #include "wasm-traversal.h"
 #include "wasm.h"
@@ -105,10 +106,10 @@ struct LivenessWalker : public CFGWalker<SubType, VisitorType, Liveness> {
 
   Index numLocals;
   std::unordered_set<BasicBlock*> liveBlocks;
-  // canonicalized - accesses should check (low, high)
-  // TODO: use a map for high N, as this tends to be sparse? or don't look at
-  // copies at all for big N?
-  std::vector<uint8_t> copies;
+  // access as a upper triangular matrix: i.e. when accessing a pair (i,j),
+  // should access the cell i < j.
+  sparse_square_matrix<uint8_t> copies;
+
   // total # of copies for each local, with all others
   std::vector<Index> totalCopies;
 
@@ -191,30 +192,13 @@ struct LivenessWalker : public CFGWalker<SubType, VisitorType, Liveness> {
     return nullptr;
   }
 
-  // If there are too many locals, we cannot run currently as
-  // numLocals * numLocals might overflow. We may want to add an option for
-  // a sparse matrix at some point TODO
-  bool canRun(Function* func) {
-    Index numLocals = func->getNumLocals();
-    if (uint64_t(numLocals) * uint64_t(numLocals) <=
-        std::numeric_limits<Index>::max()) {
-      return true;
-    }
-    std::cerr << "warning: too many locals (" << numLocals
-              << ") to run liveness analysis in " << this->getFunction()->name
-              << '\n';
-    return false;
-  }
-
   // main entry point
 
   void doWalkFunction(Function* func) {
     numLocals = func->getNumLocals();
-    assert(canRun(func));
-    copies.resize(numLocals * numLocals);
-    std::fill(copies.begin(), copies.end(), 0);
+    copies.recreate(numLocals);
+    totalCopies.clear();
     totalCopies.resize(numLocals);
-    std::fill(totalCopies.begin(), totalCopies.end(), 0);
     // create the CFG by walking the IR
     CFGWalker<SubType, VisitorType, Liveness>::doWalkFunction(func);
     // ignore links to dead blocks, so they don't confuse us and we can see
@@ -297,14 +281,19 @@ struct LivenessWalker : public CFGWalker<SubType, VisitorType, Liveness> {
   }
 
   void addCopy(Index i, Index j) {
-    auto k = std::min(i, j) * numLocals + std::max(i, j);
-    copies[k] = std::min(copies[k], uint8_t(254)) + 1;
+    if (j > i) {
+      std::swap(i, j);
+    }
+    copies.set(i, j, std::min(copies.get(i, j), uint8_t(254)) + 1);
     totalCopies[i]++;
     totalCopies[j]++;
   }
 
   uint8_t getCopies(Index i, Index j) {
-    return copies[std::min(i, j) * numLocals + std::max(i, j)];
+    if (j > i) {
+      std::swap(i, j);
+    }
+    return copies.get(i, j);
   }
 };
 
diff --git a/src/passes/Asyncify.cpp b/src/passes/Asyncify.cpp
index 649f136c5..8a50d49e6 100644
--- a/src/passes/Asyncify.cpp
+++ b/src/passes/Asyncify.cpp
@@ -1344,14 +1344,6 @@ private:
 
     RelevantLiveLocalsWalker walker;
     walker.setFunction(func);
-    if (!walker.canRun(func)) {
-      // We can proceed without this optimization, which will cause more
-      // spilling - assume all locals are relevant.
-      for (Index i = 0; i < func->getNumLocals(); i++) {
-        relevantLiveLocals.insert(i);
-      }
-      return;
-    }
     walker.walkFunctionInModule(func, getModule());
     // The relevant live locals are ones that are alive at an unwind/rewind
     // location. TODO look more precisely inside basic blocks, as one might stop
diff --git a/src/passes/CoalesceLocals.cpp b/src/passes/CoalesceLocals.cpp
index f58e57e72..ba9b87527 100644
--- a/src/passes/CoalesceLocals.cpp
+++ b/src/passes/CoalesceLocals.cpp
@@ -35,6 +35,7 @@
 #include "pass.h"
 #include "support/learning.h"
 #include "support/permutations.h"
+#include "support/sparse_square_matrix.h"
 #include "wasm.h"
 #ifdef CFG_PROFILE
 #include "support/timing.h"
@@ -76,34 +77,31 @@ struct CoalesceLocals
   // interference state
 
   // canonicalized - accesses should check (low, high)
-  std::vector<bool> interferences;
+  sparse_square_matrix<bool> interferences;
 
   void interfere(Index i, Index j) {
     if (i == j) {
       return;
     }
-    interferences[std::min(i, j) * numLocals + std::max(i, j)] = 1;
+    interferences.set(std::min(i, j), std::max(i, j), true);
   }
 
   // optimized version where you know that low < high
   void interfereLowHigh(Index low, Index high) {
     assert(low < high);
-    interferences[low * numLocals + high] = 1;
+    interferences.set(low, high, true);
   }
 
   void unInterfere(Index i, Index j) {
-    interferences[std::min(i, j) * numLocals + std::max(i, j)] = 0;
+    interferences.set(std::min(i, j), std::max(i, j), false);
   }
 
   bool interferes(Index i, Index j) {
-    return interferences[std::min(i, j) * numLocals + std::max(i, j)];
+    return interferences.get(std::min(i, j), std::max(i, j));
   }
 };
 
 void CoalesceLocals::doWalkFunction(Function* func) {
-  if (!canRun(func)) {
-    return;
-  }
   super::doWalkFunction(func);
   // prioritize back edges
   increaseBackEdgePriorities();
@@ -145,8 +143,7 @@ void CoalesceLocals::increaseBackEdgePriorities() {
 }
 
 void CoalesceLocals::calculateInterferences() {
-  interferences.resize(numLocals * numLocals);
-  std::fill(interferences.begin(), interferences.end(), false);
+  interferences.recreate(numLocals);
 
   // We will track the values in each local, using a numbering where each index
   // represents a unique different value. This array maps a local index to the
@@ -379,17 +376,19 @@ void CoalesceLocals::pickIndicesFromOrder(std::vector<Index>& order,
   // registers, for gzip)
   std::vector<Type> types;
   // new index * numLocals => list of all interferences of locals merged to it
-  std::vector<bool> newInterferences;
+  sparse_square_matrix<bool> newInterferences;
+
   // new index * numLocals => list of all copies of locals merged to it
-  std::vector<uint8_t> newCopies;
+  sparse_square_matrix<uint8_t> newCopies;
+
   indices.resize(numLocals);
   types.resize(numLocals);
-  newInterferences.resize(numLocals * numLocals);
-  std::fill(newInterferences.begin(), newInterferences.end(), false);
+
   auto numParams = getFunction()->getNumParams();
-  // start with enough room for the params
-  newCopies.resize(numParams * numLocals);
-  std::fill(newCopies.begin(), newCopies.end(), 0);
+
+  newInterferences.recreate(numLocals);
+  newCopies.recreate(numLocals);
+
   Index nextFree = 0;
   removedCopies = 0;
   // we can't reorder parameters, they are fixed in order, and cannot coalesce
@@ -399,8 +398,8 @@ void CoalesceLocals::pickIndicesFromOrder(std::vector<Index>& order,
     indices[i] = i;
     types[i] = getFunction()->getLocalType(i);
     for (Index j = numParams; j < numLocals; j++) {
-      newInterferences[numLocals * i + j] = interferes(i, j);
-      newCopies[numLocals * i + j] = getCopies(i, j);
+      newInterferences.set(i, j, interferes(i, j));
+      newCopies.set(i, j, getCopies(i, j));
     }
     nextFree++;
   }
@@ -409,13 +408,13 @@ void CoalesceLocals::pickIndicesFromOrder(std::vector<Index>& order,
     Index found = -1;
     uint8_t foundCopies = -1;
     for (Index j = 0; j < nextFree; j++) {
-      if (!newInterferences[j * numLocals + actual] &&
+      if (!newInterferences.get(j, actual) &&
           getFunction()->getLocalType(actual) == types[j]) {
         // this does not interfere, so it might be what we want. but pick the
         // one eliminating the most copies (we could stop looking forward when
         // there are no more items that have copies anyhow, but it doesn't seem
         // to help)
-        auto currCopies = newCopies[j * numLocals + actual];
+        auto currCopies = newCopies.get(j, actual);
         if (found == Index(-1) || currCopies > foundCopies) {
           indices[actual] = found = j;
           foundCopies = currCopies;
@@ -427,7 +426,6 @@ void CoalesceLocals::pickIndicesFromOrder(std::vector<Index>& order,
       types[found] = getFunction()->getLocalType(actual);
       nextFree++;
       removedCopies += getCopies(found, actual);
-      newCopies.resize(nextFree * numLocals);
     } else {
       removedCopies += foundCopies;
     }
@@ -438,9 +436,9 @@ void CoalesceLocals::pickIndicesFromOrder(std::vector<Index>& order,
     for (Index k = i + 1; k < numLocals; k++) {
       // go in the order, we only need to update for those we will see later
       auto j = order[k];
-      newInterferences[found * numLocals + j] =
-        newInterferences[found * numLocals + j] || interferes(actual, j);
-      newCopies[found * numLocals + j] += getCopies(actual, j);
+      newInterferences.set(
+        found, j, newInterferences.get(found, j) || interferes(actual, j));
+      newCopies.set(found, j, newCopies.get(found, j) + getCopies(actual, j));
     }
   }
 }
diff --git a/src/support/sparse_square_matrix.h b/src/support/sparse_square_matrix.h
new file mode 100644
index 000000000..03ebcb5c6
--- /dev/null
+++ b/src/support/sparse_square_matrix.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2022 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// A helper type for potentially sparse N*N matrix container.
+
+#pragma once
+
+#include <assert.h>
+#include <stdint.h>
+#include <unordered_map>
+#include <vector>
+
+template<typename Ty> class sparse_square_matrix {
+  std::vector<Ty> denseStorage;
+  std::unordered_map<uint64_t, Ty> sparseStorage;
+  uint32_t N;
+
+public:
+  sparse_square_matrix() : N(0) {}
+
+  explicit sparse_square_matrix(int N) : N(N) {
+    if (N < DenseLimit) {
+      denseStorage.resize(N * N);
+    }
+  }
+
+  static const size_t DenseLimit = 8192;
+
+  uint32_t width() const { return N; }
+
+  bool usingDenseStorage() const { return !denseStorage.empty(); }
+
+  void set(uint32_t i, uint32_t j, const Ty& value) {
+    assert(i < N);
+    assert(j < N);
+    if (usingDenseStorage()) {
+      denseStorage[i * N + j] = value;
+    } else {
+      sparseStorage[i * N + j] = value;
+    }
+  }
+
+  const Ty get(uint32_t i, uint32_t j) const {
+    assert(i < N);
+    assert(j < N);
+    if (usingDenseStorage()) {
+      return denseStorage[i * N + j];
+    }
+    auto iter = sparseStorage.find(i * N + j);
+    return iter == sparseStorage.end() ? Ty() : iter->second;
+  }
+
+  // Resizes the matrix to a new n*n size, and clears all entries
+  // to the default-initialized value.
+  void recreate(uint32_t n) {
+    N = n;
+    denseStorage.clear();
+    sparseStorage.clear();
+    if (N < DenseLimit) {
+      denseStorage.resize(N * N);
+    }
+  }
+};
diff --git a/test/example/sparse_square_matrix.cpp b/test/example/sparse_square_matrix.cpp
new file mode 100644
index 000000000..ad1f71ade
--- /dev/null
+++ b/test/example/sparse_square_matrix.cpp
@@ -0,0 +1,54 @@
+#include <iostream>
+
+#include "support/sparse_square_matrix.h"
+
+int main() {
+  sparse_square_matrix<uint32_t> m;
+
+  // New matrix should initialize to 0x0 size.
+  assert(m.width() == 0);
+
+  // Recreating should resize the matrix.
+  m.recreate(100);
+  assert(m.width() == 100);
+
+  // Small matrices should use dense storage.
+  assert(m.usingDenseStorage());
+
+  // Setting and getting element values in dense storage should work.
+  for (int y = 0; y < 100; ++y)
+    for (int x = 0; x < 100; ++x)
+      m.set(y, x, y * 100 + x);
+  for (int y = 0; y < 100; ++y)
+    for (int x = 0; x < 100; ++x)
+      assert(m.get(y, x) == y * 100 + x);
+
+  // Recreating should clear the matrix elements to zero,
+  // even if recreating to same size as before.
+  assert(m.width() == 100);
+  m.recreate(100);
+  for (int y = 0; y < 100; ++y)
+    for (int x = 0; x < 100; ++x)
+      assert(m.get(y, x) == 0);
+
+  // Large matrices should use sparse storage.
+  m.recreate(m.DenseLimit);
+  assert(!m.usingDenseStorage());
+
+  // Setting and getting element values in sparse storage should work.
+  for (int y = 0; y < m.DenseLimit; y += 128)
+    for (int x = 0; x < m.DenseLimit; x += 128)
+      m.set(y, x, y * m.DenseLimit + x);
+  for (int y = 0; y < m.DenseLimit; y += 128)
+    for (int x = 0; x < m.DenseLimit; x += 128)
+      assert(m.get(y, x) == y * m.DenseLimit + x);
+
+  // Recreating matrix in sparse mode should reset values in sparse
+  // storage to zero.
+  m.recreate(m.DenseLimit + 1);
+  for (int y = 0; y < m.width(); y += 128)
+    for (int x = 0; x < m.width(); x += 128)
+      assert(m.get(y, x) == 0);
+
+  std::cout << "ok.\n";
+}
diff --git a/test/example/sparse_square_matrix.txt b/test/example/sparse_square_matrix.txt
new file mode 100644
index 000000000..90b5016ef
--- /dev/null
+++ b/test/example/sparse_square_matrix.txt
@@ -0,0 +1 @@
+ok.
diff --git a/test/passes/too_much_for_liveness.bin.txt b/test/passes/65536_locals_for_liveness.bin.txt
index 793f9578c..0fc7f873a 100644
--- a/test/passes/too_much_for_liveness.bin.txt
+++ b/test/passes/65536_locals_for_liveness.bin.txt
@@ -19,7 +19,7 @@ total
  [tables]       : 0       
  [tags]         : 0       
  [total]        : 4       
- [vars]         : 65536   
+ [vars]         : 1         -65535
  Block          : 1       
  Const          : 1       
  LocalGet       : 1       
diff --git a/test/passes/too_much_for_liveness.passes b/test/passes/65536_locals_for_liveness.passes
index 719d0bbcd..719d0bbcd 100644
--- a/test/passes/too_much_for_liveness.passes
+++ b/test/passes/65536_locals_for_liveness.passes
diff --git a/test/passes/too_much_for_liveness.wasm b/test/passes/65536_locals_for_liveness.wasm
index 88929243d..88929243d 100644
--- a/test/passes/too_much_for_liveness.wasm
+++ b/test/passes/65536_locals_for_liveness.wasm
author	juj <jujjyl@gmail.com>	2022-04-28 21:31:08 +0300
committer	GitHub <noreply@github.com>	2022-04-28 18:31:08 +0000
commit	ba64ac3ed47cb35a477aaaf185679ae9d0b79862 (patch)
tree	a35322a8d2162e07352c7272b97ffbd195d876a3
parent	408b2eb7df01f42157e24c2c58f01c4fa5c6b9d6 (diff)
download	binaryen-ba64ac3ed47cb35a477aaaf185679ae9d0b79862.tar.gz binaryen-ba64ac3ed47cb35a477aaaf185679ae9d0b79862.tar.bz2 binaryen-ba64ac3ed47cb35a477aaaf185679ae9d0b79862.zip