Refactor hashing (#3023)

* Unifies internal hashing helpers to naturally integrate with std::hash * Removes the previous custom implementation * Computed hashes are now always size_t * Introduces a hash_combine helper * Fixes an overwritten partial hash in Relooper.cpp
author: Daniel Wirtz <dcode@dcode.io> 2020-08-13 02:48:52 +0200
committer: GitHub <noreply@github.com> 2020-08-12 17:48:52 -0700
commit: 902469769dee0a3f61e7e5aaca597d3cbac139ad (patch)
tree: 024c9e010e9a1d406632994df747a1657eff937a
parent: f067a45c1e88124173af992e66a7125fe6ab366a (diff)
download: binaryen-902469769dee0a3f61e7e5aaca597d3cbac139ad.tar.gz
binaryen-902469769dee0a3f61e7e5aaca597d3cbac139ad.tar.bz2
binaryen-902469769dee0a3f61e7e5aaca597d3cbac139ad.zip
11 files changed, 86 insertions, 92 deletions
diff --git a/src/cfg/Relooper.cpp b/src/cfg/Relooper.cpp
index 0c9b581bd..b3bbe0153 100644
--- a/src/cfg/Relooper.cpp
+++ b/src/cfg/Relooper.cpp
@@ -669,8 +669,7 @@ struct Optimizer : public RelooperRecursor {
       std::cout << "at parent " << ParentBlock->Id << '\n';
 #endif
       if (ParentBlock->BranchesOut.size() >= 2) {
-        std::unordered_map<wasm::HashType, std::vector<BranchBlock>>
-          HashedBranchesOut;
+        std::unordered_map<size_t, std::vector<BranchBlock>> HashedBranchesOut;
         std::vector<Block*> BlocksToErase;
         for (auto& iter : ParentBlock->BranchesOut) {
           Block* CurrBlock = iter.first;
@@ -999,41 +998,42 @@ private:
   // (like Shapes). Only partially hashes the branches out, no
   // recursion: hashes the branch infos, looks at raw pointers
   // for the blocks.
-  wasm::HashType Hash(Block* Curr) {
-    wasm::HashType Ret = wasm::ExpressionAnalyzer::hash(Curr->Code);
-    Ret = wasm::rehash(Ret, 1);
+  size_t Hash(Block* Curr) {
+    auto digest = wasm::ExpressionAnalyzer::hash(Curr->Code);
+    wasm::rehash(digest, uint8_t(1));
     if (Curr->SwitchCondition) {
-      Ret = wasm::ExpressionAnalyzer::hash(Curr->SwitchCondition);
+      wasm::hash_combine(digest,
+                         wasm::ExpressionAnalyzer::hash(Curr->SwitchCondition));
     }
-    Ret = wasm::rehash(Ret, 2);
+    wasm::rehash(digest, uint8_t(2));
     for (auto& Pair : Curr->BranchesOut) {
       // Hash the Block* as a pointer TODO: full hash?
-      Ret =
-        wasm::rehash(Ret, wasm::HashType(reinterpret_cast<size_t>(Pair.first)));
+      wasm::rehash(digest, reinterpret_cast<size_t>(Pair.first));
       // Hash the Branch info properly
-      Ret = wasm::rehash(Ret, Hash(Pair.second));
+      wasm::hash_combine(digest, Hash(Pair.second));
     }
-    return Ret;
+    return digest;
   }
 
   // Hashes the direct block contents, but not Relooper internals
   // (like Shapes).
-  wasm::HashType Hash(Branch* Curr) {
-    wasm::HashType Ret = 0;
+  size_t Hash(Branch* Curr) {
+    auto digest = wasm::hash(0);
     if (Curr->SwitchValues) {
       for (auto i : *Curr->SwitchValues) {
-        Ret = wasm::rehash(Ret, i); // TODO hash i
+        wasm::rehash(digest, i); // TODO hash i
       }
     } else {
       if (Curr->Condition) {
-        Ret = wasm::ExpressionAnalyzer::hash(Curr->Condition);
+        wasm::hash_combine(digest,
+                           wasm::ExpressionAnalyzer::hash(Curr->Condition));
       }
     }
-    Ret = wasm::rehash(Ret, 1);
+    wasm::rehash(digest, uint8_t(1));
     if (Curr->Code) {
-      Ret = wasm::ExpressionAnalyzer::hash(Curr->Code);
+      wasm::hash_combine(digest, wasm::ExpressionAnalyzer::hash(Curr->Code));
     }
-    return Ret;
+    return digest;
   }
 };
 
diff --git a/src/ir/ExpressionAnalyzer.cpp b/src/ir/ExpressionAnalyzer.cpp
index a6f4b601f..cdbcd9ae0 100644
--- a/src/ir/ExpressionAnalyzer.cpp
+++ b/src/ir/ExpressionAnalyzer.cpp
@@ -408,9 +408,9 @@ bool ExpressionAnalyzer::flexibleEqual(Expression* left,
 }
 
 // hash an expression, ignoring superficial details like specific internal names
-HashType ExpressionAnalyzer::hash(Expression* curr) {
+size_t ExpressionAnalyzer::hash(Expression* curr) {
   struct Hasher {
-    HashType digest = 0;
+    size_t digest = wasm::hash(0);
 
     Index internalCounter = 0;
     // for each internal name, its unique id
@@ -432,7 +432,7 @@ HashType ExpressionAnalyzer::hash(Expression* curr) {
         if (!curr) {
           continue;
         }
-        hash(curr->_id);
+        rehash(digest, curr->_id);
         // we often don't need to hash the type, as it is tied to other values
         // we are hashing anyhow, but there are exceptions: for example, a
         // local.get's type is determined by the function, so if we are
@@ -441,7 +441,7 @@ HashType ExpressionAnalyzer::hash(Expression* curr) {
         // if we hash between modules, then we need to take int account
         // call_imports type, etc. The simplest thing is just to hash the
         // type for all of them.
-        hash(curr->type.getID());
+        rehash(digest, curr->type.getID());
         // Blocks and loops introduce scoping.
         if (auto* block = curr->dynCast<Block>()) {
           noteScopeName(block->name);
@@ -459,15 +459,10 @@ HashType ExpressionAnalyzer::hash(Expression* curr) {
         }
         // Sometimes children are optional, e.g. return, so we must hash
         // their number as well.
-        hash(counter);
+        rehash(digest, counter);
       }
     }
 
-    void hash(HashType hash) { digest = rehash(digest, hash); }
-    void hash64(uint64_t hash) {
-      digest = rehash(rehash(digest, HashType(hash >> 32)), HashType(hash));
-    }
-
     void visitScopeName(Name curr) {
       // Names are relative, we give the same hash for
       // (block $x (br $x))
@@ -475,21 +470,21 @@ HashType ExpressionAnalyzer::hash(Expression* curr) {
       static_assert(sizeof(Index) == sizeof(int32_t),
                     "wasm64 will need changes here");
       assert(internalNames.find(curr) != internalNames.end());
-      return hash(internalNames[curr]);
+      rehash(digest, internalNames[curr]);
     }
-    void visitNonScopeName(Name curr) { return hash64(uint64_t(curr.str)); }
-    void visitInt(int32_t curr) { hash(curr); }
-    void visitLiteral(Literal curr) { hash(std::hash<Literal>()(curr)); }
-    void visitType(Type curr) { hash(int32_t(curr.getSingle())); }
+    void visitNonScopeName(Name curr) { rehash(digest, uint64_t(curr.str)); }
+    void visitInt(int32_t curr) { rehash(digest, curr); }
+    void visitLiteral(Literal curr) { rehash(digest, curr); }
+    void visitType(Type curr) { rehash(digest, curr.getID()); }
     void visitIndex(Index curr) {
       static_assert(sizeof(Index) == sizeof(int32_t),
                     "wasm64 will need changes here");
-      hash(int32_t(curr));
+      rehash(digest, curr);
     }
     void visitAddress(Address curr) {
       static_assert(sizeof(Address) == sizeof(int32_t),
                     "wasm64 will need changes here");
-      hash(int32_t(curr));
+      rehash(digest, curr.addr);
     }
   };
 
diff --git a/src/ir/ExpressionManipulator.cpp b/src/ir/ExpressionManipulator.cpp
index a9fd3c599..57048b9bd 100644
--- a/src/ir/ExpressionManipulator.cpp
+++ b/src/ir/ExpressionManipulator.cpp
@@ -16,7 +16,6 @@
 
 #include "ir/load-utils.h"
 #include "ir/utils.h"
-#include "support/hash.h"
 
 namespace wasm {
 
diff --git a/src/ir/hashed.h b/src/ir/hashed.h
index 563e3abfe..fe959936a 100644
--- a/src/ir/hashed.h
+++ b/src/ir/hashed.h
@@ -26,16 +26,18 @@ namespace wasm {
 // An expression with a cached hash value
 struct HashedExpression {
   Expression* expr;
-  HashType hash;
+  size_t digest;
 
   HashedExpression(Expression* expr) : expr(expr) {
     if (expr) {
-      hash = ExpressionAnalyzer::hash(expr);
+      digest = ExpressionAnalyzer::hash(expr);
+    } else {
+      digest = hash(0);
     }
   }
 
   HashedExpression(const HashedExpression& other)
-    : expr(other.expr), hash(other.hash) {}
+    : expr(other.expr), digest(other.digest) {}
 };
 
 // A pass that hashes all functions
@@ -43,7 +45,7 @@ struct HashedExpression {
 struct FunctionHasher : public WalkerPass<PostWalker<FunctionHasher>> {
   bool isFunctionParallel() override { return true; }
 
-  struct Map : public std::map<Function*, HashType> {};
+  struct Map : public std::map<Function*, size_t> {};
 
   FunctionHasher(Map* output) : output(output) {}
 
@@ -54,22 +56,21 @@ struct FunctionHasher : public WalkerPass<PostWalker<FunctionHasher>> {
     for (auto& func : module->functions) {
       // ensure an entry for each function - we must not modify the map shape in
       // parallel, just the values
-      hashes[func.get()] = 0;
+      hashes[func.get()] = hash(0);
     }
     return hashes;
   }
 
   void doWalkFunction(Function* func) { output->at(func) = hashFunction(func); }
 
-  static HashType hashFunction(Function* func) {
-    HashType ret = 0;
-    ret = rehash(ret, (HashType)func->sig.params.getID());
-    ret = rehash(ret, (HashType)func->sig.results.getID());
+  static size_t hashFunction(Function* func) {
+    auto digest = hash(func->sig.params.getID());
+    rehash(digest, func->sig.results.getID());
     for (auto type : func->vars) {
-      ret = rehash(ret, (HashType)type.getID());
+      rehash(digest, type.getID());
     }
-    ret = rehash(ret, (HashType)ExpressionAnalyzer::hash(func->body));
-    return ret;
+    hash_combine(digest, ExpressionAnalyzer::hash(func->body));
+    return digest;
   }
 
 private:
diff --git a/src/ir/utils.h b/src/ir/utils.h
index a7f6f59bf..176699591 100644
--- a/src/ir/utils.h
+++ b/src/ir/utils.h
@@ -78,7 +78,7 @@ struct ExpressionAnalyzer {
 
   // hash an expression, ignoring superficial details like specific internal
   // names
-  static HashType hash(Expression* curr);
+  static size_t hash(Expression* curr);
 };
 
 // Re-Finalizes all node types. This can be run after code was modified in
diff --git a/src/literal.h b/src/literal.h
index 4f936153d..c58dbb35c 100644
--- a/src/literal.h
+++ b/src/literal.h
@@ -528,18 +528,19 @@ template<> struct hash<wasm::Literal> {
     a.getBits(bytes);
     int64_t chunks[2];
     memcpy(chunks, bytes, sizeof(chunks));
-    return wasm::rehash(wasm::rehash(uint64_t(hash<uint32_t>()(a.type.getID())),
-                                     uint64_t(hash<int64_t>()(chunks[0]))),
-                        uint64_t(hash<int64_t>()(chunks[1])));
+    auto digest = wasm::hash(a.type.getID());
+    wasm::rehash(digest, chunks[0]);
+    wasm::rehash(digest, chunks[1]);
+    return digest;
   }
 };
 template<> struct hash<wasm::Literals> {
   size_t operator()(const wasm::Literals& a) const {
-    size_t h = wasm::rehash(uint64_t(0), uint64_t(a.size()));
+    auto digest = wasm::hash(a.size());
     for (const auto& lit : a) {
-      h = wasm::rehash(uint64_t(h), uint64_t(hash<wasm::Literal>{}(lit)));
+      wasm::rehash(digest, lit);
     }
-    return h;
+    return digest;
   }
 };
 template<> struct less<wasm::Literal> {
diff --git a/src/passes/CodeFolding.cpp b/src/passes/CodeFolding.cpp
index 04819a04d..ee54c50a6 100644
--- a/src/passes/CodeFolding.cpp
+++ b/src/passes/CodeFolding.cpp
@@ -606,25 +606,25 @@ private:
     if (next.size() >= 2) {
       // now we want to find a mergeable item - any item that is equal among a
       // subset
-      std::map<Expression*, HashType> hashes; // expression => hash value
+      std::map<Expression*, size_t> hashes; // expression => hash value
       // hash value => expressions with that hash
-      std::map<HashType, std::vector<Expression*>> hashed;
+      std::map<size_t, std::vector<Expression*>> hashed;
       for (auto& tail : next) {
         auto* item = getItem(tail, num);
         auto hash = hashes[item] = ExpressionAnalyzer::hash(item);
         hashed[hash].push_back(item);
       }
       // look at each hash value exactly once. we do this in a deterministic
-      // order.
-      std::set<HashType> seen;
+      // order by iterating over a vector retaining insertion order.
+      std::set<size_t> seen;
       for (auto& tail : next) {
         auto* item = getItem(tail, num);
-        auto hash = hashes[item];
-        if (seen.count(hash)) {
+        auto digest = hashes[item];
+        if (seen.count(digest)) {
           continue;
         }
-        seen.insert(hash);
-        auto& items = hashed[hash];
+        seen.insert(digest);
+        auto& items = hashed[digest];
         if (items.size() == 1) {
           continue;
         }
diff --git a/src/passes/LocalCSE.cpp b/src/passes/LocalCSE.cpp
index 81a82d1c6..f5cf323ec 100644
--- a/src/passes/LocalCSE.cpp
+++ b/src/passes/LocalCSE.cpp
@@ -62,14 +62,16 @@ struct LocalCSE : public WalkerPass<LinearExecutionWalker<LocalCSE>> {
   };
 
   struct UsableHasher {
-    HashType operator()(const Usable value) const {
-      return rehash(uint64_t(value.hashed.hash), value.localType.getID());
+    size_t operator()(const Usable value) const {
+      auto digest = value.hashed.digest;
+      wasm::rehash(digest, value.localType.getID());
+      return digest;
     }
   };
 
   struct UsableComparer {
     bool operator()(const Usable a, const Usable b) const {
-      if (a.hashed.hash != b.hashed.hash || a.localType != b.localType) {
+      if (a.hashed.digest != b.hashed.digest || a.localType != b.localType) {
         return false;
       }
       return ExpressionAnalyzer::equal(a.hashed.expr, b.hashed.expr);
diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp
index 5a0dedf37..73e5094e8 100644
--- a/src/passes/pass.cpp
+++ b/src/passes/pass.cpp
@@ -686,7 +686,7 @@ struct AfterEffectFunctionChecker {
   // Check Stack IR state: if the main IR changes, there should be no
   // stack IR, as the stack IR would be wrong.
   bool beganWithStackIR;
-  HashType originalFunctionHash;
+  size_t originalFunctionHash;
 
   // In the creator we can scan the state of the module and function before the
   // pass runs.
diff --git a/src/support/hash.h b/src/support/hash.h
index 647369ac8..50ecec563 100644
--- a/src/support/hash.h
+++ b/src/support/hash.h
@@ -22,27 +22,22 @@
 
 namespace wasm {
 
-typedef uint32_t HashType;
-
-inline HashType rehash(HashType x, HashType y) {
-  // see http://www.cse.yorku.ca/~oz/hash.html and
-  // https://stackoverflow.com/a/2595226/1176841
-  HashType hash = 5381;
-  while (x) {
-    hash = ((hash << 5) + hash) ^ (x & 0xff);
-    x >>= 8;
-  }
-  while (y) {
-    hash = ((hash << 5) + hash) ^ (y & 0xff);
-    y >>= 8;
-  }
-  return hash;
+// Computes the digest of `value`.
+template<typename T> inline std::size_t hash(const T& value) {
+  return std::hash<T>{}(value);
 }
 
-inline uint64_t rehash(uint64_t x, uint64_t y) {
-  auto ret = rehash(HashType(x), HashType(x >> 32));
-  ret = rehash(ret, HashType(y));
-  return rehash(ret, HashType(y >> 32));
+// Combines two digests into the first digest. Use instead of `rehash` if
+// `otherDigest` is another digest and not a `size_t` value.
+static inline void hash_combine(std::size_t& digest, std::size_t otherDigest) {
+  // see boost/container_hash/hash.hpp
+  digest ^= otherDigest + 0x9e3779b9 + (digest << 6) + (digest >> 2);
+}
+
+// Hashes `value` and combines the resulting digest into the existing digest.
+// Use instead of `hash_combine` if `value` is not another digest.
+template<typename T> inline void rehash(std::size_t& digest, const T& value) {
+  hash_combine(digest, hash(value));
 }
 
 } // namespace wasm
diff --git a/src/wasm/wasm-type.cpp b/src/wasm/wasm-type.cpp
index b49f0338f..c8d8a94c5 100644
--- a/src/wasm/wasm-type.cpp
+++ b/src/wasm/wasm-type.cpp
@@ -30,21 +30,22 @@ namespace std {
 template<> class hash<vector<wasm::Type>> {
 public:
   size_t operator()(const vector<wasm::Type>& types) const {
-    uint64_t res = wasm::rehash(0, uint32_t(types.size()));
+    auto digest = wasm::hash(types.size());
     for (auto t : types) {
-      res = wasm::rehash(res, t.getID());
+      wasm::rehash(digest, t.getID());
     }
-    return res;
+    return digest;
   }
 };
 
 size_t hash<wasm::Type>::operator()(const wasm::Type& type) const {
-  return hash<uint64_t>{}(type.getID());
+  return wasm::hash(type.getID());
 }
 
 size_t hash<wasm::Signature>::operator()(const wasm::Signature& sig) const {
-  return wasm::rehash(uint64_t(hash<uint64_t>{}(sig.params.getID())),
-                      uint64_t(hash<uint64_t>{}(sig.results.getID())));
+  auto digest = wasm::hash(sig.params.getID());
+  wasm::rehash(digest, sig.results.getID());
+  return digest;
 }
 
 } // namespace std
author	Daniel Wirtz <dcode@dcode.io>	2020-08-13 02:48:52 +0200
committer	GitHub <noreply@github.com>	2020-08-12 17:48:52 -0700
commit	902469769dee0a3f61e7e5aaca597d3cbac139ad (patch)
tree	024c9e010e9a1d406632994df747a1657eff937a
parent	f067a45c1e88124173af992e66a7125fe6ab366a (diff)
download	binaryen-902469769dee0a3f61e7e5aaca597d3cbac139ad.tar.gz binaryen-902469769dee0a3f61e7e5aaca597d3cbac139ad.tar.bz2 binaryen-902469769dee0a3f61e7e5aaca597d3cbac139ad.zip