diff options
author | Daniel Wirtz <dcode@dcode.io> | 2020-08-13 02:48:52 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-08-12 17:48:52 -0700 |
commit | 902469769dee0a3f61e7e5aaca597d3cbac139ad (patch) | |
tree | 024c9e010e9a1d406632994df747a1657eff937a | |
parent | f067a45c1e88124173af992e66a7125fe6ab366a (diff) | |
download | binaryen-902469769dee0a3f61e7e5aaca597d3cbac139ad.tar.gz binaryen-902469769dee0a3f61e7e5aaca597d3cbac139ad.tar.bz2 binaryen-902469769dee0a3f61e7e5aaca597d3cbac139ad.zip |
Refactor hashing (#3023)
* Unifies internal hashing helpers to naturally integrate with std::hash
* Removes the previous custom implementation
* Computed hashes are now always size_t
* Introduces a hash_combine helper
* Fixes an overwritten partial hash in Relooper.cpp
-rw-r--r-- | src/cfg/Relooper.cpp | 36 | ||||
-rw-r--r-- | src/ir/ExpressionAnalyzer.cpp | 29 | ||||
-rw-r--r-- | src/ir/ExpressionManipulator.cpp | 1 | ||||
-rw-r--r-- | src/ir/hashed.h | 25 | ||||
-rw-r--r-- | src/ir/utils.h | 2 | ||||
-rw-r--r-- | src/literal.h | 13 | ||||
-rw-r--r-- | src/passes/CodeFolding.cpp | 16 | ||||
-rw-r--r-- | src/passes/LocalCSE.cpp | 8 | ||||
-rw-r--r-- | src/passes/pass.cpp | 2 | ||||
-rw-r--r-- | src/support/hash.h | 33 | ||||
-rw-r--r-- | src/wasm/wasm-type.cpp | 13 |
11 files changed, 86 insertions, 92 deletions
diff --git a/src/cfg/Relooper.cpp b/src/cfg/Relooper.cpp index 0c9b581bd..b3bbe0153 100644 --- a/src/cfg/Relooper.cpp +++ b/src/cfg/Relooper.cpp @@ -669,8 +669,7 @@ struct Optimizer : public RelooperRecursor { std::cout << "at parent " << ParentBlock->Id << '\n'; #endif if (ParentBlock->BranchesOut.size() >= 2) { - std::unordered_map<wasm::HashType, std::vector<BranchBlock>> - HashedBranchesOut; + std::unordered_map<size_t, std::vector<BranchBlock>> HashedBranchesOut; std::vector<Block*> BlocksToErase; for (auto& iter : ParentBlock->BranchesOut) { Block* CurrBlock = iter.first; @@ -999,41 +998,42 @@ private: // (like Shapes). Only partially hashes the branches out, no // recursion: hashes the branch infos, looks at raw pointers // for the blocks. - wasm::HashType Hash(Block* Curr) { - wasm::HashType Ret = wasm::ExpressionAnalyzer::hash(Curr->Code); - Ret = wasm::rehash(Ret, 1); + size_t Hash(Block* Curr) { + auto digest = wasm::ExpressionAnalyzer::hash(Curr->Code); + wasm::rehash(digest, uint8_t(1)); if (Curr->SwitchCondition) { - Ret = wasm::ExpressionAnalyzer::hash(Curr->SwitchCondition); + wasm::hash_combine(digest, + wasm::ExpressionAnalyzer::hash(Curr->SwitchCondition)); } - Ret = wasm::rehash(Ret, 2); + wasm::rehash(digest, uint8_t(2)); for (auto& Pair : Curr->BranchesOut) { // Hash the Block* as a pointer TODO: full hash? - Ret = - wasm::rehash(Ret, wasm::HashType(reinterpret_cast<size_t>(Pair.first))); + wasm::rehash(digest, reinterpret_cast<size_t>(Pair.first)); // Hash the Branch info properly - Ret = wasm::rehash(Ret, Hash(Pair.second)); + wasm::hash_combine(digest, Hash(Pair.second)); } - return Ret; + return digest; } // Hashes the direct block contents, but not Relooper internals // (like Shapes). - wasm::HashType Hash(Branch* Curr) { - wasm::HashType Ret = 0; + size_t Hash(Branch* Curr) { + auto digest = wasm::hash(0); if (Curr->SwitchValues) { for (auto i : *Curr->SwitchValues) { - Ret = wasm::rehash(Ret, i); // TODO hash i + wasm::rehash(digest, i); // TODO hash i } } else { if (Curr->Condition) { - Ret = wasm::ExpressionAnalyzer::hash(Curr->Condition); + wasm::hash_combine(digest, + wasm::ExpressionAnalyzer::hash(Curr->Condition)); } } - Ret = wasm::rehash(Ret, 1); + wasm::rehash(digest, uint8_t(1)); if (Curr->Code) { - Ret = wasm::ExpressionAnalyzer::hash(Curr->Code); + wasm::hash_combine(digest, wasm::ExpressionAnalyzer::hash(Curr->Code)); } - return Ret; + return digest; } }; diff --git a/src/ir/ExpressionAnalyzer.cpp b/src/ir/ExpressionAnalyzer.cpp index a6f4b601f..cdbcd9ae0 100644 --- a/src/ir/ExpressionAnalyzer.cpp +++ b/src/ir/ExpressionAnalyzer.cpp @@ -408,9 +408,9 @@ bool ExpressionAnalyzer::flexibleEqual(Expression* left, } // hash an expression, ignoring superficial details like specific internal names -HashType ExpressionAnalyzer::hash(Expression* curr) { +size_t ExpressionAnalyzer::hash(Expression* curr) { struct Hasher { - HashType digest = 0; + size_t digest = wasm::hash(0); Index internalCounter = 0; // for each internal name, its unique id @@ -432,7 +432,7 @@ HashType ExpressionAnalyzer::hash(Expression* curr) { if (!curr) { continue; } - hash(curr->_id); + rehash(digest, curr->_id); // we often don't need to hash the type, as it is tied to other values // we are hashing anyhow, but there are exceptions: for example, a // local.get's type is determined by the function, so if we are @@ -441,7 +441,7 @@ HashType ExpressionAnalyzer::hash(Expression* curr) { // if we hash between modules, then we need to take int account // call_imports type, etc. The simplest thing is just to hash the // type for all of them. - hash(curr->type.getID()); + rehash(digest, curr->type.getID()); // Blocks and loops introduce scoping. if (auto* block = curr->dynCast<Block>()) { noteScopeName(block->name); @@ -459,15 +459,10 @@ HashType ExpressionAnalyzer::hash(Expression* curr) { } // Sometimes children are optional, e.g. return, so we must hash // their number as well. - hash(counter); + rehash(digest, counter); } } - void hash(HashType hash) { digest = rehash(digest, hash); } - void hash64(uint64_t hash) { - digest = rehash(rehash(digest, HashType(hash >> 32)), HashType(hash)); - } - void visitScopeName(Name curr) { // Names are relative, we give the same hash for // (block $x (br $x)) @@ -475,21 +470,21 @@ HashType ExpressionAnalyzer::hash(Expression* curr) { static_assert(sizeof(Index) == sizeof(int32_t), "wasm64 will need changes here"); assert(internalNames.find(curr) != internalNames.end()); - return hash(internalNames[curr]); + rehash(digest, internalNames[curr]); } - void visitNonScopeName(Name curr) { return hash64(uint64_t(curr.str)); } - void visitInt(int32_t curr) { hash(curr); } - void visitLiteral(Literal curr) { hash(std::hash<Literal>()(curr)); } - void visitType(Type curr) { hash(int32_t(curr.getSingle())); } + void visitNonScopeName(Name curr) { rehash(digest, uint64_t(curr.str)); } + void visitInt(int32_t curr) { rehash(digest, curr); } + void visitLiteral(Literal curr) { rehash(digest, curr); } + void visitType(Type curr) { rehash(digest, curr.getID()); } void visitIndex(Index curr) { static_assert(sizeof(Index) == sizeof(int32_t), "wasm64 will need changes here"); - hash(int32_t(curr)); + rehash(digest, curr); } void visitAddress(Address curr) { static_assert(sizeof(Address) == sizeof(int32_t), "wasm64 will need changes here"); - hash(int32_t(curr)); + rehash(digest, curr.addr); } }; diff --git a/src/ir/ExpressionManipulator.cpp b/src/ir/ExpressionManipulator.cpp index a9fd3c599..57048b9bd 100644 --- a/src/ir/ExpressionManipulator.cpp +++ b/src/ir/ExpressionManipulator.cpp @@ -16,7 +16,6 @@ #include "ir/load-utils.h" #include "ir/utils.h" -#include "support/hash.h" namespace wasm { diff --git a/src/ir/hashed.h b/src/ir/hashed.h index 563e3abfe..fe959936a 100644 --- a/src/ir/hashed.h +++ b/src/ir/hashed.h @@ -26,16 +26,18 @@ namespace wasm { // An expression with a cached hash value struct HashedExpression { Expression* expr; - HashType hash; + size_t digest; HashedExpression(Expression* expr) : expr(expr) { if (expr) { - hash = ExpressionAnalyzer::hash(expr); + digest = ExpressionAnalyzer::hash(expr); + } else { + digest = hash(0); } } HashedExpression(const HashedExpression& other) - : expr(other.expr), hash(other.hash) {} + : expr(other.expr), digest(other.digest) {} }; // A pass that hashes all functions @@ -43,7 +45,7 @@ struct HashedExpression { struct FunctionHasher : public WalkerPass<PostWalker<FunctionHasher>> { bool isFunctionParallel() override { return true; } - struct Map : public std::map<Function*, HashType> {}; + struct Map : public std::map<Function*, size_t> {}; FunctionHasher(Map* output) : output(output) {} @@ -54,22 +56,21 @@ struct FunctionHasher : public WalkerPass<PostWalker<FunctionHasher>> { for (auto& func : module->functions) { // ensure an entry for each function - we must not modify the map shape in // parallel, just the values - hashes[func.get()] = 0; + hashes[func.get()] = hash(0); } return hashes; } void doWalkFunction(Function* func) { output->at(func) = hashFunction(func); } - static HashType hashFunction(Function* func) { - HashType ret = 0; - ret = rehash(ret, (HashType)func->sig.params.getID()); - ret = rehash(ret, (HashType)func->sig.results.getID()); + static size_t hashFunction(Function* func) { + auto digest = hash(func->sig.params.getID()); + rehash(digest, func->sig.results.getID()); for (auto type : func->vars) { - ret = rehash(ret, (HashType)type.getID()); + rehash(digest, type.getID()); } - ret = rehash(ret, (HashType)ExpressionAnalyzer::hash(func->body)); - return ret; + hash_combine(digest, ExpressionAnalyzer::hash(func->body)); + return digest; } private: diff --git a/src/ir/utils.h b/src/ir/utils.h index a7f6f59bf..176699591 100644 --- a/src/ir/utils.h +++ b/src/ir/utils.h @@ -78,7 +78,7 @@ struct ExpressionAnalyzer { // hash an expression, ignoring superficial details like specific internal // names - static HashType hash(Expression* curr); + static size_t hash(Expression* curr); }; // Re-Finalizes all node types. This can be run after code was modified in diff --git a/src/literal.h b/src/literal.h index 4f936153d..c58dbb35c 100644 --- a/src/literal.h +++ b/src/literal.h @@ -528,18 +528,19 @@ template<> struct hash<wasm::Literal> { a.getBits(bytes); int64_t chunks[2]; memcpy(chunks, bytes, sizeof(chunks)); - return wasm::rehash(wasm::rehash(uint64_t(hash<uint32_t>()(a.type.getID())), - uint64_t(hash<int64_t>()(chunks[0]))), - uint64_t(hash<int64_t>()(chunks[1]))); + auto digest = wasm::hash(a.type.getID()); + wasm::rehash(digest, chunks[0]); + wasm::rehash(digest, chunks[1]); + return digest; } }; template<> struct hash<wasm::Literals> { size_t operator()(const wasm::Literals& a) const { - size_t h = wasm::rehash(uint64_t(0), uint64_t(a.size())); + auto digest = wasm::hash(a.size()); for (const auto& lit : a) { - h = wasm::rehash(uint64_t(h), uint64_t(hash<wasm::Literal>{}(lit))); + wasm::rehash(digest, lit); } - return h; + return digest; } }; template<> struct less<wasm::Literal> { diff --git a/src/passes/CodeFolding.cpp b/src/passes/CodeFolding.cpp index 04819a04d..ee54c50a6 100644 --- a/src/passes/CodeFolding.cpp +++ b/src/passes/CodeFolding.cpp @@ -606,25 +606,25 @@ private: if (next.size() >= 2) { // now we want to find a mergeable item - any item that is equal among a // subset - std::map<Expression*, HashType> hashes; // expression => hash value + std::map<Expression*, size_t> hashes; // expression => hash value // hash value => expressions with that hash - std::map<HashType, std::vector<Expression*>> hashed; + std::map<size_t, std::vector<Expression*>> hashed; for (auto& tail : next) { auto* item = getItem(tail, num); auto hash = hashes[item] = ExpressionAnalyzer::hash(item); hashed[hash].push_back(item); } // look at each hash value exactly once. we do this in a deterministic - // order. - std::set<HashType> seen; + // order by iterating over a vector retaining insertion order. + std::set<size_t> seen; for (auto& tail : next) { auto* item = getItem(tail, num); - auto hash = hashes[item]; - if (seen.count(hash)) { + auto digest = hashes[item]; + if (seen.count(digest)) { continue; } - seen.insert(hash); - auto& items = hashed[hash]; + seen.insert(digest); + auto& items = hashed[digest]; if (items.size() == 1) { continue; } diff --git a/src/passes/LocalCSE.cpp b/src/passes/LocalCSE.cpp index 81a82d1c6..f5cf323ec 100644 --- a/src/passes/LocalCSE.cpp +++ b/src/passes/LocalCSE.cpp @@ -62,14 +62,16 @@ struct LocalCSE : public WalkerPass<LinearExecutionWalker<LocalCSE>> { }; struct UsableHasher { - HashType operator()(const Usable value) const { - return rehash(uint64_t(value.hashed.hash), value.localType.getID()); + size_t operator()(const Usable value) const { + auto digest = value.hashed.digest; + wasm::rehash(digest, value.localType.getID()); + return digest; } }; struct UsableComparer { bool operator()(const Usable a, const Usable b) const { - if (a.hashed.hash != b.hashed.hash || a.localType != b.localType) { + if (a.hashed.digest != b.hashed.digest || a.localType != b.localType) { return false; } return ExpressionAnalyzer::equal(a.hashed.expr, b.hashed.expr); diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp index 5a0dedf37..73e5094e8 100644 --- a/src/passes/pass.cpp +++ b/src/passes/pass.cpp @@ -686,7 +686,7 @@ struct AfterEffectFunctionChecker { // Check Stack IR state: if the main IR changes, there should be no // stack IR, as the stack IR would be wrong. bool beganWithStackIR; - HashType originalFunctionHash; + size_t originalFunctionHash; // In the creator we can scan the state of the module and function before the // pass runs. diff --git a/src/support/hash.h b/src/support/hash.h index 647369ac8..50ecec563 100644 --- a/src/support/hash.h +++ b/src/support/hash.h @@ -22,27 +22,22 @@ namespace wasm { -typedef uint32_t HashType; - -inline HashType rehash(HashType x, HashType y) { - // see http://www.cse.yorku.ca/~oz/hash.html and - // https://stackoverflow.com/a/2595226/1176841 - HashType hash = 5381; - while (x) { - hash = ((hash << 5) + hash) ^ (x & 0xff); - x >>= 8; - } - while (y) { - hash = ((hash << 5) + hash) ^ (y & 0xff); - y >>= 8; - } - return hash; +// Computes the digest of `value`. +template<typename T> inline std::size_t hash(const T& value) { + return std::hash<T>{}(value); } -inline uint64_t rehash(uint64_t x, uint64_t y) { - auto ret = rehash(HashType(x), HashType(x >> 32)); - ret = rehash(ret, HashType(y)); - return rehash(ret, HashType(y >> 32)); +// Combines two digests into the first digest. Use instead of `rehash` if +// `otherDigest` is another digest and not a `size_t` value. +static inline void hash_combine(std::size_t& digest, std::size_t otherDigest) { + // see boost/container_hash/hash.hpp + digest ^= otherDigest + 0x9e3779b9 + (digest << 6) + (digest >> 2); +} + +// Hashes `value` and combines the resulting digest into the existing digest. +// Use instead of `hash_combine` if `value` is not another digest. +template<typename T> inline void rehash(std::size_t& digest, const T& value) { + hash_combine(digest, hash(value)); } } // namespace wasm diff --git a/src/wasm/wasm-type.cpp b/src/wasm/wasm-type.cpp index b49f0338f..c8d8a94c5 100644 --- a/src/wasm/wasm-type.cpp +++ b/src/wasm/wasm-type.cpp @@ -30,21 +30,22 @@ namespace std { template<> class hash<vector<wasm::Type>> { public: size_t operator()(const vector<wasm::Type>& types) const { - uint64_t res = wasm::rehash(0, uint32_t(types.size())); + auto digest = wasm::hash(types.size()); for (auto t : types) { - res = wasm::rehash(res, t.getID()); + wasm::rehash(digest, t.getID()); } - return res; + return digest; } }; size_t hash<wasm::Type>::operator()(const wasm::Type& type) const { - return hash<uint64_t>{}(type.getID()); + return wasm::hash(type.getID()); } size_t hash<wasm::Signature>::operator()(const wasm::Signature& sig) const { - return wasm::rehash(uint64_t(hash<uint64_t>{}(sig.params.getID())), - uint64_t(hash<uint64_t>{}(sig.results.getID()))); + auto digest = wasm::hash(sig.params.getID()); + wasm::rehash(digest, sig.results.getID()); + return digest; } } // namespace std |