diff options
Diffstat (limited to 'src/passes/CoalesceLocals.cpp')
-rw-r--r-- | src/passes/CoalesceLocals.cpp | 186 |
1 files changed, 117 insertions, 69 deletions
diff --git a/src/passes/CoalesceLocals.cpp b/src/passes/CoalesceLocals.cpp index 621383ca4..a085b61fb 100644 --- a/src/passes/CoalesceLocals.cpp +++ b/src/passes/CoalesceLocals.cpp @@ -14,32 +14,31 @@ * limitations under the License. */ - // // Coalesce locals, in order to reduce the total number of locals. This // is similar to register allocation, however, there is never any // spilling, and there isn't a fixed number of locals. // - #include <algorithm> #include <memory> #include <unordered_set> -#include "wasm.h" -#include "pass.h" -#include "ir/utils.h" #include "cfg/liveness-traversal.h" -#include "wasm-builder.h" +#include "ir/utils.h" +#include "pass.h" #include "support/learning.h" #include "support/permutations.h" +#include "wasm-builder.h" +#include "wasm.h" #ifdef CFG_PROFILE #include "support/timing.h" #endif namespace wasm { -struct CoalesceLocals : public WalkerPass<LivenessWalker<CoalesceLocals, Visitor<CoalesceLocals>>> { +struct CoalesceLocals + : public WalkerPass<LivenessWalker<CoalesceLocals, Visitor<CoalesceLocals>>> { bool isFunctionParallel() override { return true; } Pass* create() override { return new CoalesceLocals; } @@ -54,23 +53,30 @@ struct CoalesceLocals : public WalkerPass<LivenessWalker<CoalesceLocals, Visitor void calculateInterferences(const LocalSet& locals); - void pickIndicesFromOrder(std::vector<Index>& order, std::vector<Index>& indices); - void pickIndicesFromOrder(std::vector<Index>& order, std::vector<Index>& indices, Index& removedCopies); + void pickIndicesFromOrder(std::vector<Index>& order, + std::vector<Index>& indices); + void pickIndicesFromOrder(std::vector<Index>& order, + std::vector<Index>& indices, + Index& removedCopies); - virtual void pickIndices(std::vector<Index>& indices); // returns a vector of oldIndex => newIndex + // returns a vector of oldIndex => newIndex + virtual void pickIndices(std::vector<Index>& indices); void applyIndices(std::vector<Index>& indices, Expression* root); // interference state - std::vector<bool> interferences; // canonicalized - accesses should check (low, high) + // canonicalized - accesses should check (low, high) + std::vector<bool> interferences; void interfere(Index i, Index j) { - if (i == j) return; + if (i == j) + return; interferences[std::min(i, j) * numLocals + std::max(i, j)] = 1; } - void interfereLowHigh(Index low, Index high) { // optimized version where you know that low < high + // optimized version where you know that low < high + void interfereLowHigh(Index low, Index high) { assert(low < high); interferences[low * numLocals + high] = 1; } @@ -97,20 +103,25 @@ void CoalesceLocals::doWalkFunction(Function* func) { applyIndices(indices, func->body); } -// A copy on a backedge can be especially costly, forcing us to branch just to do that copy. -// Add weight to such copies, so we prioritize getting rid of them. +// A copy on a backedge can be especially costly, forcing us to branch just to +// do that copy. Add weight to such copies, so we prioritize getting rid of +// them. void CoalesceLocals::increaseBackEdgePriorities() { for (auto* loopTop : loopTops) { // ignore the first edge, it is the initial entry, we just want backedges auto& in = loopTop->in; for (Index i = 1; i < in.size(); i++) { auto* arrivingBlock = in[i]; - if (arrivingBlock->out.size() > 1) continue; // we just want unconditional branches to the loop top, true phi fragments + if (arrivingBlock->out.size() > 1) + // we just want unconditional branches to the loop top, true phi + // fragments + continue; for (auto& action : arrivingBlock->contents.actions) { if (action.isSet()) { auto* set = (*action.origin)->cast<SetLocal>(); if (auto* get = getCopy(set)) { - // this is indeed a copy, add to the cost (default cost is 2, so this adds 50%, and can mostly break ties) + // this is indeed a copy, add to the cost (default cost is 2, so + // this adds 50%, and can mostly break ties) addCopy(set->index, get->index); } } @@ -123,8 +134,10 @@ void CoalesceLocals::calculateInterferences() { interferences.resize(numLocals * numLocals); std::fill(interferences.begin(), interferences.end(), false); for (auto& curr : basicBlocks) { - if (liveBlocks.count(curr.get()) == 0) continue; // ignore dead blocks - // everything coming in might interfere, as it might come from a different block + if (liveBlocks.count(curr.get()) == 0) + continue; // ignore dead blocks + // everything coming in might interfere, as it might come from a different + // block auto live = curr->contents.end; calculateInterferences(live); // scan through the block itself @@ -166,18 +179,22 @@ void CoalesceLocals::calculateInterferences(const LocalSet& locals) { // Indices decision making -void CoalesceLocals::pickIndicesFromOrder(std::vector<Index>& order, std::vector<Index>& indices) { +void CoalesceLocals::pickIndicesFromOrder(std::vector<Index>& order, + std::vector<Index>& indices) { Index removedCopies; pickIndicesFromOrder(order, indices, removedCopies); } -void CoalesceLocals::pickIndicesFromOrder(std::vector<Index>& order, std::vector<Index>& indices, Index& removedCopies) { - // mostly-simple greedy coloring +void CoalesceLocals::pickIndicesFromOrder(std::vector<Index>& order, + std::vector<Index>& indices, + Index& removedCopies) { +// mostly-simple greedy coloring #if CFG_DEBUG std::cerr << "\npickIndicesFromOrder on " << getFunction()->name << '\n'; std::cerr << getFunction()->body << '\n'; std::cerr << "order:\n"; - for (auto i : order) std::cerr << i << ' '; + for (auto i : order) + std::cerr << i << ' '; std::cerr << '\n'; std::cerr << "interferences:\n"; for (Index i = 0; i < numLocals; i++) { @@ -204,16 +221,20 @@ void CoalesceLocals::pickIndicesFromOrder(std::vector<Index>& order, std::vector std::cerr << " $" << i << ": " << totalCopies[i] << '\n'; } #endif - // TODO: take into account distribution (99-1 is better than 50-50 with two registers, for gzip) + // TODO: take into account distribution (99-1 is better than 50-50 with two + // registers, for gzip) std::vector<Type> types; - std::vector<bool> newInterferences; // new index * numLocals => list of all interferences of locals merged to it - std::vector<uint8_t> newCopies; // new index * numLocals => list of all copies of locals merged to it + // new index * numLocals => list of all interferences of locals merged to it + std::vector<bool> newInterferences; + // new index * numLocals => list of all copies of locals merged to it + std::vector<uint8_t> newCopies; indices.resize(numLocals); types.resize(numLocals); newInterferences.resize(numLocals * numLocals); std::fill(newInterferences.begin(), newInterferences.end(), false); auto numParams = getFunction()->getNumParams(); - newCopies.resize(numParams * numLocals); // start with enough room for the params + // start with enough room for the params + newCopies.resize(numParams * numLocals); std::fill(newCopies.begin(), newCopies.end(), 0); Index nextFree = 0; removedCopies = 0; @@ -234,9 +255,12 @@ void CoalesceLocals::pickIndicesFromOrder(std::vector<Index>& order, std::vector Index found = -1; uint8_t foundCopies = -1; for (Index j = 0; j < nextFree; j++) { - if (!newInterferences[j * numLocals + actual] && getFunction()->getLocalType(actual) == types[j]) { - // this does not interfere, so it might be what we want. but pick the one eliminating the most copies - // (we could stop looking forward when there are no more items that have copies anyhow, but it doesn't seem to help) + if (!newInterferences[j * numLocals + actual] && + getFunction()->getLocalType(actual) == types[j]) { + // this does not interfere, so it might be what we want. but pick the + // one eliminating the most copies (we could stop looking forward when + // there are no more items that have copies anyhow, but it doesn't seem + // to help) auto currCopies = newCopies[j * numLocals + actual]; if (found == Index(-1) || currCopies > foundCopies) { indices[actual] = found = j; @@ -258,46 +282,53 @@ void CoalesceLocals::pickIndicesFromOrder(std::vector<Index>& order, std::vector #endif // merge new interferences and copies for the new index for (Index k = i + 1; k < numLocals; k++) { - auto j = order[k]; // go in the order, we only need to update for those we will see later - newInterferences[found * numLocals + j] = newInterferences[found * numLocals + j] | interferes(actual, j); + // go in the order, we only need to update for those we will see later + auto j = order[k]; + newInterferences[found * numLocals + j] = + newInterferences[found * numLocals + j] | interferes(actual, j); newCopies[found * numLocals + j] += getCopies(actual, j); } } } -// given a baseline order, adjust it based on an important order of priorities (higher values -// are higher priority). The priorities take precedence, unless they are equal and then -// the original order should be kept. -std::vector<Index> adjustOrderByPriorities(std::vector<Index>& baseline, std::vector<Index>& priorities) { +// given a baseline order, adjust it based on an important order of priorities +// (higher values are higher priority). The priorities take precedence, unless +// they are equal and then the original order should be kept. +std::vector<Index> adjustOrderByPriorities(std::vector<Index>& baseline, + std::vector<Index>& priorities) { std::vector<Index> ret = baseline; std::vector<Index> reversed = makeReversed(baseline); std::sort(ret.begin(), ret.end(), [&priorities, &reversed](Index x, Index y) { - return priorities[x] > priorities[y] || (priorities[x] == priorities[y] && reversed[x] < reversed[y]); + return priorities[x] > priorities[y] || + (priorities[x] == priorities[y] && reversed[x] < reversed[y]); }); return ret; } void CoalesceLocals::pickIndices(std::vector<Index>& indices) { - if (numLocals == 0) return; + if (numLocals == 0) + return; if (numLocals == 1) { indices.push_back(0); return; } - // take into account total copies. but we must keep params in place, so give them max priority + // take into account total copies. but we must keep params in place, so give + // them max priority auto adjustedTotalCopies = totalCopies; auto numParams = getFunction()->getNumParams(); for (Index i = 0; i < numParams; i++) { adjustedTotalCopies[i] = std::numeric_limits<Index>::max(); } - // first try the natural order. this is less arbitrary than it seems, as the program - // may have a natural order of locals inherent in it. + // first try the natural order. this is less arbitrary than it seems, as the + // program may have a natural order of locals inherent in it. auto order = makeIdentity(numLocals); order = adjustOrderByPriorities(order, adjustedTotalCopies); Index removedCopies; pickIndicesFromOrder(order, indices, removedCopies); auto maxIndex = *std::max_element(indices.begin(), indices.end()); - // next try the reverse order. this both gives us another chance at something good, - // and also the very naturalness of the simple order may be quite suboptimal + // next try the reverse order. this both gives us another chance at something + // good, and also the very naturalness of the simple order may be quite + // suboptimal setIdentity(order); for (Index i = numParams; i < numLocals; i++) { order[i] = numParams + numLocals - 1 - i; @@ -306,15 +337,18 @@ void CoalesceLocals::pickIndices(std::vector<Index>& indices) { std::vector<Index> reverseIndices; Index reverseRemovedCopies; pickIndicesFromOrder(order, reverseIndices, reverseRemovedCopies); - auto reverseMaxIndex = *std::max_element(reverseIndices.begin(), reverseIndices.end()); - // prefer to remove copies foremost, as it matters more for code size (minus gzip), and - // improves throughput. - if (reverseRemovedCopies > removedCopies || (reverseRemovedCopies == removedCopies && reverseMaxIndex < maxIndex)) { + auto reverseMaxIndex = + *std::max_element(reverseIndices.begin(), reverseIndices.end()); + // prefer to remove copies foremost, as it matters more for code size (minus + // gzip), and improves throughput. + if (reverseRemovedCopies > removedCopies || + (reverseRemovedCopies == removedCopies && reverseMaxIndex < maxIndex)) { indices.swap(reverseIndices); } } -void CoalesceLocals::applyIndices(std::vector<Index>& indices, Expression* root) { +void CoalesceLocals::applyIndices(std::vector<Index>& indices, + Expression* root) { assert(indices.size() == numLocals); for (auto& curr : basicBlocks) { auto& actions = curr->contents.actions; @@ -325,15 +359,19 @@ void CoalesceLocals::applyIndices(std::vector<Index>& indices, Expression* root) } else if (action.isSet()) { auto* set = (*action.origin)->cast<SetLocal>(); set->index = indices[set->index]; - // in addition, we can optimize out redundant copies and ineffective sets + // in addition, we can optimize out redundant copies and ineffective + // sets GetLocal* get; - if ((get = set->value->dynCast<GetLocal>()) && get->index == set->index) { + if ((get = set->value->dynCast<GetLocal>()) && + get->index == set->index) { action.removeCopy(); continue; } // remove ineffective actions if (!action.effective) { - *action.origin = set->value; // value may have no side effects, further optimizations can eliminate it + // value may have no side effects, further optimizations can eliminate + // it + *action.origin = set->value; if (!set->isTee()) { // we need to drop it Drop* drop = ExpressionManipulator::convert<SetLocal, Drop>(set); @@ -382,10 +420,12 @@ void CoalesceLocalsWithLearning::pickIndices(std::vector<Index>& indices) { double getFitness() { return fitness; } void dump(std::string text) { std::cout << text + ": ( "; - for (Index i = 0; i < size(); i++) std::cout << (*this)[i] << " "; + for (Index i = 0; i < size(); i++) + std::cout << (*this)[i] << " "; std::cout << ")\n"; std::cout << "of quality: " << getFitness() << "\n"; } + private: double fitness; }; @@ -405,9 +445,11 @@ void CoalesceLocalsWithLearning::pickIndices(std::vector<Index>& indices) { // secondarily, it is nice to not reorder locals unnecessarily double fragment = 1.0 / (2.0 * parent->numLocals); for (Index i = 0; i < parent->numLocals; i++) { - if ((*order)[i] == i) fitness += fragment; // boost for each that wasn't moved + if ((*order)[i] == i) + fitness += fragment; // boost for each that wasn't moved } - fitness = (100 * fitness) + removedCopies; // removing copies is a secondary concern + // removing copies is a secondary concern + fitness = (100 * fitness) + removedCopies; order->setFitness(fitness); } @@ -418,16 +460,19 @@ void CoalesceLocalsWithLearning::pickIndices(std::vector<Index>& indices) { (*ret)[i] = i; } if (first) { - // as the first guess, use the natural order. this is not arbitrary for two reasons. - // first, there may be an inherent order in the input (frequent indices are lower, - // etc.). second, by ensuring we start with the natural order, we ensure we are at - // least as good as the non-learning variant. - // TODO: use ::pickIndices from the parent, so we literally get the simpler approach - // as our first option + // as the first guess, use the natural order. this is not arbitrary for + // two reasons. first, there may be an inherent order in the input + // (frequent indices are lower, etc.). second, by ensuring we start with + // the natural order, we ensure we are at least as good as the + // non-learning variant. + // TODO: use ::pickIndices from the parent, so we literally get the + // simpler approach as our first option first = false; } else { // leave params alone, shuffle the rest - std::shuffle(ret->begin() + parent->getFunction()->getNumParams(), ret->end(), noise); + std::shuffle(ret->begin() + parent->getFunction()->getNumParams(), + ret->end(), + noise); } calculateFitness(ret); #ifdef CFG_LEARN_DEBUG @@ -455,7 +500,9 @@ void CoalesceLocalsWithLearning::pickIndices(std::vector<Index>& indices) { // if (i, i + 1) is in reverse order in right, flip them if (reverseRight[(*ret)[i]] > reverseRight[(*ret)[i + 1]]) { std::swap((*ret)[i], (*ret)[i + 1]); - i++; // if we don't skip, we might end up pushing an element all the way to the end, which is not very perturbation-y + // if we don't skip, we might end up pushing an element all the way to + // the end, which is not very perturbation-y + i++; } } calculateFitness(ret); @@ -475,7 +522,8 @@ void CoalesceLocalsWithLearning::pickIndices(std::vector<Index>& indices) { std::cout << "[learning for " << getFunction()->name << "]\n"; #endif auto numVars = this->getFunction()->getNumVars(); - const int GENERATION_SIZE = std::min(Index(numVars * (numVars - 1)), Index(20)); + const int GENERATION_SIZE = + std::min(Index(numVars * (numVars - 1)), Index(20)); Generator generator(this); GeneticLearner<Order, double, Generator> learner(generator, GENERATION_SIZE); #ifdef CFG_LEARN_DEBUG @@ -486,7 +534,8 @@ void CoalesceLocalsWithLearning::pickIndices(std::vector<Index>& indices) { while (1) { learner.runGeneration(); auto newBest = learner.getBest()->getFitness(); - if (newBest == oldBest) break; // unlikely we can improve + if (newBest == oldBest) + break; // unlikely we can improve oldBest = newBest; #ifdef CFG_LEARN_DEBUG learner.getBest()->dump("current best"); @@ -495,16 +544,15 @@ void CoalesceLocalsWithLearning::pickIndices(std::vector<Index>& indices) { #ifdef CFG_LEARN_DEBUG learner.getBest()->dump("the best"); #endif - this->pickIndicesFromOrder(*learner.getBest(), indices); // TODO: cache indices in Orders, at the cost of more memory? + // TODO: cache indices in Orders, at the cost of more memory? + this->pickIndicesFromOrder(*learner.getBest(), indices); } // declare passes -Pass *createCoalesceLocalsPass() { - return new CoalesceLocals(); -} +Pass* createCoalesceLocalsPass() { return new CoalesceLocals(); } -Pass *createCoalesceLocalsWithLearningPass() { +Pass* createCoalesceLocalsWithLearningPass() { return new CoalesceLocalsWithLearning(); } |