summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAlon Zakai <azakai@google.com>2019-06-25 13:41:36 -0700
committerGitHub <noreply@github.com>2019-06-25 13:41:36 -0700
commit08e52f94db3737038bb01daca610504316e50fdb (patch)
tree56e377c982af88282d32687e4642ede08995b930 /src
parente63fe0b42d8a334a48c2988f3c05705ee04701f9 (diff)
downloadbinaryen-08e52f94db3737038bb01daca610504316e50fdb.tar.gz
binaryen-08e52f94db3737038bb01daca610504316e50fdb.tar.bz2
binaryen-08e52f94db3737038bb01daca610504316e50fdb.zip
Bysyncify: optimize better by coalescing before instrumenting control flow (#2183)
This results in better code sizes on many testcases, sometimes much better. For example, on SQLite the 150K function has only 27 locals instead of 3,874 which it had before (!). This also reduces total code size on SQLite by 15%. The key issue is that after instrumenting control flow we have a lot bigger live ranges. This must be done rather carefully, as we need to introduce some temp locals early on (for breaking up ifs, for call return values, etc.).
Diffstat (limited to 'src')
-rw-r--r--src/passes/Bysyncify.cpp166
1 files changed, 138 insertions, 28 deletions
diff --git a/src/passes/Bysyncify.cpp b/src/passes/Bysyncify.cpp
index 29b3cf242..e798be562 100644
--- a/src/passes/Bysyncify.cpp
+++ b/src/passes/Bysyncify.cpp
@@ -236,9 +236,54 @@ enum class DataOffset { BStackPos = 0, BStackEnd = 4 };
const auto STACK_ALIGN = 4;
+// A helper class for managing fake global names. Creates the globals and
+// provides mappings for using them.
+class GlobalHelper {
+ Module& module;
+
+public:
+ GlobalHelper(Module& module) : module(module) {
+ map[i32] = "bysyncify_fake_call_global_i32";
+ map[i64] = "bysyncify_fake_call_global_i64";
+ map[f32] = "bysyncify_fake_call_global_f32";
+ map[f64] = "bysyncify_fake_call_global_f64";
+ Builder builder(module);
+ for (auto& pair : map) {
+ auto type = pair.first;
+ auto name = pair.second;
+ rev[name] = type;
+ module.addGlobal(builder.makeGlobal(
+ name, type, LiteralUtils::makeZero(type, module), Builder::Mutable));
+ }
+ }
+
+ ~GlobalHelper() {
+ for (auto& pair : map) {
+ auto name = pair.second;
+ module.removeGlobal(name);
+ }
+ }
+
+ Name getName(Type type) { return map.at(type); }
+
+ Type getTypeOrNone(Name name) {
+ auto iter = rev.find(name);
+ if (iter != rev.end()) {
+ return iter->second;
+ }
+ return none;
+ }
+
+private:
+ std::map<Type, Name> map;
+ std::map<Name, Type> rev;
+};
+
// Analyze the entire module to see which calls may change the state, that
// is, start an unwind or rewind), either in itself or in something called
// by it.
+// Handles global module management, needed from the various parts of this
+// transformation.
class ModuleAnalyzer {
Module& module;
bool canIndirectChangeState;
@@ -268,7 +313,8 @@ public:
ModuleAnalyzer(Module& module,
std::function<bool(Name, Name)> canImportChangeState,
bool canIndirectChangeState)
- : module(module), canIndirectChangeState(canIndirectChangeState) {
+ : module(module), canIndirectChangeState(canIndirectChangeState),
+ globals(module) {
// Scan to see which functions can directly change the state.
// Also handle the bysyncify imports, removing them (as we will implement
// them later), and replace calls to them with calls to the later proper
@@ -429,6 +475,8 @@ public:
}
return walker.canChangeState;
}
+
+ GlobalHelper globals;
};
// Checks if something performs a call: either a direct or indirect call,
@@ -487,8 +535,9 @@ struct BysyncifyFlow : public Pass {
BysyncifyFlow(ModuleAnalyzer* analyzer) : analyzer(analyzer) {}
void
- runOnFunction(PassRunner* runner, Module* module_, Function* func) override {
+ runOnFunction(PassRunner* runner, Module* module_, Function* func_) override {
module = module_;
+ func = func_;
// If the function cannot change our state, we have nothing to do -
// we will never unwind or rewind the stack here.
if (!analyzer->needsInstrumentation(func)) {
@@ -518,6 +567,7 @@ private:
std::unique_ptr<BysyncifyBuilder> builder;
Module* module;
+ Function* func;
// Each call in the function has an index, noted during unwind and checked
// during rewind.
@@ -527,12 +577,15 @@ private:
if (!analyzer->canChangeState(curr)) {
return makeMaybeSkip(curr);
}
- // The IR is in flat form, which makes this much simpler. We basically
- // need to add skips to avoid code when rewinding, and checks around calls
- // with unwinding/rewinding support.
+ // The IR is in flat form, which makes this much simpler: there are no
+ // unnecessarily nested side effects or control flow, so we can add
+ // skips for rewinding in an easy manner, putting a single if around
+ // whole chunks of code. Also calls are separated out so that it is easy
+ // to add the necessary checks for them for unwinding/rewinding support.
//
// Aside from that, we must "linearize" all control flow so that we can
- // reach the right part when unwinding. For example, for an if we do this:
+ // reach the right part when rewinding, which is done by always skipping
+ // forward. For example, for an if we do this:
//
// if (condition()) {
// side1();
@@ -595,25 +648,28 @@ private:
// must be in one of the children.
assert(!analyzer->canChangeState(iff->condition));
// We must linearize this, which means we pass through both arms if we
- // are rewinding. This is pretty simple as in flat form the if condition
- // is either a const or a get, so easily copyable.
- // Start with the first arm, for which we reuse the original if.
- auto* otherArm = iff->ifFalse;
- iff->ifFalse = nullptr;
- auto* originalCondition = iff->condition;
+ // are rewinding.
+ if (!iff->ifFalse) {
+ iff->condition = builder->makeBinary(
+ OrInt32, iff->condition, builder->makeStateCheck(State::Rewinding));
+ iff->ifTrue = process(iff->ifTrue);
+ iff->finalize();
+ return iff;
+ }
+ auto conditionTemp = builder->addVar(func, i32);
+ iff->condition = builder->makeLocalTee(conditionTemp, iff->condition);
iff->condition = builder->makeBinary(
- OrInt32, originalCondition, builder->makeStateCheck(State::Rewinding));
+ OrInt32, iff->condition, builder->makeStateCheck(State::Rewinding));
iff->ifTrue = process(iff->ifTrue);
+ auto* otherArm = iff->ifFalse;
+ iff->ifFalse = nullptr;
iff->finalize();
- if (!otherArm) {
- return iff;
- }
// Add support for the second arm as well.
auto* otherIf = builder->makeIf(
builder->makeBinary(
OrInt32,
- builder->makeUnary(
- EqZInt32, ExpressionManipulator::copy(originalCondition, *module)),
+ builder->makeUnary(EqZInt32,
+ builder->makeLocalGet(conditionTemp, i32)),
builder->makeStateCheck(State::Rewinding)),
process(otherArm));
otherIf->finalize();
@@ -640,19 +696,39 @@ private:
// change the state
assert(doesCall(curr));
assert(curr->type == none);
+ // The case of a set is tricky: we must *not* execute the set when
+ // unwinding, since at that point we have a fake value for the return,
+ // and if we applied it to the local, it would end up saved and then
+ // potentially used later - and with coalescing, that may interfere
+ // with other things. Note also that at this point in the process we
+ // have not yet written the load saving/loading code, so the optimizer
+ // doesn't see that locals will have another use at the beginning and
+ // end of the function. We don't do that yet because we don't want it
+ // to force the final number of locals to be too high, but we also
+ // must be careful to never touch a local unnecessarily to avoid bugs.
+ // To implement this, write to a fake global; we'll fix it up after
+ // BysyncifyLocals locals adds local saving/restoring.
+ auto* set = curr->dynCast<LocalSet>();
+ if (set) {
+ auto name = analyzer->globals.getName(set->value->type);
+ curr = builder->makeGlobalSet(name, set->value);
+ set->value = builder->makeGlobalGet(name, set->value->type);
+ }
+ // Instrument the call itself (or, if a drop, the drop+call).
auto index = callIndex++;
// Execute the call, if either normal execution, or if rewinding and this
// is the right call to go into.
// TODO: we can read the next call index once in each function (but should
// avoid saving/restoring that local later)
- return builder->makeIf(
+ curr = builder->makeIf(
builder->makeIf(builder->makeStateCheck(State::Normal),
builder->makeConst(Literal(int32_t(1))),
makeCallIndexPeek(index)),
- builder->makeSequence(curr, makePossibleUnwind(index)));
+ builder->makeSequence(curr, makePossibleUnwind(index, set)));
+ return curr;
}
- Expression* makePossibleUnwind(Index index) {
+ Expression* makePossibleUnwind(Index index, Expression* ifNotUnwinding) {
// In this pass we emit an "unwind" as a call to a fake intrinsic. We
// will implement it in the later pass. (We can't create the unwind block
// target here, as the optimizer would remove it later; we can only add
@@ -660,7 +736,8 @@ private:
return builder->makeIf(
builder->makeStateCheck(State::Unwinding),
builder->makeCall(
- BYSYNCIFY_UNWIND, {builder->makeConst(Literal(int32_t(index)))}, none));
+ BYSYNCIFY_UNWIND, {builder->makeConst(Literal(int32_t(index)))}, none),
+ ifNotUnwinding);
}
Expression* makeCallIndexPeek(Index index) {
@@ -707,6 +784,29 @@ struct BysyncifyLocals : public WalkerPass<PostWalker<BysyncifyLocals>> {
}
}
+ void visitGlobalSet(GlobalSet* curr) {
+ auto type = analyzer->globals.getTypeOrNone(curr->name);
+ if (type != none) {
+ replaceCurrent(
+ builder->makeLocalSet(getFakeCallLocal(type), curr->value));
+ }
+ }
+
+ void visitGlobalGet(GlobalGet* curr) {
+ auto type = analyzer->globals.getTypeOrNone(curr->name);
+ if (type != none) {
+ replaceCurrent(builder->makeLocalGet(getFakeCallLocal(type), type));
+ }
+ }
+
+ Index getFakeCallLocal(Type type) {
+ auto iter = fakeCallLocals.find(type);
+ if (iter != fakeCallLocals.end()) {
+ return iter->second;
+ }
+ return fakeCallLocals[type] = builder->addVar(getFunction(), type);
+ }
+
void doWalkFunction(Function* func) {
// If the function cannot change our state, we have nothing to do -
// we will never unwind or rewind the stack here.
@@ -763,6 +863,7 @@ private:
Index rewindIndex;
Index numPreservableLocals;
+ std::map<Type, Index> fakeCallLocals;
Expression* makeLocalLoading() {
if (numPreservableLocals == 0) {
@@ -884,13 +985,22 @@ struct Bysyncify : public Pass {
PassRunner runner(module);
runner.add("flatten");
// Dce is useful here, since BysyncifyFlow makes control flow conditional,
- // which may make unreachable code look reachable. We also do some other
- // minimal optimization here in an unconditional way here, to counteract
- // the flattening.
+ // which may make unreachable code look reachable. It also lets us ignore
+ // unreachable code here.
runner.add("dce");
- runner.add("simplify-locals-nonesting");
- runner.add("merge-blocks");
- runner.add("vacuum");
+ if (optimize) {
+ // Optimizing before BsyncifyFlow is crucial, especially coalescing,
+ // because the flow changes add many branches, break up if-elses, etc.,
+ // all of which extend the live ranges of locals. In other words, it is
+ // not possible to coalesce well afterwards.
+ runner.add("simplify-locals-nonesting");
+ runner.add("reorder-locals");
+ runner.add("coalesce-locals");
+ runner.add("simplify-locals-nonesting");
+ runner.add("reorder-locals");
+ runner.add("merge-blocks");
+ runner.add("vacuum");
+ }
runner.add<BysyncifyFlow>(&analyzer);
runner.setIsNested(true);
runner.setValidateGlobally(false);