wasm-reduce tool (#1139)

Reduce an interesting wasm to a smaller still interesting wasm. This takes an arbitrary command to run, and reduces the wasm as much as it can while keeping the behavior of that command fixed. This can be used to reduce compiler bugs in an arbitrary VM, etc.
author: Alon Zakai <alonzakai@gmail.com> 2017-09-01 12:34:03 -0700
committer: GitHub <noreply@github.com> 2017-09-01 12:34:03 -0700
commit: 6de14693110b4f898344ff1cb383caf0d74eb42e (patch)
tree: 6fbcb75fd3c2285191939a086989e14656908282 /src/tools/wasm-reduce.cpp
parent: b013f744e3d70effd9be348cbde7fb93f0a16c6a (diff)
download: binaryen-6de14693110b4f898344ff1cb383caf0d74eb42e.tar.gz
binaryen-6de14693110b4f898344ff1cb383caf0d74eb42e.tar.bz2
binaryen-6de14693110b4f898344ff1cb383caf0d74eb42e.zip
1 files changed, 661 insertions, 0 deletions
diff --git a/src/tools/wasm-reduce.cpp b/src/tools/wasm-reduce.cpp
new file mode 100644
index 000000000..355b70278
--- /dev/null
+++ b/src/tools/wasm-reduce.cpp
@@ -0,0 +1,661 @@
+/*
+ * Copyright 2017 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//
+// Tries to reduce the input wasm into the smallest possible wasm
+// that still generates the same result on a given command. This is
+// useful to reduce bug testcases, for example, if a file crashes
+// the optimizer, you can reduce it to find the smallest file that
+// also crashes it (which generally will show the same bug, in a
+// much more debuggable manner).
+//
+
+#include <memory>
+#include <cstdio>
+#include <cstdlib>
+
+#include "pass.h"
+#include "support/command-line.h"
+#include "support/file.h"
+#include "wasm-io.h"
+#include "wasm-builder.h"
+#include "ast/literal-utils.h"
+#include "wasm-validator.h"
+
+using namespace wasm;
+
+struct ProgramResult {
+  int code;
+  std::string output;
+
+  ProgramResult() {}
+  ProgramResult(std::string command) {
+    getFromExecution(command);
+  }
+
+  // runs the command and notes the output
+  // TODO: also stderr, not just stdout?
+  void getFromExecution(std::string command) {
+    // do this using just core stdio.h and stdlib.h, for portability
+    // sadly this requires two invokes
+    code = system(("timeout 2s " + command + " > /dev/null 2> /dev/null").c_str());
+    const int MAX_BUFFER = 1024;
+    char buffer[MAX_BUFFER];
+    FILE *stream = popen(("timeout 2s " + command + " 2> /dev/null").c_str(), "r");
+    while (fgets(buffer, MAX_BUFFER, stream) != NULL) {
+      output.append(buffer);
+    }
+    pclose(stream);
+  }
+
+  bool operator==(ProgramResult& other) {
+    return code == other.code && output == other.output;
+  }
+  bool operator!=(ProgramResult& other) {
+    return !(*this == other);
+  }
+
+  bool failed() {
+    return code != 0;
+  }
+
+  void dump(std::ostream& o) {
+    o << "[ProgramResult] code: " << code << " stdout: \n" << output << "[/ProgramResult]\n";
+  }
+};
+
+namespace std {
+
+inline std::ostream& operator<<(std::ostream& o, ProgramResult& result) {
+  result.dump(o);
+  return o;
+}
+
+}
+
+ProgramResult expected;
+
+struct Reducer : public WalkerPass<PostWalker<Reducer, UnifiedExpressionVisitor<Reducer>>> {
+  std::string command, test, working;
+  bool verbose, debugInfo;
+
+  // test is the file we write to that the command will operate on
+  // working is the current temporary state, the reduction so far
+  Reducer(std::string command, std::string test, std::string working, bool verbose, bool debugInfo) : command(command), test(test), working(working), verbose(verbose), debugInfo(debugInfo) {}
+
+  // runs passes in order to reduce, until we can't reduce any more
+  // the criterion here is wasm binary size
+  void reduceUsingPasses() {
+    // run optimization passes until we can't shrink it any more
+    std::vector<std::string> passes = {
+      "-Oz",
+      "-Os",
+      "-O1",
+      "-O2",
+      "-O3",
+      "--coalesce-locals --vacuum",
+      "--dce",
+      "--duplicate-function-elimination",
+      "--inlining",
+      "--inlining-optimizing",
+      "--optimize-level=3 --inlining-optimizing",
+      "--local-cse --vacuum",
+      "--memory-packing",
+      "--remove-unused-names --merge-blocks --vacuum",
+      "--optimize-instructions",
+      "--precompute",
+      "--remove-imports",
+      "--remove-memory",
+      "--remove-unused-names --remove-unused-brs",
+      "--remove-unused-module-elements",
+      "--reorder-functions",
+      "--reorder-locals",
+      "--simplify-locals --vacuum",
+      "--vacuum"
+    };
+    auto oldSize = file_size(working);
+    bool more = true;
+    while (more) {
+      //std::cerr << "|    starting passes loop iteration\n";
+      more = false;
+      // try both combining with a generic shrink (so minor pass overhead is compensated for), and without
+      for (auto shrinking : { false, true }) {
+        for (auto pass : passes) {
+          std::string currCommand = "bin/wasm-opt ";
+          if (shrinking) currCommand += " --dce --vacuum ";
+          currCommand += working + " -o " + test + " " + pass;
+          if (debugInfo) currCommand += " -g ";
+          if (verbose) std::cerr << "|    trying pass command: " << currCommand << "\n";
+          if (!ProgramResult(currCommand).failed()) {
+            auto newSize = file_size(test);
+            if (newSize < oldSize) {
+              // the pass didn't fail, and the size looks smaller, so promising
+              // see if it is still has the property we are preserving
+              if (ProgramResult(command) == expected) {
+                std::cerr << "|    command \"" << currCommand << "\" succeeded, reduced size to " << newSize << ", and preserved the property\n";
+                copy_file(test, working);
+                more = true;
+                oldSize = newSize;
+              }
+            }
+          }
+        }
+      }
+    }
+    if (verbose) std::cerr << "|    done with passes for now\n";
+  }
+
+  // does one pass of slow and destructive reduction. returns whether it
+  // succeeded or not
+  // the criterion here is a logical change in the program. this may actually
+  // increase wasm size in some cases, but it should allow more reduction later.
+  // @param factor how much to ignore. starting with a high factor skips through
+  //               most of the file, which is often faster than going one by one
+  //               from the start
+  size_t reduceDestructively(int factor_) {
+    factor = factor_;
+    Module wasm;
+    ModuleReader reader;
+    reader.read(working, wasm);
+    // prepare
+    reduced = 0;
+    builder = make_unique<Builder>(wasm);
+    funcsSeen = 0;
+    // before we do any changes, it should be valid to write out the module:
+    // size should be as expected, and output should be as expected
+    setModule(&wasm);
+    if (!writeAndTestReduction()) {
+      std::cerr << "\n|! WARNING: writing before destructive reduction fails, very unlikely reduction can work\n\n";
+    }
+    // destroy!
+    walkModule(&wasm);
+    return reduced;
+  }
+
+  // destructive reduction state
+
+  size_t reduced;
+  Expression* beforeReduction;
+  std::unique_ptr<Builder> builder;
+  Index funcsSeen;
+  int factor;
+
+  // write the module and see if the command still fails on it as expected
+  bool writeAndTestReduction() {
+    ProgramResult result;
+    return writeAndTestReduction(result);
+  }
+
+  bool writeAndTestReduction(ProgramResult& out) {
+    // write the module out
+    ModuleWriter writer;
+    writer.setBinary(true);
+    writer.setDebugInfo(debugInfo);
+    writer.write(*getModule(), test);
+    // note that it is ok for the destructively-reduced module to be bigger
+    // than the previous - each destructive reduction removes logical code,
+    // and so is strictly better, even if the wasm binary format happens to
+    // encode things slightly less efficiently.
+    // test it
+    out.getFromExecution(command);
+    return out == expected;
+  }
+
+  bool shouldTryToReduce(size_t bonus = 1) {
+    static size_t counter = 0;
+    counter += bonus;
+    return (counter % factor) <= bonus;
+  }
+
+  // tests a reduction on the current traversal node, and undos if it failed
+  bool tryToReplaceCurrent(Expression* with) {
+    auto* curr = getCurrent();
+    //std::cerr << "try " << curr << " => " << with << '\n';
+    if (curr->type != with->type) return false;
+    if (!shouldTryToReduce()) return false;
+    replaceCurrent(with);
+    if (!writeAndTestReduction()) {
+      replaceCurrent(curr);
+      return false;
+    }
+    std::cerr << "|      tryToReplaceCurrent succeeded (in " << getLocation() << ")\n";
+    noteReduction();
+    return true;
+  }
+
+  void noteReduction() {
+    reduced++;
+    copy_file(test, working);
+  }
+
+  // tests a reduction on an arbitrary child
+  bool tryToReplaceChild(Expression*& child, Expression* with) {
+    if (child->type != with->type) return false;
+    if (!shouldTryToReduce()) return false;
+    auto* before = child;
+    child = with;
+    if (!writeAndTestReduction()) {
+      child = before;
+      return false;
+    }
+    std::cerr << "|      tryToReplaceChild succeeded (in " << getLocation() << ")\n";
+    //std::cerr << "|      " << before << " => " << with << '\n';
+    noteReduction();
+    return true;
+  }
+
+  std::string getLocation() {
+    if (getFunction()) return getFunction()->name.str;
+    return "(non-function context)";
+  }
+
+  // visitors. in each we try to remove code in a destructive and nontrivial way.
+  // "nontrivial" means something that optimization passes can't achieve, since we
+  // don't need to duplicate work that they do
+
+  void visitExpression(Expression* curr) {
+    if (curr->type == none) {
+      if (tryToReduceCurrentToNone()) return;
+    } else if (isConcreteWasmType(curr->type)) {
+      if (tryToReduceCurrentToConst()) return;
+    } else {
+      assert(curr->type == unreachable);
+      if (tryToReduceCurrentToUnreachable()) return;
+    }
+    // specific reductions
+    if (auto* iff = curr->dynCast<If>()) {
+      if (iff->type == none) {
+        // perhaps we need just the condition?
+        if (tryToReplaceCurrent(builder->makeDrop(iff->condition))) {
+          return;
+        }
+      }
+      handleCondition(iff->condition);
+    } else if (auto* br = curr->dynCast<Break>()) {
+      handleCondition(br->condition);
+    } else if (auto* select = curr->dynCast<Select>()) {
+      handleCondition(select->condition);
+    } else if (auto* sw = curr->dynCast<Switch>()) {
+      handleCondition(sw->condition);
+    } else if (auto* set = curr->dynCast<SetLocal>()) {
+      if (set->isTee()) {
+        // maybe we don't need the set
+        tryToReplaceCurrent(set->value);
+      }
+    } else if (auto* unary = curr->dynCast<Unary>()) {
+      // maybe we can pass through
+      tryToReplaceCurrent(unary->value);
+    } else if (auto* binary = curr->dynCast<Binary>()) {
+      // maybe we can pass through
+      if (!tryToReplaceCurrent(binary->left)) {
+        tryToReplaceCurrent(binary->right);
+      }
+    } else if (auto* call = curr->dynCast<Call>()) {
+      handleCall(call);
+    } else if (auto* call = curr->dynCast<CallImport>()) {
+      handleCall(call);
+    } else if (auto* call = curr->dynCast<CallIndirect>()) {
+      if (tryToReplaceCurrent(call->target)) return;
+      handleCall(call);
+    }
+  }
+
+  void visitFunction(Function* curr) {
+    // extra chance to work on the function toplevel element, as if it can
+    // be reduced it's great
+    visitExpression(curr->body);
+    // finish function
+    funcsSeen++;
+    static int last = 0;
+    int percentage = (100 * funcsSeen) / getModule()->functions.size();
+    if (std::abs(percentage - last) >= 5) {
+      std::cerr << "|    " << percentage << "% of funcs complete\n";
+      last = percentage;
+    }
+  }
+
+  // TODO: bisection on segment shrinking?
+
+  void visitTable(Table* curr) {
+    std::cerr << "|    try to simplify table\n";
+    Name first;
+    for (auto& segment : curr->segments) {
+      for (auto item : segment.data) {
+        first = item;
+        break;
+      }
+      if (!first.isNull()) break;
+    }
+    visitSegmented(curr, first, 100);
+  }
+
+  void visitMemory(Memory* curr) {
+    std::cerr << "|    try to simplify memory\n";
+    visitSegmented(curr, 0, 2);
+  }
+
+  template<typename T, typename U>
+  void visitSegmented(T* curr, U zero, size_t bonus) {
+    // try to reduce to first function
+    // shrink segment elements
+    for (auto& segment : curr->segments) {
+      auto& data = segment.data;
+      size_t skip = 1; // when we succeed, try to shrink by more and more, similar to bisection
+      for (size_t i = 0; i < data.size() && !data.empty(); i++) {
+        if (!shouldTryToReduce(bonus)) continue;
+        auto save = data;
+        for (size_t j = 0; j < skip; j++) {
+          if (!data.empty()) data.pop_back();
+        }
+        if (writeAndTestReduction()) {
+          std::cerr << "|      shrank segment (skip: " << skip << ")\n";
+          noteReduction();
+          skip = std::min(size_t(factor), 2 * skip);
+        } else {
+          data = save;
+          break;
+        }
+      }
+    }
+    // the "opposite" of shrinking: copy a 'zero' element
+    for (auto& segment : curr->segments) {
+      if (segment.data.empty()) continue;
+      for (auto& item : segment.data) {
+        if (!shouldTryToReduce(bonus)) continue;
+        if (item == zero) continue;
+        auto save = item;
+        item = zero;
+        if (writeAndTestReduction()) {
+          std::cerr << "|      zeroed segment\n";
+          noteReduction();
+        } else {
+          item = save;
+        }
+      }
+    }
+  }
+
+  void visitModule(Module* curr) {
+    // try to remove exports
+    std::cerr << "|    try to remove exports\n";
+    std::vector<Export> exports;
+    for (auto& exp : curr->exports) {
+      if (!shouldTryToReduce(10000)) continue;
+      exports.push_back(*exp);
+    }
+    for (auto& exp : exports) {
+      curr->removeExport(exp.name);
+      if (!writeAndTestReduction()) {
+        curr->addExport(new Export(exp));
+      } else {
+        std::cerr << "|      removed export " << exp.name << '\n';
+        noteReduction();
+      }
+    }
+    // try to remove functions
+    std::cerr << "|    try to remove functions\n";
+    std::vector<Function> functions;
+    for (auto& func : curr->functions) {
+      if (!shouldTryToReduce(10000)) continue;
+      functions.push_back(*func);
+    }
+    for (auto& func : functions) {
+      curr->removeFunction(func.name);
+      WasmValidator validator;
+      validator.quiet = true;
+      if (validator.validate(*curr) && writeAndTestReduction()) {
+        std::cerr << "|      removed function " << func.name << '\n';
+        noteReduction();
+      } else {
+        curr->addFunction(new Function(func));
+      }
+    }
+  }
+
+  // helpers
+
+  // try to replace condition with always true and always false
+  void handleCondition(Expression*& condition) {
+    if (!condition) return;
+    if (condition->is<Const>()) return;
+    auto* c = builder->makeConst(Literal(int32_t(0)));
+    if (!tryToReplaceChild(condition, c)) {
+      c->value = Literal(int32_t(1));
+      tryToReplaceChild(condition, c);
+    }
+  }
+
+  template<typename T>
+  void handleCall(T* call) {
+    for (auto* op : call->operands) {
+      if (tryToReplaceCurrent(op)) return;
+    }
+  }
+
+  bool tryToReduceCurrentToNone() {
+    auto* curr = getCurrent();
+    if (curr->is<Nop>()) return false;
+    // try to replace with a trivial value
+    Nop nop;
+    if (tryToReplaceCurrent(&nop)) {
+      replaceCurrent(builder->makeNop());
+      return true;
+    }
+    return false;
+  }
+
+  // try to replace a concrete value with a trivial constant
+  bool tryToReduceCurrentToConst() {
+    auto* curr = getCurrent();
+    if (curr->is<Const>()) return false;
+    // try to replace with a trivial value
+    Const* c = builder->makeConst(Literal(int32_t(0)));
+    if (tryToReplaceCurrent(c)) return true;
+    c->value = LiteralUtils::makeLiteralFromInt32(1, curr->type);
+    c->type = curr->type;
+    return tryToReplaceCurrent(c);
+  }
+
+  bool tryToReduceCurrentToUnreachable() {
+    auto* curr = getCurrent();
+    if (curr->is<Unreachable>()) return false;
+    // try to replace with a trivial value
+    Unreachable un;
+    if (tryToReplaceCurrent(&un)) {
+      replaceCurrent(builder->makeUnreachable());
+      return true;
+    }
+    // maybe a return? TODO
+    return false;
+  }
+};
+
+//
+// main
+//
+
+int main(int argc, const char* argv[]) {
+  std::string input, test, working, command;
+  bool verbose = false,
+       debugInfo = false,
+       force = false;
+  Options options("wasm-reduce", "Reduce a wasm file to a smaller one that has the same behavior on a given command");
+  options
+      .add("--command", "-cmd", "The command to run on the test, that we want to reduce while keeping the command's output identical. "
+                                "We look at the command's return code and stdout here (TODO: stderr), "
+                                "and we reduce while keeping those unchanged.",
+           Options::Arguments::One,
+           [&](Options* o, const std::string& argument) {
+             command = argument;
+           })
+      .add("--test", "-t", "Test file (this will be written to to test, the given command should read it when we call it)",
+           Options::Arguments::One,
+           [&](Options* o, const std::string& argument) {
+             test = argument;
+           })
+      .add("--working", "-w", "Working file (this will contain the current good state while doing temporary computations, "
+                              "and will contain the final best result at the end)",
+           Options::Arguments::One,
+           [&](Options* o, const std::string& argument) {
+             working = argument;
+           })
+      .add("--verbose", "-v", "Verbose output mode",
+           Options::Arguments::Zero,
+           [&](Options* o, const std::string& argument) {
+             verbose = true;
+           })
+      .add("--debugInfo", "-g", "Keep debug info in binaries",
+           Options::Arguments::Zero,
+           [&](Options* o, const std::string& argument) {
+             debugInfo = true;
+           })
+      .add("--force", "-f", "Force the reduction attempt, ignoring problems that imply it is unlikely to succeed",
+           Options::Arguments::Zero,
+           [&](Options* o, const std::string& argument) {
+             force = true;
+           })
+      .add_positional("INFILE", Options::Arguments::One,
+                      [&](Options* o, const std::string& argument) {
+                        input = argument;
+                      });
+  options.parse(argc, argv);
+
+  if (test.size() == 0) Fatal() << "test file not provided\n";
+  if (working.size() == 0) Fatal() << "working file not provided\n";
+
+  std::cerr << "|input: " << input << '\n';
+  std::cerr << "|test: " << test << '\n';
+  std::cerr << "|working: " << working << '\n';
+
+  // get the expected output
+  copy_file(input, test);
+  expected.getFromExecution(command);
+
+  std::cerr << "|expected result:\n" << expected << '\n';
+
+  auto stopIfNotForced = [&](std::string message, ProgramResult& result) {
+    std::cerr << "|! " << message << '\n' << result << '\n';
+    if (!force) {
+      Fatal() << "|! stopping, as it is very unlikely reduction can succeed (use -f to ignore this check)";
+    }
+  };
+
+  std::cerr << "|checking that command has different behavior on invalid binary\n";
+  {
+    {
+      std::ofstream dst(test, std::ios::binary);
+      dst << "waka waka\n";
+    }
+    ProgramResult result(command);
+    if (result == expected) {
+      stopIfNotForced("running command on an invalid module should give different results", result);
+    }
+  }
+
+  std::cerr << "|checking that command has expected behavior on canonicalized (read-written) binary\n";
+  {
+    // read and write it
+    ProgramResult readWrite(std::string("bin/wasm-opt ") + input + " -o " + test);
+    if (readWrite.failed()) {
+      stopIfNotForced("failed to read and write the binary", readWrite);
+    } else {
+      ProgramResult result(command);
+      if (result != expected) {
+        stopIfNotForced("running command on the canonicalized module should give the same results", result);
+      }
+    }
+  }
+
+  copy_file(input, working);
+  std::cerr << "|input size: " << file_size(working) << "\n";
+
+  std::cerr << "|starting reduction!\n";
+
+  int factor = 4096;
+  size_t lastDestructiveReductions = 0;
+  size_t lastPostPassesSize = 0;
+
+  bool stopping = false;
+
+  while (1) {
+    Reducer reducer(command, test, working, verbose, debugInfo);
+
+    // run binaryen optimization passes to reduce. passes are fast to run
+    // and can often reduce large amounts of code efficiently, as opposed
+    // to detructive reduction (i.e., that doesn't preserve correctness as
+    // passes do) since destrucive must operate one change at a time
+    std::cerr << "|  reduce using passes...\n";
+    auto oldSize = file_size(working);
+    reducer.reduceUsingPasses();
+    auto newSize = file_size(working);
+    auto passProgress = oldSize - newSize;
+    std::cerr << "|  after pass reduction: " << newSize << "\n";
+
+    // always stop after a pass reduction attempt, for final cleanup
+    if (stopping) break;
+
+    // check if the full cycle (destructive/passes) has helped or not
+    if (lastPostPassesSize && newSize >= lastPostPassesSize) {
+      std::cerr << "|  progress has stopped, skipping to the end\n";
+      if (factor == 1) {
+        // this is after doing work with factor 1, so after the remaining work, stop
+        stopping = true;
+      } else {
+        // just try to remove all we can and finish up
+        factor = 1;
+      }
+    }
+    lastPostPassesSize = newSize;
+
+    // if destructive reductions lead to useful proportionate pass reductions, keep
+    // going at the same factor, as pass reductions are far faster
+    std::cerr << "|  pass progress: " << passProgress << ", last destructive: " << lastDestructiveReductions << '\n';
+    if (passProgress >= 4 * lastDestructiveReductions) {
+      // don't change
+      std::cerr << "|  progress is good, do not quickly decrease factor\n";
+    } else {
+      if (factor > 10) {
+        factor = (factor / 3) + 1;
+      } else {
+        factor = (factor + 1) / 2; // stable on 1
+      }
+    }
+
+    // no point in a factor lorger than the size
+    assert(newSize > 4); // wasm modules are >4 bytes anyhow
+    factor = std::min(factor, int(newSize) / 4);
+
+    // try to reduce destructively. if a high factor fails to find anything,
+    // quickly try a lower one (no point in doing passes until we reduce
+    // destructively at least a little)
+    while (1) {
+      std::cerr << "|  reduce destructively... (factor: " << factor << ")\n";
+      lastDestructiveReductions = reducer.reduceDestructively(factor);
+      if (lastDestructiveReductions > 0) break;
+      // we failed to reduce destructively
+      if (factor == 1) {
+        stopping = true;
+        break;
+      }
+      factor = std::max(1, factor / 4); // quickly now, try to find *something* we can reduce
+    }
+
+    std::cerr << "|  destructive reduction led to size: " << file_size(working) << '\n';
+  }
+  std::cerr << "|finished, final size: " << file_size(working) << "\n";
+  copy_file(working, test); // just to avoid confusion
+}
+
author	Alon Zakai <alonzakai@gmail.com>	2017-09-01 12:34:03 -0700
committer	GitHub <noreply@github.com>	2017-09-01 12:34:03 -0700
commit	6de14693110b4f898344ff1cb383caf0d74eb42e (patch)
tree	6fbcb75fd3c2285191939a086989e14656908282 /src/tools/wasm-reduce.cpp
parent	b013f744e3d70effd9be348cbde7fb93f0a16c6a (diff)
download	binaryen-6de14693110b4f898344ff1cb383caf0d74eb42e.tar.gz binaryen-6de14693110b4f898344ff1cb383caf0d74eb42e.tar.bz2 binaryen-6de14693110b4f898344ff1cb383caf0d74eb42e.zip