diff options
Diffstat (limited to 'src')
80 files changed, 7196 insertions, 2405 deletions
diff --git a/src/abi/js.h b/src/abi/js.h new file mode 100644 index 000000000..bcc7dbb6e --- /dev/null +++ b/src/abi/js.h @@ -0,0 +1,43 @@ +/* + * Copyright 2018 WebAssembly Community Group participants + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef wasm_abi_abi_h +#define wasm_abi_abi_h + +#include "wasm.h" + +namespace wasm { + +namespace ABI { + +enum class LegalizationLevel { + Full = 0, + Minimal = 1 +}; + +inline std::string getLegalizationPass(LegalizationLevel level) { + if (level == LegalizationLevel::Full) { + return "legalize-js-interface"; + } else { + return "legalize-js-interface-minimally"; + } +} + +} // namespace ABI + +} // namespace wasm + +#endif // wasm_abi_abi_h diff --git a/src/abi/stack.h b/src/abi/stack.h index e43be07ec..77e166c2a 100644 --- a/src/abi/stack.h +++ b/src/abi/stack.h @@ -39,7 +39,7 @@ inline Index stackAlign(Index size) { // Allocate some space on the stack, and assign it to a local. // The local will have the same constant value in all the function, so you can just -// get_local it anywhere there. +// local.get it anywhere there. 
inline void getStackSpace(Index local, Function* func, Index size, Module& wasm) { auto* stackPointer = GlobalUtils::getGlobalInitializedToImport(wasm, ENV, "STACKTOP"); if (!stackPointer) { diff --git a/src/asm2wasm.h b/src/asm2wasm.h index dadb9bee3..d958c4b6a 100644 --- a/src/asm2wasm.h +++ b/src/asm2wasm.h @@ -41,6 +41,7 @@ #include "wasm-builder.h" #include "wasm-emscripten.h" #include "wasm-module-building.h" +#include "abi/js.h" namespace wasm { @@ -1452,9 +1453,10 @@ void Asm2WasmBuilder::processAsm(Ref ast) { // finalizeCalls also does autoDrop, which is crucial for the non-optimizing case, // so that the output of the first pass is valid passRunner.add<FinalizeCalls>(this); - if (legalizeJavaScriptFFI) { - passRunner.add("legalize-js-interface"); - } + passRunner.add(ABI::getLegalizationPass( + legalizeJavaScriptFFI ? ABI::LegalizationLevel::Full + : ABI::LegalizationLevel::Minimal + )); if (runOptimizationPasses) { // autodrop can add some garbage passRunner.add("vacuum"); @@ -1682,7 +1684,7 @@ Function* Asm2WasmBuilder::processFunction(Ref ast) { Fatal() << "error: access of a non-existent global var " << name.str; } auto* ret = builder.makeSetGlobal(name, process(assign->value())); - // set_global does not return; if our value is trivially not used, don't emit a load (if nontrivially not used, opts get it later) + // global.set does not return; if our value is trivially not used, don't emit a load (if nontrivially not used, opts get it later) auto parent = astStackHelper.getParent(); if (!parent || parent->isArray(BLOCK) || parent->isArray(IF)) return ret; return builder.makeSequence(ret, builder.makeGetGlobal(name, ret->value->type)); diff --git a/src/binaryen-c.cpp b/src/binaryen-c.cpp index 88a3d033e..80f8d8276 100644 --- a/src/binaryen-c.cpp +++ b/src/binaryen-c.cpp @@ -49,7 +49,10 @@ BinaryenLiteral toBinaryenLiteral(Literal x) { case Type::i64: ret.i64 = x.geti64(); break; case Type::f32: ret.i32 = x.reinterpreti32(); break; case Type::f64: 
ret.i64 = x.reinterpreti64(); break; - case Type::v128: assert(false && "v128 not implemented yet"); + case Type::v128: { + memcpy(&ret.v128, x.getv128Ptr(), 16); + break; + } case Type::none: case Type::unreachable: WASM_UNREACHABLE(); } @@ -62,6 +65,7 @@ Literal fromBinaryenLiteral(BinaryenLiteral x) { case Type::i64: return Literal(x.i64); case Type::f32: return Literal(x.i32).castToF32(); case Type::f64: return Literal(x.i64).castToF64(); + case Type::v128: return Literal(x.v128); case Type::none: case Type::unreachable: WASM_UNREACHABLE(); } @@ -82,9 +86,9 @@ static PassOptions globalPassOptions = PassOptions::getWithDefaultOptimizationOp static int tracing = 0; -void traceNameOrNULL(const char* name) { - if (name) std::cout << "\"" << name << "\""; - else std::cout << "NULL"; +void traceNameOrNULL(const char* name, std::ostream &out = std::cout) { + if (name) out << "\"" << name << "\""; + else out << "NULL"; } std::map<BinaryenFunctionTypeRef, size_t> functionTypes; @@ -101,14 +105,19 @@ size_t noteExpression(BinaryenExpressionRef expression) { return id; } +std::string getTemp() { + static size_t n = 0; + return "t" + std::to_string(n++); +} + template<typename T> -void printArg(T arg) { - std::cout << arg; +void printArg(std::ostream &setup, std::ostream& out, T arg) { + out << arg; } template<> -void printArg(void* arg) { - std::cout << "expressions[" << expressions[arg] << "]"; +void printArg(std::ostream &setup, std::ostream& out, BinaryenExpressionRef arg) { + out << "expressions[" << expressions[arg] << "]"; } struct StringLit { @@ -117,60 +126,83 @@ struct StringLit { }; template<> -void printArg(StringLit arg) { - traceNameOrNULL(arg.name); +void printArg(std::ostream &setup, std::ostream& out, StringLit arg) { + traceNameOrNULL(arg.name, out); } template<> -void printArg(BinaryenType arg) { +void printArg(std::ostream &setup, std::ostream &out, BinaryenType arg) { if (arg == BinaryenTypeAuto()) { - std::cout << "BinaryenTypeAuto()"; + out << 
"BinaryenTypeAuto()"; } else { - std::cout << arg; + out << arg; } } template<> -void printArg(BinaryenLiteral arg) { +void printArg(std::ostream &setup, std::ostream &out, BinaryenLiteral arg) { switch (arg.type) { - case Type::i32: std::cout << "BinaryenLiteralInt32(" << arg.i32 << ")"; break; - case Type::i64: std::cout << "BinaryenLiteralInt64(" << arg.i64 << ")"; break; + case Type::i32: out << "BinaryenLiteralInt32(" << arg.i32 << ")"; break; + case Type::i64: out << "BinaryenLiteralInt64(" << arg.i64 << ")"; break; case Type::f32: if (std::isnan(arg.f32)) { - std::cout << "BinaryenLiteralFloat32(NAN)"; break; + out << "BinaryenLiteralFloat32(NAN)"; break; } else { - std::cout << "BinaryenLiteralFloat32(" << arg.f32 << ")"; break; + out << "BinaryenLiteralFloat32(" << arg.f32 << ")"; break; } case Type::f64: if (std::isnan(arg.f64)) { - std::cout << "BinaryenLiteralFloat64(NAN)"; break; + out << "BinaryenLiteralFloat64(NAN)"; break; } else { - std::cout << "BinaryenLiteralFloat64(" << arg.f64 << ")"; break; + out << "BinaryenLiteralFloat64(" << arg.f64 << ")"; break; + } + case Type::v128: { + std::string array = getTemp(); + setup << "uint8_t " << array << "[] = {"; + for (size_t i = 0; i < 16; ++i) { + setup << int(arg.v128[i]); + if (i < 15) { + setup << ", "; + } } - case Type::v128: + setup << "};\n"; + out << "BinaryenLiteralVec128(" << array << ")"; + break; + } case Type::none: case Type::unreachable: WASM_UNREACHABLE(); } } template<typename T> -void traceArgs(T arg) { - printArg(arg); +void traceArgs(std::ostream &setup, std::ostream &out, T arg) { + printArg(setup, out, arg); } template<typename T, typename S, typename ...Ts> -void traceArgs(T arg, S next, Ts... rest) { - printArg(arg); - std::cout << ", "; - traceArgs(next, rest...); +void traceArgs(std::ostream &setup, std::ostream &out, T arg, S next, Ts... 
rest) { + printArg(setup, out, arg); + out << ", "; + traceArgs(setup, out, next, rest...); } template<typename ...Ts> void traceExpression(BinaryenExpressionRef expr, const char* constructor, Ts... args) { auto id = noteExpression(expr); - std::cout << " expressions[" << id << "] = " << constructor << "("; - traceArgs("the_module", args...); - std::cout << ");\n"; + std::stringstream setup, out; + out << "expressions[" << id << "] = " << constructor << "("; + traceArgs(setup, out, "the_module", args...); + out << ");\n"; + if (!setup.str().empty()) { + std::cout << " {\n"; + for (std::string line; getline(setup, line);) { + std::cout << " " << line << "\n"; + } + std::cout << " " << out.str(); + std::cout << " }\n"; + } else { + std::cout << " " << out.str(); + } } extern "C" { @@ -226,6 +258,11 @@ BinaryenExpressionId BinaryenAtomicCmpxchgId(void) { return Expression::Id::Atom BinaryenExpressionId BinaryenAtomicRMWId(void) { return Expression::Id::AtomicRMWId; } BinaryenExpressionId BinaryenAtomicWaitId(void) { return Expression::Id::AtomicWaitId; } BinaryenExpressionId BinaryenAtomicWakeId(void) { return Expression::Id::AtomicWakeId; } +BinaryenExpressionId BinaryenSIMDExtractId(void) { return Expression::Id::SIMDExtractId; } +BinaryenExpressionId BinaryenSIMDReplaceId(void) { return Expression::Id::SIMDReplaceId; } +BinaryenExpressionId BinaryenSIMDShuffleId(void) { return Expression::Id::SIMDShuffleId; } +BinaryenExpressionId BinaryenSIMDBitselectId(void) { return Expression::Id::SIMDBitselectId; } +BinaryenExpressionId BinaryenSIMDShiftId(void) { return Expression::Id::SIMDShiftId; } // External kinds @@ -325,6 +362,7 @@ BinaryenLiteral BinaryenLiteralInt32(int32_t x) { return toBinaryenLiteral(Liter BinaryenLiteral BinaryenLiteralInt64(int64_t x) { return toBinaryenLiteral(Literal(x)); } BinaryenLiteral BinaryenLiteralFloat32(float x) { return toBinaryenLiteral(Literal(x)); } BinaryenLiteral BinaryenLiteralFloat64(double x) { return 
toBinaryenLiteral(Literal(x)); } +BinaryenLiteral BinaryenLiteralVec128(const uint8_t x[16]) { return toBinaryenLiteral(Literal(x)); } BinaryenLiteral BinaryenLiteralFloat32Bits(int32_t x) { return toBinaryenLiteral(Literal(x).castToF32()); } BinaryenLiteral BinaryenLiteralFloat64Bits(int64_t x) { return toBinaryenLiteral(Literal(x).castToF64()); } @@ -474,6 +512,141 @@ BinaryenOp BinaryenTruncSatSFloat64ToInt32(void) { return TruncSatSFloat64ToInt3 BinaryenOp BinaryenTruncSatSFloat64ToInt64(void) { return TruncSatSFloat64ToInt64; } BinaryenOp BinaryenTruncSatUFloat64ToInt32(void) { return TruncSatUFloat64ToInt32; } BinaryenOp BinaryenTruncSatUFloat64ToInt64(void) { return TruncSatUFloat64ToInt64; } +BinaryenOp BinaryenSplatVecI8x16(void) { return SplatVecI8x16; } +BinaryenOp BinaryenExtractLaneSVecI8x16(void) { return ExtractLaneSVecI8x16; } +BinaryenOp BinaryenExtractLaneUVecI8x16(void) { return ExtractLaneUVecI8x16; } +BinaryenOp BinaryenReplaceLaneVecI8x16(void) { return ReplaceLaneVecI8x16; } +BinaryenOp BinaryenSplatVecI16x8(void) { return SplatVecI16x8; } +BinaryenOp BinaryenExtractLaneSVecI16x8(void) { return ExtractLaneSVecI16x8; } +BinaryenOp BinaryenExtractLaneUVecI16x8(void) { return ExtractLaneUVecI16x8; } +BinaryenOp BinaryenReplaceLaneVecI16x8(void) { return ReplaceLaneVecI16x8; } +BinaryenOp BinaryenSplatVecI32x4(void) { return SplatVecI32x4; } +BinaryenOp BinaryenExtractLaneVecI32x4(void) { return ExtractLaneVecI32x4; } +BinaryenOp BinaryenReplaceLaneVecI32x4(void) { return ReplaceLaneVecI32x4; } +BinaryenOp BinaryenSplatVecI64x2(void) { return SplatVecI64x2; } +BinaryenOp BinaryenExtractLaneVecI64x2(void) { return ExtractLaneVecI64x2; } +BinaryenOp BinaryenReplaceLaneVecI64x2(void) { return ReplaceLaneVecI64x2; } +BinaryenOp BinaryenSplatVecF32x4(void) { return SplatVecF32x4; } +BinaryenOp BinaryenExtractLaneVecF32x4(void) { return ExtractLaneVecF32x4; } +BinaryenOp BinaryenReplaceLaneVecF32x4(void) { return ReplaceLaneVecF32x4; } +BinaryenOp 
BinaryenSplatVecF64x2(void) { return SplatVecF64x2; } +BinaryenOp BinaryenExtractLaneVecF64x2(void) { return ExtractLaneVecF64x2; } +BinaryenOp BinaryenReplaceLaneVecF64x2(void) { return ReplaceLaneVecF64x2; } +BinaryenOp BinaryenEqVecI8x16(void) { return EqVecI8x16; } +BinaryenOp BinaryenNeVecI8x16(void) { return NeVecI8x16; } +BinaryenOp BinaryenLtSVecI8x16(void) { return LtSVecI8x16; } +BinaryenOp BinaryenLtUVecI8x16(void) { return LtUVecI8x16; } +BinaryenOp BinaryenGtSVecI8x16(void) { return GtSVecI8x16; } +BinaryenOp BinaryenGtUVecI8x16(void) { return GtUVecI8x16; } +BinaryenOp BinaryenLeSVecI8x16(void) { return LeSVecI8x16; } +BinaryenOp BinaryenLeUVecI8x16(void) { return LeUVecI8x16; } +BinaryenOp BinaryenGeSVecI8x16(void) { return GeSVecI8x16; } +BinaryenOp BinaryenGeUVecI8x16(void) { return GeUVecI8x16; } +BinaryenOp BinaryenEqVecI16x8(void) { return EqVecI16x8; } +BinaryenOp BinaryenNeVecI16x8(void) { return NeVecI16x8; } +BinaryenOp BinaryenLtSVecI16x8(void) { return LtSVecI16x8; } +BinaryenOp BinaryenLtUVecI16x8(void) { return LtUVecI16x8; } +BinaryenOp BinaryenGtSVecI16x8(void) { return GtSVecI16x8; } +BinaryenOp BinaryenGtUVecI16x8(void) { return GtUVecI16x8; } +BinaryenOp BinaryenLeSVecI16x8(void) { return LeSVecI16x8; } +BinaryenOp BinaryenLeUVecI16x8(void) { return LeUVecI16x8; } +BinaryenOp BinaryenGeSVecI16x8(void) { return GeSVecI16x8; } +BinaryenOp BinaryenGeUVecI16x8(void) { return GeUVecI16x8; } +BinaryenOp BinaryenEqVecI32x4(void) { return EqVecI32x4; } +BinaryenOp BinaryenNeVecI32x4(void) { return NeVecI32x4; } +BinaryenOp BinaryenLtSVecI32x4(void) { return LtSVecI32x4; } +BinaryenOp BinaryenLtUVecI32x4(void) { return LtUVecI32x4; } +BinaryenOp BinaryenGtSVecI32x4(void) { return GtSVecI32x4; } +BinaryenOp BinaryenGtUVecI32x4(void) { return GtUVecI32x4; } +BinaryenOp BinaryenLeSVecI32x4(void) { return LeSVecI32x4; } +BinaryenOp BinaryenLeUVecI32x4(void) { return LeUVecI32x4; } +BinaryenOp BinaryenGeSVecI32x4(void) { return GeSVecI32x4; } 
+BinaryenOp BinaryenGeUVecI32x4(void) { return GeUVecI32x4; } +BinaryenOp BinaryenEqVecF32x4(void) { return EqVecF32x4; } +BinaryenOp BinaryenNeVecF32x4(void) { return NeVecF32x4; } +BinaryenOp BinaryenLtVecF32x4(void) { return LtVecF32x4; } +BinaryenOp BinaryenGtVecF32x4(void) { return GtVecF32x4; } +BinaryenOp BinaryenLeVecF32x4(void) { return LeVecF32x4; } +BinaryenOp BinaryenGeVecF32x4(void) { return GeVecF32x4; } +BinaryenOp BinaryenEqVecF64x2(void) { return EqVecF64x2; } +BinaryenOp BinaryenNeVecF64x2(void) { return NeVecF64x2; } +BinaryenOp BinaryenLtVecF64x2(void) { return LtVecF64x2; } +BinaryenOp BinaryenGtVecF64x2(void) { return GtVecF64x2; } +BinaryenOp BinaryenLeVecF64x2(void) { return LeVecF64x2; } +BinaryenOp BinaryenGeVecF64x2(void) { return GeVecF64x2; } +BinaryenOp BinaryenNotVec128(void) { return NotVec128; } +BinaryenOp BinaryenAndVec128(void) { return AndVec128; } +BinaryenOp BinaryenOrVec128(void) { return OrVec128; } +BinaryenOp BinaryenXorVec128(void) { return XorVec128; } +BinaryenOp BinaryenNegVecI8x16(void) { return NegVecI8x16; } +BinaryenOp BinaryenAnyTrueVecI8x16(void) { return AnyTrueVecI8x16; } +BinaryenOp BinaryenAllTrueVecI8x16(void) { return AllTrueVecI8x16; } +BinaryenOp BinaryenShlVecI8x16(void) { return ShlVecI8x16; } +BinaryenOp BinaryenShrSVecI8x16(void) { return ShrSVecI8x16; } +BinaryenOp BinaryenShrUVecI8x16(void) { return ShrUVecI8x16; } +BinaryenOp BinaryenAddVecI8x16(void) { return AddVecI8x16; } +BinaryenOp BinaryenAddSatSVecI8x16(void) { return AddSatSVecI8x16; } +BinaryenOp BinaryenAddSatUVecI8x16(void) { return AddSatUVecI8x16; } +BinaryenOp BinaryenSubVecI8x16(void) { return SubVecI8x16; } +BinaryenOp BinaryenSubSatSVecI8x16(void) { return SubSatSVecI8x16; } +BinaryenOp BinaryenSubSatUVecI8x16(void) { return SubSatUVecI8x16; } +BinaryenOp BinaryenMulVecI8x16(void) { return MulVecI8x16; } +BinaryenOp BinaryenNegVecI16x8(void) { return NegVecI16x8; } +BinaryenOp BinaryenAnyTrueVecI16x8(void) { return AnyTrueVecI16x8; 
} +BinaryenOp BinaryenAllTrueVecI16x8(void) { return AllTrueVecI16x8; } +BinaryenOp BinaryenShlVecI16x8(void) { return ShlVecI16x8; } +BinaryenOp BinaryenShrSVecI16x8(void) { return ShrSVecI16x8; } +BinaryenOp BinaryenShrUVecI16x8(void) { return ShrUVecI16x8; } +BinaryenOp BinaryenAddVecI16x8(void) { return AddVecI16x8; } +BinaryenOp BinaryenAddSatSVecI16x8(void) { return AddSatSVecI16x8; } +BinaryenOp BinaryenAddSatUVecI16x8(void) { return AddSatUVecI16x8; } +BinaryenOp BinaryenSubVecI16x8(void) { return SubVecI16x8; } +BinaryenOp BinaryenSubSatSVecI16x8(void) { return SubSatSVecI16x8; } +BinaryenOp BinaryenSubSatUVecI16x8(void) { return SubSatUVecI16x8; } +BinaryenOp BinaryenMulVecI16x8(void) { return MulVecI16x8; } +BinaryenOp BinaryenNegVecI32x4(void) { return NegVecI32x4; } +BinaryenOp BinaryenAnyTrueVecI32x4(void) { return AnyTrueVecI32x4; } +BinaryenOp BinaryenAllTrueVecI32x4(void) { return AllTrueVecI32x4; } +BinaryenOp BinaryenShlVecI32x4(void) { return ShlVecI32x4; } +BinaryenOp BinaryenShrSVecI32x4(void) { return ShrSVecI32x4; } +BinaryenOp BinaryenShrUVecI32x4(void) { return ShrUVecI32x4; } +BinaryenOp BinaryenAddVecI32x4(void) { return AddVecI32x4; } +BinaryenOp BinaryenSubVecI32x4(void) { return SubVecI32x4; } +BinaryenOp BinaryenMulVecI32x4(void) { return MulVecI32x4; } +BinaryenOp BinaryenNegVecI64x2(void) { return NegVecI64x2; } +BinaryenOp BinaryenAnyTrueVecI64x2(void) { return AnyTrueVecI64x2; } +BinaryenOp BinaryenAllTrueVecI64x2(void) { return AllTrueVecI64x2; } +BinaryenOp BinaryenShlVecI64x2(void) { return ShlVecI64x2; } +BinaryenOp BinaryenShrSVecI64x2(void) { return ShrSVecI64x2; } +BinaryenOp BinaryenShrUVecI64x2(void) { return ShrUVecI64x2; } +BinaryenOp BinaryenAddVecI64x2(void) { return AddVecI64x2; } +BinaryenOp BinaryenSubVecI64x2(void) { return SubVecI64x2; } +BinaryenOp BinaryenAbsVecF32x4(void) { return AbsVecF32x4; } +BinaryenOp BinaryenNegVecF32x4(void) { return NegVecF32x4; } +BinaryenOp BinaryenSqrtVecF32x4(void) { return 
SqrtVecF32x4; } +BinaryenOp BinaryenAddVecF32x4(void) { return AddVecF32x4; } +BinaryenOp BinaryenSubVecF32x4(void) { return SubVecF32x4; } +BinaryenOp BinaryenMulVecF32x4(void) { return MulVecF32x4; } +BinaryenOp BinaryenDivVecF32x4(void) { return DivVecF32x4; } +BinaryenOp BinaryenMinVecF32x4(void) { return MinVecF32x4; } +BinaryenOp BinaryenMaxVecF32x4(void) { return MaxVecF32x4; } +BinaryenOp BinaryenAbsVecF64x2(void) { return AbsVecF64x2; } +BinaryenOp BinaryenNegVecF64x2(void) { return NegVecF64x2; } +BinaryenOp BinaryenSqrtVecF64x2(void) { return SqrtVecF64x2; } +BinaryenOp BinaryenAddVecF64x2(void) { return AddVecF64x2; } +BinaryenOp BinaryenSubVecF64x2(void) { return SubVecF64x2; } +BinaryenOp BinaryenMulVecF64x2(void) { return MulVecF64x2; } +BinaryenOp BinaryenDivVecF64x2(void) { return DivVecF64x2; } +BinaryenOp BinaryenMinVecF64x2(void) { return MinVecF64x2; } +BinaryenOp BinaryenMaxVecF64x2(void) { return MaxVecF64x2; } +BinaryenOp BinaryenTruncSatSVecF32x4ToVecI32x4(void) { return TruncSatSVecF32x4ToVecI32x4; } +BinaryenOp BinaryenTruncSatUVecF32x4ToVecI32x4(void) { return TruncSatUVecF32x4ToVecI32x4; } +BinaryenOp BinaryenTruncSatSVecF64x2ToVecI64x2(void) { return TruncSatSVecF64x2ToVecI64x2; } +BinaryenOp BinaryenTruncSatUVecF64x2ToVecI64x2(void) { return TruncSatUVecF64x2ToVecI64x2; } +BinaryenOp BinaryenConvertSVecI32x4ToVecF32x4(void) { return ConvertSVecI32x4ToVecF32x4; } +BinaryenOp BinaryenConvertUVecI32x4ToVecF32x4(void) { return ConvertUVecI32x4ToVecF32x4; } +BinaryenOp BinaryenConvertSVecI64x2ToVecF64x2(void) { return ConvertSVecI64x2ToVecF64x2; } +BinaryenOp BinaryenConvertUVecI64x2ToVecF64x2(void) { return ConvertUVecI64x2ToVecF64x2; } BinaryenExpressionRef BinaryenBlock(BinaryenModuleRef module, const char* name, BinaryenExpressionRef* children, BinaryenIndex numChildren, BinaryenType type) { auto* ret = ((Module*)module)->allocator.alloc<Block>(); @@ -854,6 +1027,53 @@ BinaryenExpressionRef BinaryenAtomicWake(BinaryenModuleRef module, 
BinaryenExpre return static_cast<Expression*>(ret); } +BinaryenExpressionRef BinaryenSIMDExtract(BinaryenModuleRef module, BinaryenOp op, BinaryenExpressionRef vec, uint8_t index) { + auto* ret = Builder(*((Module*)module)).makeSIMDExtract(SIMDExtractOp(op), (Expression*) vec, index); + if (tracing) { + traceExpression(ret, "BinaryenSIMDExtract", op, vec, int(index)); + } + return static_cast<Expression*>(ret); +} +BinaryenExpressionRef BinaryenSIMDReplace(BinaryenModuleRef module, BinaryenOp op, BinaryenExpressionRef vec, uint8_t index, BinaryenExpressionRef value) { + auto* ret = Builder(*((Module*)module)).makeSIMDReplace(SIMDReplaceOp(op), (Expression*) vec, index, (Expression*)value); + if (tracing) { + traceExpression(ret, "BinaryenSIMDReplace", op, vec, int(index), value); + } + return static_cast<Expression*>(ret); +} +BinaryenExpressionRef BinaryenSIMDShuffle(BinaryenModuleRef module, BinaryenExpressionRef left, BinaryenExpressionRef right, const uint8_t mask_[16]) { + std::array<uint8_t, 16> mask; + memcpy(mask.data(), mask_, 16); + auto* ret = Builder(*((Module*)module)).makeSIMDShuffle((Expression*)left, (Expression*)right, mask); + if (tracing) { + std::cout << " {\n"; + std::cout << " uint8_t mask[] = {"; + for (size_t i = 0; i < mask.size(); ++i) { + std::cout << int(mask[i]); + if (i < mask.size() - 1) { + std::cout << ", "; + } + } + std::cout << "};\n "; + traceExpression(ret, "BinaryenSIMDShuffle", left, right, "mask"); + std::cout << " }\n"; + } + return static_cast<Expression*>(ret); +} +BinaryenExpressionRef BinaryenSIMDBitselect(BinaryenModuleRef module, BinaryenExpressionRef left, BinaryenExpressionRef right, BinaryenExpressionRef cond) { + auto* ret = Builder(*((Module*)module)).makeSIMDBitselect((Expression*)left, (Expression*)right, (Expression*)cond); + if (tracing) { + traceExpression(ret, "BinaryenSIMDBitselect", left, right, cond); + } + return static_cast<Expression*>(ret); +} +BinaryenExpressionRef 
BinaryenSIMDShift(BinaryenModuleRef module, BinaryenOp op, BinaryenExpressionRef vec, BinaryenExpressionRef shift) { + auto* ret = Builder(*((Module*)module)).makeSIMDShift(SIMDShiftOp(op), (Expression*)vec, (Expression*)shift); + if (tracing) { + traceExpression(ret, "BinaryenSIMDShift", op, vec, shift); + } + return static_cast<Expression*>(ret); +} // Expression utility @@ -1604,6 +1824,155 @@ BinaryenExpressionRef BinaryenAtomicWakeGetWakeCount(BinaryenExpressionRef expr) assert(expression->is<AtomicWake>()); return static_cast<AtomicWake*>(expression)->wakeCount; } +// SIMDExtract +BinaryenOp BinaryenSIMDExtractGetOp(BinaryenExpressionRef expr) { + if (tracing) { + std::cout << " BinaryenSIMDExtractGetOp(expressions[" << expressions[expr] << "]);\n"; + } + + auto* expression = (Expression*)expr; + assert(expression->is<SIMDExtract>()); + return static_cast<SIMDExtract*>(expression)->op; +} +BinaryenExpressionRef BinaryenSIMDExtractGetVec(BinaryenExpressionRef expr) { + if (tracing) { + std::cout << " BinaryenSIMDExtractGetVec(expressions[" << expressions[expr] << "]);\n"; + } + + auto* expression = (Expression*)expr; + assert(expression->is<SIMDExtract>()); + return static_cast<SIMDExtract*>(expression)->vec; +} +uint8_t BinaryenSIMDExtractGetIndex(BinaryenExpressionRef expr) { + if (tracing) { + std::cout << " BinaryenSIMDExtractGetIndex(expressions[" << expressions[expr] << "]);\n"; + } + + auto* expression = (Expression*)expr; + assert(expression->is<SIMDExtract>()); + return static_cast<SIMDExtract*>(expression)->index; +} +// SIMDReplace +BinaryenOp BinaryenSIMDReplaceGetOp(BinaryenExpressionRef expr) { + if (tracing) { + std::cout << " BinaryenSIMDReplaceGetOp(expressions[" << expressions[expr] << "]);\n"; + } + + auto* expression = (Expression*)expr; + assert(expression->is<SIMDReplace>()); + return static_cast<SIMDReplace*>(expression)->op; +} +BinaryenExpressionRef BinaryenSIMDReplaceGetVec(BinaryenExpressionRef expr) { + if (tracing) { + std::cout << 
" BinaryenSIMDReplaceGetVec(expressions[" << expressions[expr] << "]);\n"; + } + + auto* expression = (Expression*)expr; + assert(expression->is<SIMDReplace>()); + return static_cast<SIMDReplace*>(expression)->vec; +} +uint8_t BinaryenSIMDReplaceGetIndex(BinaryenExpressionRef expr) { + if (tracing) { + std::cout << " BinaryenSIMDReplaceGetIndex(expressions[" << expressions[expr] << "]);\n"; + } + + auto* expression = (Expression*)expr; + assert(expression->is<SIMDReplace>()); + return static_cast<SIMDReplace*>(expression)->index; +} +BinaryenExpressionRef BinaryenSIMDReplaceGetValue(BinaryenExpressionRef expr) { + if (tracing) { + std::cout << " BinaryenSIMDReplaceGetValue(expressions[" << expressions[expr] << "]);\n"; + } + + auto* expression = (Expression*)expr; + assert(expression->is<SIMDReplace>()); + return static_cast<SIMDReplace*>(expression)->value; +} +// SIMDShuffle +BinaryenExpressionRef BinaryenSIMDShuffleGetLeft(BinaryenExpressionRef expr) { + if (tracing) { + std::cout << " BinaryenSIMDShuffleGetLeft(expressions[" << expressions[expr] << "]);\n"; + } + + auto* expression = (Expression*)expr; + assert(expression->is<SIMDShuffle>()); + return static_cast<SIMDShuffle*>(expression)->left; +} +BinaryenExpressionRef BinaryenSIMDShuffleGetRight(BinaryenExpressionRef expr) { + if (tracing) { + std::cout << " BinaryenSIMDShuffleGetRight(expressions[" << expressions[expr] << "]);\n"; + } + + auto* expression = (Expression*)expr; + assert(expression->is<SIMDShuffle>()); + return static_cast<SIMDShuffle*>(expression)->right; +} +void BinaryenSIMDShuffleGetMask(BinaryenExpressionRef expr, uint8_t *mask) { + if (tracing) { + std::cout << " BinaryenSIMDShuffleGetMask(expressions[" << expressions[expr] << "]);\n"; + } + + auto* expression = (Expression*)expr; + assert(expression->is<SIMDShuffle>()); + memcpy(mask, static_cast<SIMDShuffle*>(expression)->mask.data(), 16); +} +// SIMDBitselect +BinaryenExpressionRef BinaryenSIMDBitselectGetLeft(BinaryenExpressionRef 
expr) { + if (tracing) { + std::cout << " BinaryenSIMDBitselectGetLeft(expressions[" << expressions[expr] << "]);\n"; + } + + auto* expression = (Expression*)expr; + assert(expression->is<SIMDBitselect>()); + return static_cast<SIMDBitselect*>(expression)->left; +} +BinaryenExpressionRef BinaryenSIMDBitselectGetRight(BinaryenExpressionRef expr) { + if (tracing) { + std::cout << " BinaryenSIMDBitselectGetRight(expressions[" << expressions[expr] << "]);\n"; + } + + auto* expression = (Expression*)expr; + assert(expression->is<SIMDBitselect>()); + return static_cast<SIMDBitselect*>(expression)->right; +} +BinaryenExpressionRef BinaryenSIMDBitselectGetCond(BinaryenExpressionRef expr) { + if (tracing) { + std::cout << " BinaryenSIMDBitselectGetCond(expressions[" << expressions[expr] << "]);\n"; + } + + auto* expression = (Expression*)expr; + assert(expression->is<SIMDBitselect>()); + return static_cast<SIMDBitselect*>(expression)->cond; +} +// SIMDShift +BinaryenOp BinaryenSIMDShiftGetOp(BinaryenExpressionRef expr) { + if (tracing) { + std::cout << " BinaryenSIMDShiftGetOp(expressions[" << expressions[expr] << "]);\n"; + } + + auto* expression = (Expression*)expr; + assert(expression->is<SIMDShift>()); + return static_cast<SIMDShift*>(expression)->op; +} +BinaryenExpressionRef BinaryenSIMDShiftGetVec(BinaryenExpressionRef expr) { + if (tracing) { + std::cout << " BinaryenSIMDShiftGetVec(expressions[" << expressions[expr] << "]);\n"; + } + + auto* expression = (Expression*)expr; + assert(expression->is<SIMDShift>()); + return static_cast<SIMDShift*>(expression)->vec; +} +BinaryenExpressionRef BinaryenSIMDShiftGetShift(BinaryenExpressionRef expr) { + if (tracing) { + std::cout << " BinaryenSIMDShiftGetShift(expressions[" << expressions[expr] << "]);\n"; + } + + auto* expression = (Expression*)expr; + assert(expression->is<SIMDShift>()); + return static_cast<SIMDShift*>(expression)->shift; +} // Functions diff --git a/src/binaryen-c.h b/src/binaryen-c.h index 
dc47b379f..2a3254f7b 100644 --- a/src/binaryen-c.h +++ b/src/binaryen-c.h @@ -117,6 +117,11 @@ BinaryenExpressionId BinaryenAtomicCmpxchgId(void); BinaryenExpressionId BinaryenAtomicRMWId(void); BinaryenExpressionId BinaryenAtomicWaitId(void); BinaryenExpressionId BinaryenAtomicWakeId(void); +BinaryenExpressionId BinaryenSIMDExtractId(void); +BinaryenExpressionId BinaryenSIMDReplaceId(void); +BinaryenExpressionId BinaryenSIMDShuffleId(void); +BinaryenExpressionId BinaryenSIMDBitselectId(void); +BinaryenExpressionId BinaryenSIMDShiftId(void); // External kinds (call to get the value of each; you can cache them) @@ -166,6 +171,7 @@ struct BinaryenLiteral { int64_t i64; float f32; double f64; + uint8_t v128[16]; }; }; @@ -173,6 +179,7 @@ struct BinaryenLiteral BinaryenLiteralInt32(int32_t x); struct BinaryenLiteral BinaryenLiteralInt64(int64_t x); struct BinaryenLiteral BinaryenLiteralFloat32(float x); struct BinaryenLiteral BinaryenLiteralFloat64(double x); +struct BinaryenLiteral BinaryenLiteralVec128(const uint8_t x[16]); struct BinaryenLiteral BinaryenLiteralFloat32Bits(int32_t x); struct BinaryenLiteral BinaryenLiteralFloat64Bits(int64_t x); @@ -332,6 +339,141 @@ BinaryenOp BinaryenTruncSatSFloat64ToInt32(void); BinaryenOp BinaryenTruncSatSFloat64ToInt64(void); BinaryenOp BinaryenTruncSatUFloat64ToInt32(void); BinaryenOp BinaryenTruncSatUFloat64ToInt64(void); +BinaryenOp BinaryenSplatVecI8x16(void); +BinaryenOp BinaryenExtractLaneSVecI8x16(void); +BinaryenOp BinaryenExtractLaneUVecI8x16(void); +BinaryenOp BinaryenReplaceLaneVecI8x16(void); +BinaryenOp BinaryenSplatVecI16x8(void); +BinaryenOp BinaryenExtractLaneSVecI16x8(void); +BinaryenOp BinaryenExtractLaneUVecI16x8(void); +BinaryenOp BinaryenReplaceLaneVecI16x8(void); +BinaryenOp BinaryenSplatVecI32x4(void); +BinaryenOp BinaryenExtractLaneVecI32x4(void); +BinaryenOp BinaryenReplaceLaneVecI32x4(void); +BinaryenOp BinaryenSplatVecI64x2(void); +BinaryenOp BinaryenExtractLaneVecI64x2(void); +BinaryenOp 
BinaryenReplaceLaneVecI64x2(void); +BinaryenOp BinaryenSplatVecF32x4(void); +BinaryenOp BinaryenExtractLaneVecF32x4(void); +BinaryenOp BinaryenReplaceLaneVecF32x4(void); +BinaryenOp BinaryenSplatVecF64x2(void); +BinaryenOp BinaryenExtractLaneVecF64x2(void); +BinaryenOp BinaryenReplaceLaneVecF64x2(void); +BinaryenOp BinaryenEqVecI8x16(void); +BinaryenOp BinaryenNeVecI8x16(void); +BinaryenOp BinaryenLtSVecI8x16(void); +BinaryenOp BinaryenLtUVecI8x16(void); +BinaryenOp BinaryenGtSVecI8x16(void); +BinaryenOp BinaryenGtUVecI8x16(void); +BinaryenOp BinaryenLeSVecI8x16(void); +BinaryenOp BinaryenLeUVecI8x16(void); +BinaryenOp BinaryenGeSVecI8x16(void); +BinaryenOp BinaryenGeUVecI8x16(void); +BinaryenOp BinaryenEqVecI16x8(void); +BinaryenOp BinaryenNeVecI16x8(void); +BinaryenOp BinaryenLtSVecI16x8(void); +BinaryenOp BinaryenLtUVecI16x8(void); +BinaryenOp BinaryenGtSVecI16x8(void); +BinaryenOp BinaryenGtUVecI16x8(void); +BinaryenOp BinaryenLeSVecI16x8(void); +BinaryenOp BinaryenLeUVecI16x8(void); +BinaryenOp BinaryenGeSVecI16x8(void); +BinaryenOp BinaryenGeUVecI16x8(void); +BinaryenOp BinaryenEqVecI32x4(void); +BinaryenOp BinaryenNeVecI32x4(void); +BinaryenOp BinaryenLtSVecI32x4(void); +BinaryenOp BinaryenLtUVecI32x4(void); +BinaryenOp BinaryenGtSVecI32x4(void); +BinaryenOp BinaryenGtUVecI32x4(void); +BinaryenOp BinaryenLeSVecI32x4(void); +BinaryenOp BinaryenLeUVecI32x4(void); +BinaryenOp BinaryenGeSVecI32x4(void); +BinaryenOp BinaryenGeUVecI32x4(void); +BinaryenOp BinaryenEqVecF32x4(void); +BinaryenOp BinaryenNeVecF32x4(void); +BinaryenOp BinaryenLtVecF32x4(void); +BinaryenOp BinaryenGtVecF32x4(void); +BinaryenOp BinaryenLeVecF32x4(void); +BinaryenOp BinaryenGeVecF32x4(void); +BinaryenOp BinaryenEqVecF64x2(void); +BinaryenOp BinaryenNeVecF64x2(void); +BinaryenOp BinaryenLtVecF64x2(void); +BinaryenOp BinaryenGtVecF64x2(void); +BinaryenOp BinaryenLeVecF64x2(void); +BinaryenOp BinaryenGeVecF64x2(void); +BinaryenOp BinaryenNotVec128(void); +BinaryenOp BinaryenAndVec128(void); 
+BinaryenOp BinaryenOrVec128(void); +BinaryenOp BinaryenXorVec128(void); +BinaryenOp BinaryenNegVecI8x16(void); +BinaryenOp BinaryenAnyTrueVecI8x16(void); +BinaryenOp BinaryenAllTrueVecI8x16(void); +BinaryenOp BinaryenShlVecI8x16(void); +BinaryenOp BinaryenShrSVecI8x16(void); +BinaryenOp BinaryenShrUVecI8x16(void); +BinaryenOp BinaryenAddVecI8x16(void); +BinaryenOp BinaryenAddSatSVecI8x16(void); +BinaryenOp BinaryenAddSatUVecI8x16(void); +BinaryenOp BinaryenSubVecI8x16(void); +BinaryenOp BinaryenSubSatSVecI8x16(void); +BinaryenOp BinaryenSubSatUVecI8x16(void); +BinaryenOp BinaryenMulVecI8x16(void); +BinaryenOp BinaryenNegVecI16x8(void); +BinaryenOp BinaryenAnyTrueVecI16x8(void); +BinaryenOp BinaryenAllTrueVecI16x8(void); +BinaryenOp BinaryenShlVecI16x8(void); +BinaryenOp BinaryenShrSVecI16x8(void); +BinaryenOp BinaryenShrUVecI16x8(void); +BinaryenOp BinaryenAddVecI16x8(void); +BinaryenOp BinaryenAddSatSVecI16x8(void); +BinaryenOp BinaryenAddSatUVecI16x8(void); +BinaryenOp BinaryenSubVecI16x8(void); +BinaryenOp BinaryenSubSatSVecI16x8(void); +BinaryenOp BinaryenSubSatUVecI16x8(void); +BinaryenOp BinaryenMulVecI16x8(void); +BinaryenOp BinaryenNegVecI32x4(void); +BinaryenOp BinaryenAnyTrueVecI32x4(void); +BinaryenOp BinaryenAllTrueVecI32x4(void); +BinaryenOp BinaryenShlVecI32x4(void); +BinaryenOp BinaryenShrSVecI32x4(void); +BinaryenOp BinaryenShrUVecI32x4(void); +BinaryenOp BinaryenAddVecI32x4(void); +BinaryenOp BinaryenSubVecI32x4(void); +BinaryenOp BinaryenMulVecI32x4(void); +BinaryenOp BinaryenNegVecI64x2(void); +BinaryenOp BinaryenAnyTrueVecI64x2(void); +BinaryenOp BinaryenAllTrueVecI64x2(void); +BinaryenOp BinaryenShlVecI64x2(void); +BinaryenOp BinaryenShrSVecI64x2(void); +BinaryenOp BinaryenShrUVecI64x2(void); +BinaryenOp BinaryenAddVecI64x2(void); +BinaryenOp BinaryenSubVecI64x2(void); +BinaryenOp BinaryenAbsVecF32x4(void); +BinaryenOp BinaryenNegVecF32x4(void); +BinaryenOp BinaryenSqrtVecF32x4(void); +BinaryenOp BinaryenAddVecF32x4(void); +BinaryenOp 
BinaryenSubVecF32x4(void); +BinaryenOp BinaryenMulVecF32x4(void); +BinaryenOp BinaryenDivVecF32x4(void); +BinaryenOp BinaryenMinVecF32x4(void); +BinaryenOp BinaryenMaxVecF32x4(void); +BinaryenOp BinaryenAbsVecF64x2(void); +BinaryenOp BinaryenNegVecF64x2(void); +BinaryenOp BinaryenSqrtVecF64x2(void); +BinaryenOp BinaryenAddVecF64x2(void); +BinaryenOp BinaryenSubVecF64x2(void); +BinaryenOp BinaryenMulVecF64x2(void); +BinaryenOp BinaryenDivVecF64x2(void); +BinaryenOp BinaryenMinVecF64x2(void); +BinaryenOp BinaryenMaxVecF64x2(void); +BinaryenOp BinaryenTruncSatSVecF32x4ToVecI32x4(void); +BinaryenOp BinaryenTruncSatUVecF32x4ToVecI32x4(void); +BinaryenOp BinaryenTruncSatSVecF64x2ToVecI64x2(void); +BinaryenOp BinaryenTruncSatUVecF64x2ToVecI64x2(void); +BinaryenOp BinaryenConvertSVecI32x4ToVecF32x4(void); +BinaryenOp BinaryenConvertUVecI32x4ToVecF32x4(void); +BinaryenOp BinaryenConvertSVecI64x2ToVecF64x2(void); +BinaryenOp BinaryenConvertUVecI64x2ToVecF64x2(void); typedef void* BinaryenExpressionRef; @@ -393,192 +535,139 @@ BinaryenExpressionRef BinaryenAtomicRMW(BinaryenModuleRef module, BinaryenOp op, BinaryenExpressionRef BinaryenAtomicCmpxchg(BinaryenModuleRef module, BinaryenIndex bytes, BinaryenIndex offset, BinaryenExpressionRef ptr, BinaryenExpressionRef expected, BinaryenExpressionRef replacement, BinaryenType type); BinaryenExpressionRef BinaryenAtomicWait(BinaryenModuleRef module, BinaryenExpressionRef ptr, BinaryenExpressionRef expected, BinaryenExpressionRef timeout, BinaryenType type); BinaryenExpressionRef BinaryenAtomicWake(BinaryenModuleRef module, BinaryenExpressionRef ptr, BinaryenExpressionRef wakeCount); +BinaryenExpressionRef BinaryenSIMDExtract(BinaryenModuleRef module, BinaryenOp op, BinaryenExpressionRef vec, uint8_t index); +BinaryenExpressionRef BinaryenSIMDReplace(BinaryenModuleRef module, BinaryenOp op, BinaryenExpressionRef vec, uint8_t index, BinaryenExpressionRef value); +BinaryenExpressionRef BinaryenSIMDShuffle(BinaryenModuleRef module, 
BinaryenExpressionRef left, BinaryenExpressionRef right, const uint8_t mask[16]); +BinaryenExpressionRef BinaryenSIMDBitselect(BinaryenModuleRef module, BinaryenExpressionRef left, BinaryenExpressionRef right, BinaryenExpressionRef cond); +BinaryenExpressionRef BinaryenSIMDShift(BinaryenModuleRef module, BinaryenOp op, BinaryenExpressionRef vec, BinaryenExpressionRef shift); -// Gets the id (kind) of the specified expression. BinaryenExpressionId BinaryenExpressionGetId(BinaryenExpressionRef expr); -// Gets the type of the specified expression. BinaryenType BinaryenExpressionGetType(BinaryenExpressionRef expr); -// Prints an expression to stdout. Useful for debugging. void BinaryenExpressionPrint(BinaryenExpressionRef expr); -// Gets the name of the specified `Block` expression. May be `NULL`. const char* BinaryenBlockGetName(BinaryenExpressionRef expr); -// Gets the number of nested child expressions within the specified `Block` expression. BinaryenIndex BinaryenBlockGetNumChildren(BinaryenExpressionRef expr); -// Gets the nested child expression at the specified index within the specified `Block` expression. BinaryenExpressionRef BinaryenBlockGetChild(BinaryenExpressionRef expr, BinaryenIndex index); -// Gets the nested condition expression within the specified `If` expression. BinaryenExpressionRef BinaryenIfGetCondition(BinaryenExpressionRef expr); -// Gets the nested ifTrue expression within the specified `If` expression. BinaryenExpressionRef BinaryenIfGetIfTrue(BinaryenExpressionRef expr); -// Gets the nested ifFalse expression within the specified `If` expression. BinaryenExpressionRef BinaryenIfGetIfFalse(BinaryenExpressionRef expr); -// Gets the name of the specified `Loop` expression. May be `NULL`. const char* BinaryenLoopGetName(BinaryenExpressionRef expr); -// Gets the nested body expression within the specified `Loop` expression. 
BinaryenExpressionRef BinaryenLoopGetBody(BinaryenExpressionRef expr); -// Gets the name of the specified `Break` expression. May be `NULL`. const char* BinaryenBreakGetName(BinaryenExpressionRef expr); -// Gets the nested condition expression within the specified `Break` expression. Returns `NULL` if this is a `br` and not a `br_if`. BinaryenExpressionRef BinaryenBreakGetCondition(BinaryenExpressionRef expr); -// Gets the nested value expression within the specified `Break` expression. May be `NULL`. BinaryenExpressionRef BinaryenBreakGetValue(BinaryenExpressionRef expr); -// Gets the number of names within the specified `Switch` expression. BinaryenIndex BinaryenSwitchGetNumNames(BinaryenExpressionRef expr); -// Gets the name at the specified index within the specified `Switch` expression. const char* BinaryenSwitchGetName(BinaryenExpressionRef expr, BinaryenIndex index); -// Gets the default name of the specified `Switch` expression. const char* BinaryenSwitchGetDefaultName(BinaryenExpressionRef expr); -// Gets the nested condition expression within the specified `Switch` expression. BinaryenExpressionRef BinaryenSwitchGetCondition(BinaryenExpressionRef expr); -// Gets the nested value expression within the specifiedd `Switch` expression. May be `NULL`. BinaryenExpressionRef BinaryenSwitchGetValue(BinaryenExpressionRef expr); -// Gets the name of the target of the specified `Call` expression. const char* BinaryenCallGetTarget(BinaryenExpressionRef expr); -// Gets the number of nested operand expressions within the specified `Call` expression. BinaryenIndex BinaryenCallGetNumOperands(BinaryenExpressionRef expr); -// Gets the nested operand expression at the specified index within the specified `Call` expression. BinaryenExpressionRef BinaryenCallGetOperand(BinaryenExpressionRef expr, BinaryenIndex index); -// Gets the nested target expression of the specified `CallIndirect` expression. 
BinaryenExpressionRef BinaryenCallIndirectGetTarget(BinaryenExpressionRef expr); -// Gets the number of nested operand expressions within the specified `CallIndirect` expression. BinaryenIndex BinaryenCallIndirectGetNumOperands(BinaryenExpressionRef expr); -// Gets the nested operand expression at the specified index within the specified `CallIndirect` expression. BinaryenExpressionRef BinaryenCallIndirectGetOperand(BinaryenExpressionRef expr, BinaryenIndex index); -// Gets the index of the specified `GetLocal` expression. BinaryenIndex BinaryenGetLocalGetIndex(BinaryenExpressionRef expr); -// Tests if the specified `SetLocal` expression performs a `tee_local` instead of a `set_local`. int BinaryenSetLocalIsTee(BinaryenExpressionRef expr); -// Gets the index of the specified `SetLocal` expression. BinaryenIndex BinaryenSetLocalGetIndex(BinaryenExpressionRef expr); -// Gets the nested value expression within the specified `SetLocal` expression. BinaryenExpressionRef BinaryenSetLocalGetValue(BinaryenExpressionRef expr); -// Gets the name of the specified `GetGlobal` expression. const char* BinaryenGetGlobalGetName(BinaryenExpressionRef expr); -// Gets the name of the specified `SetGlobal` expression. const char* BinaryenSetGlobalGetName(BinaryenExpressionRef expr); -// Gets the nested value expression within the specified `SetLocal` expression. BinaryenExpressionRef BinaryenSetGlobalGetValue(BinaryenExpressionRef expr); -// Gets the operator of the specified `Host` expression. BinaryenOp BinaryenHostGetOp(BinaryenExpressionRef expr); -// Gets the name operand of the specified `Host` expression. May be `NULL`. const char* BinaryenHostGetNameOperand(BinaryenExpressionRef expr); -// Gets the number of nested operand expressions within the specified `Host` expression. BinaryenIndex BinaryenHostGetNumOperands(BinaryenExpressionRef expr); -// Gets the nested operand expression at the specified index within the specified `Host` expression. 
BinaryenExpressionRef BinaryenHostGetOperand(BinaryenExpressionRef expr, BinaryenIndex index); -// Tests if the specified `Load` expression is atomic. int BinaryenLoadIsAtomic(BinaryenExpressionRef expr); -// Tests if the specified `Load` expression is signed. int BinaryenLoadIsSigned(BinaryenExpressionRef expr); -// Gets the offset of the specified `Load` expression. uint32_t BinaryenLoadGetOffset(BinaryenExpressionRef expr); -// Gets the byte size of the specified `Load` expression. uint32_t BinaryenLoadGetBytes(BinaryenExpressionRef expr); -// Gets the alignment of the specified `Load` expression. uint32_t BinaryenLoadGetAlign(BinaryenExpressionRef expr); -// Gets the nested pointer expression within the specified `Load` expression. BinaryenExpressionRef BinaryenLoadGetPtr(BinaryenExpressionRef expr); -// Tests if the specified `Store` expression is atomic. int BinaryenStoreIsAtomic(BinaryenExpressionRef expr); -// Gets the byte size of the specified `Store` expression. uint32_t BinaryenStoreGetBytes(BinaryenExpressionRef expr); -// Gets the offset of the specified store expression. uint32_t BinaryenStoreGetOffset(BinaryenExpressionRef expr); -// Gets the alignment of the specified `Store` expression. uint32_t BinaryenStoreGetAlign(BinaryenExpressionRef expr); -// Gets the nested pointer expression within the specified `Store` expression. BinaryenExpressionRef BinaryenStoreGetPtr(BinaryenExpressionRef expr); -// Gets the nested value expression within the specified `Store` expression. BinaryenExpressionRef BinaryenStoreGetValue(BinaryenExpressionRef expr); -// Gets the 32-bit integer value of the specified `Const` expression. int32_t BinaryenConstGetValueI32(BinaryenExpressionRef expr); -// Gets the 64-bit integer value of the specified `Const` expression. int64_t BinaryenConstGetValueI64(BinaryenExpressionRef expr); -// Gets the low 32-bits of a 64-bit integer value of the specified `Const` expression. Useful where I64 returning exports are illegal, i.e. 
binaryen.js. int32_t BinaryenConstGetValueI64Low(BinaryenExpressionRef expr); -// Gets the high 32-bits of a 64-bit integer value of the specified `Const` expression. Useful where I64 returning exports are illegal, i.e. binaryen.js. int32_t BinaryenConstGetValueI64High(BinaryenExpressionRef expr); -// Gets the 32-bit float value of the specified `Const` expression. float BinaryenConstGetValueF32(BinaryenExpressionRef expr); -// Gets the 64-bit float value of the specified `Const` expression. double BinaryenConstGetValueF64(BinaryenExpressionRef expr); -// Gets the operator of the specified `Unary` expression. BinaryenOp BinaryenUnaryGetOp(BinaryenExpressionRef expr); -// Gets the nested value expression within the specified `Unary` expression. BinaryenExpressionRef BinaryenUnaryGetValue(BinaryenExpressionRef expr); -// Gets the operator of the specified `Binary` expression. BinaryenOp BinaryenBinaryGetOp(BinaryenExpressionRef expr); -// Gets the nested left expression within the specified `Binary` expression. BinaryenExpressionRef BinaryenBinaryGetLeft(BinaryenExpressionRef expr); -// Gets the nested right expression within the specified `Binary` expression. BinaryenExpressionRef BinaryenBinaryGetRight(BinaryenExpressionRef expr); -// Gets the nested ifTrue expression within the specified `Select` expression. BinaryenExpressionRef BinaryenSelectGetIfTrue(BinaryenExpressionRef expr); -// Gets the nested ifFalse expression within the specified `Select` expression. BinaryenExpressionRef BinaryenSelectGetIfFalse(BinaryenExpressionRef expr); -// Gets the nested condition expression within the specified `Select` expression. BinaryenExpressionRef BinaryenSelectGetCondition(BinaryenExpressionRef expr); -// Gets the nested value expression within the specified `Drop` expression. BinaryenExpressionRef BinaryenDropGetValue(BinaryenExpressionRef expr); -// Gets the nested value expression within the specified `Return` expression. 
BinaryenExpressionRef BinaryenReturnGetValue(BinaryenExpressionRef expr); -// Gets the operator of the specified `AtomicRMW` expression. BinaryenOp BinaryenAtomicRMWGetOp(BinaryenExpressionRef expr); -// Gets the byte size of the specified `AtomicRMW` expression. uint32_t BinaryenAtomicRMWGetBytes(BinaryenExpressionRef expr); -// Gets the offset of the specified `AtomicRMW` expression. uint32_t BinaryenAtomicRMWGetOffset(BinaryenExpressionRef expr); -// Gets the nested pointer expression within the specified `AtomicRMW` expression. BinaryenExpressionRef BinaryenAtomicRMWGetPtr(BinaryenExpressionRef expr); -// Gets the nested value expression within the specified `AtomicRMW` expression. BinaryenExpressionRef BinaryenAtomicRMWGetValue(BinaryenExpressionRef expr); -// Gets the byte size of the specified `AtomicCmpxchg` expression. uint32_t BinaryenAtomicCmpxchgGetBytes(BinaryenExpressionRef expr); -// Gets the offset of the specified `AtomicCmpxchg` expression. uint32_t BinaryenAtomicCmpxchgGetOffset(BinaryenExpressionRef expr); -// Gets the nested pointer expression within the specified `AtomicCmpxchg` expression. BinaryenExpressionRef BinaryenAtomicCmpxchgGetPtr(BinaryenExpressionRef expr); -// Gets the nested expected value expression within the specified `AtomicCmpxchg` expression. BinaryenExpressionRef BinaryenAtomicCmpxchgGetExpected(BinaryenExpressionRef expr); -// Gets the nested replacement value expression within the specified `AtomicCmpxchg` expression. BinaryenExpressionRef BinaryenAtomicCmpxchgGetReplacement(BinaryenExpressionRef expr); -// Gets the nested pointer expression within the specified `AtomicWait` expression. BinaryenExpressionRef BinaryenAtomicWaitGetPtr(BinaryenExpressionRef expr); -// Gets the nested expected value expression within the specified `AtomicWait` expression. BinaryenExpressionRef BinaryenAtomicWaitGetExpected(BinaryenExpressionRef expr); -// Gets the nested timeout expression within the specified `AtomicWait` expression. 
BinaryenExpressionRef BinaryenAtomicWaitGetTimeout(BinaryenExpressionRef expr); -// Gets the expected type of the specified `AtomicWait` expression. BinaryenType BinaryenAtomicWaitGetExpectedType(BinaryenExpressionRef expr); -// Gets the nested pointer expression within the specified `AtomicWake` expression. BinaryenExpressionRef BinaryenAtomicWakeGetPtr(BinaryenExpressionRef expr); -// Gets the nested wake count expression within the specified `AtomicWake` expression. BinaryenExpressionRef BinaryenAtomicWakeGetWakeCount(BinaryenExpressionRef expr); +BinaryenOp BinaryenSIMDExtractGetOp(BinaryenExpressionRef expr); +BinaryenExpressionRef BinaryenSIMDExtractGetVec(BinaryenExpressionRef expr); +uint8_t BinaryenSIMDExtractGetIndex(BinaryenExpressionRef expr); + +BinaryenOp BinaryenSIMDReplaceGetOp(BinaryenExpressionRef expr); +BinaryenExpressionRef BinaryenSIMDReplaceGetVec(BinaryenExpressionRef expr); +uint8_t BinaryenSIMDReplaceGetIndex(BinaryenExpressionRef expr); +BinaryenExpressionRef BinaryenSIMDReplaceGetValue(BinaryenExpressionRef expr); + +BinaryenExpressionRef BinaryenSIMDShuffleGetLeft(BinaryenExpressionRef expr); +BinaryenExpressionRef BinaryenSIMDShuffleGetRight(BinaryenExpressionRef expr); +void BinaryenSIMDShuffleGetMask(BinaryenExpressionRef expr, uint8_t *mask); + +BinaryenExpressionRef BinaryenSIMDBitselectGetLeft(BinaryenExpressionRef expr); +BinaryenExpressionRef BinaryenSIMDBitselectGetRight(BinaryenExpressionRef expr); +BinaryenExpressionRef BinaryenSIMDBitselectGetCond(BinaryenExpressionRef expr); + +BinaryenOp BinaryenSIMDShiftGetOp(BinaryenExpressionRef expr); +BinaryenExpressionRef BinaryenSIMDShiftGetVec(BinaryenExpressionRef expr); +BinaryenExpressionRef BinaryenSIMDShiftGetShift(BinaryenExpressionRef expr); + + // Functions typedef void* BinaryenFunctionRef; diff --git a/src/dataflow/graph.h b/src/dataflow/graph.h index 7f5654f8d..85b37b7b0 100644 --- a/src/dataflow/graph.h +++ b/src/dataflow/graph.h @@ -40,7 +40,7 @@ namespace DataFlow { 
// contains the DataFlow IR for that expression, which can be a // Bad node if not supported, or nullptr if not relevant (we only // use the return value for internal expressions, that is, the -// value of a set_local or the condition of an if etc). +// value of a local.set or the condition of an if etc). struct Graph : public UnifiedExpressionVisitor<Graph, Node*> { // We only need one canonical bad node. It is never modified. Node bad = Node(Node::Type::Bad); @@ -153,7 +153,7 @@ struct Graph : public UnifiedExpressionVisitor<Graph, Node*> { } Node* makeZero(wasm::Type type) { - return makeConst(LiteralUtils::makeLiteralZero(type)); + return makeConst(Literal::makeZero(type)); } // Add a new node to our list of owned nodes. @@ -699,7 +699,7 @@ struct Graph : public UnifiedExpressionVisitor<Graph, Node*> { return node; } - // Given a node representing something that is set_local'd, return + // Given a node representing something that is local.set'd, return // the set. SetLocal* getSet(Node* node) { auto iter = nodeParentMap.find(node); @@ -721,7 +721,7 @@ struct Graph : public UnifiedExpressionVisitor<Graph, Node*> { } // Creates an expression that uses a node. Generally, a node represents - // a value in a local, so we create a get_local for it. + // a value in a local, so we create a local.get for it. 
Expression* makeUse(Node* node) { Builder builder(*module); if (node->isPhi()) { diff --git a/src/gen-s-parser.inc b/src/gen-s-parser.inc index 16399bfba..6db1d7a2c 100644 --- a/src/gen-s-parser.inc +++ b/src/gen-s-parser.inc @@ -60,311 +60,531 @@ switch (op[0]) { case 'f': { switch (op[1]) { case '3': { - switch (op[4]) { - case 'a': { - switch (op[5]) { - case 'b': - if (strcmp(op, "f32.abs") == 0) return makeUnary(s, UnaryOp::AbsFloat32); - goto parse_error; - case 'd': - if (strcmp(op, "f32.add") == 0) return makeBinary(s, BinaryOp::AddFloat32); - goto parse_error; - default: goto parse_error; - } - } - case 'c': { - switch (op[5]) { - case 'e': - if (strcmp(op, "f32.ceil") == 0) return makeUnary(s, UnaryOp::CeilFloat32); - goto parse_error; - case 'o': { - switch (op[6]) { - case 'n': { - switch (op[7]) { - case 's': - if (strcmp(op, "f32.const") == 0) return makeConst(s, f32); - goto parse_error; - case 'v': { - switch (op[12]) { - case 's': { - switch (op[15]) { - case '3': - if (strcmp(op, "f32.convert_s/i32") == 0) return makeUnary(s, UnaryOp::ConvertSInt32ToFloat32); - goto parse_error; - case '6': - if (strcmp(op, "f32.convert_s/i64") == 0) return makeUnary(s, UnaryOp::ConvertSInt64ToFloat32); - goto parse_error; - default: goto parse_error; - } - } - case 'u': { - switch (op[15]) { - case '3': - if (strcmp(op, "f32.convert_u/i32") == 0) return makeUnary(s, UnaryOp::ConvertUInt32ToFloat32); - goto parse_error; - case '6': - if (strcmp(op, "f32.convert_u/i64") == 0) return makeUnary(s, UnaryOp::ConvertUInt64ToFloat32); - goto parse_error; + switch (op[3]) { + case '.': { + switch (op[4]) { + case 'a': { + switch (op[5]) { + case 'b': + if (strcmp(op, "f32.abs") == 0) return makeUnary(s, UnaryOp::AbsFloat32); + goto parse_error; + case 'd': + if (strcmp(op, "f32.add") == 0) return makeBinary(s, BinaryOp::AddFloat32); + goto parse_error; + default: goto parse_error; + } + } + case 'c': { + switch (op[5]) { + case 'e': + if (strcmp(op, "f32.ceil") == 0) 
return makeUnary(s, UnaryOp::CeilFloat32); + goto parse_error; + case 'o': { + switch (op[6]) { + case 'n': { + switch (op[7]) { + case 's': + if (strcmp(op, "f32.const") == 0) return makeConst(s, f32); + goto parse_error; + case 'v': { + switch (op[13]) { + case '3': { + switch (op[16]) { + case 's': + if (strcmp(op, "f32.convert_i32_s") == 0) return makeUnary(s, UnaryOp::ConvertSInt32ToFloat32); + goto parse_error; + case 'u': + if (strcmp(op, "f32.convert_i32_u") == 0) return makeUnary(s, UnaryOp::ConvertUInt32ToFloat32); + goto parse_error; + default: goto parse_error; + } + } + case '6': { + switch (op[16]) { + case 's': + if (strcmp(op, "f32.convert_i64_s") == 0) return makeUnary(s, UnaryOp::ConvertSInt64ToFloat32); + goto parse_error; + case 'u': + if (strcmp(op, "f32.convert_i64_u") == 0) return makeUnary(s, UnaryOp::ConvertUInt64ToFloat32); + goto parse_error; + default: goto parse_error; + } + } default: goto parse_error; } } default: goto parse_error; } } + case 'p': + if (strcmp(op, "f32.copysign") == 0) return makeBinary(s, BinaryOp::CopySignFloat32); + goto parse_error; default: goto parse_error; } } - case 'p': - if (strcmp(op, "f32.copysign") == 0) return makeBinary(s, BinaryOp::CopySignFloat32); + default: goto parse_error; + } + } + case 'd': { + switch (op[5]) { + case 'e': + if (strcmp(op, "f32.demote_f64") == 0) return makeUnary(s, UnaryOp::DemoteFloat64); + goto parse_error; + case 'i': + if (strcmp(op, "f32.div") == 0) return makeBinary(s, BinaryOp::DivFloat32); goto parse_error; default: goto parse_error; } } - default: goto parse_error; - } - } - case 'd': { - switch (op[5]) { - case 'e': - if (strcmp(op, "f32.demote/f64") == 0) return makeUnary(s, UnaryOp::DemoteFloat64); - goto parse_error; - case 'i': - if (strcmp(op, "f32.div") == 0) return makeBinary(s, BinaryOp::DivFloat32); - goto parse_error; - default: goto parse_error; - } - } - case 'e': - if (strcmp(op, "f32.eq") == 0) return makeBinary(s, BinaryOp::EqFloat32); - goto 
parse_error; - case 'f': - if (strcmp(op, "f32.floor") == 0) return makeUnary(s, UnaryOp::FloorFloat32); - goto parse_error; - case 'g': { - switch (op[5]) { case 'e': - if (strcmp(op, "f32.ge") == 0) return makeBinary(s, BinaryOp::GeFloat32); + if (strcmp(op, "f32.eq") == 0) return makeBinary(s, BinaryOp::EqFloat32); goto parse_error; - case 't': - if (strcmp(op, "f32.gt") == 0) return makeBinary(s, BinaryOp::GtFloat32); + case 'f': + if (strcmp(op, "f32.floor") == 0) return makeUnary(s, UnaryOp::FloorFloat32); goto parse_error; - default: goto parse_error; - } - } - case 'l': { - switch (op[5]) { - case 'e': - if (strcmp(op, "f32.le") == 0) return makeBinary(s, BinaryOp::LeFloat32); - goto parse_error; - case 'o': - if (strcmp(op, "f32.load") == 0) return makeLoad(s, f32, /*isAtomic=*/false); + case 'g': { + switch (op[5]) { + case 'e': + if (strcmp(op, "f32.ge") == 0) return makeBinary(s, BinaryOp::GeFloat32); + goto parse_error; + case 't': + if (strcmp(op, "f32.gt") == 0) return makeBinary(s, BinaryOp::GtFloat32); + goto parse_error; + default: goto parse_error; + } + } + case 'l': { + switch (op[5]) { + case 'e': + if (strcmp(op, "f32.le") == 0) return makeBinary(s, BinaryOp::LeFloat32); + goto parse_error; + case 'o': + if (strcmp(op, "f32.load") == 0) return makeLoad(s, f32, /*isAtomic=*/false); + goto parse_error; + case 't': + if (strcmp(op, "f32.lt") == 0) return makeBinary(s, BinaryOp::LtFloat32); + goto parse_error; + default: goto parse_error; + } + } + case 'm': { + switch (op[5]) { + case 'a': + if (strcmp(op, "f32.max") == 0) return makeBinary(s, BinaryOp::MaxFloat32); + goto parse_error; + case 'i': + if (strcmp(op, "f32.min") == 0) return makeBinary(s, BinaryOp::MinFloat32); + goto parse_error; + case 'u': + if (strcmp(op, "f32.mul") == 0) return makeBinary(s, BinaryOp::MulFloat32); + goto parse_error; + default: goto parse_error; + } + } + case 'n': { + switch (op[6]) { + case '\0': + if (strcmp(op, "f32.ne") == 0) return makeBinary(s, 
BinaryOp::NeFloat32); + goto parse_error; + case 'a': + if (strcmp(op, "f32.nearest") == 0) return makeUnary(s, UnaryOp::NearestFloat32); + goto parse_error; + case 'g': + if (strcmp(op, "f32.neg") == 0) return makeUnary(s, UnaryOp::NegFloat32); + goto parse_error; + default: goto parse_error; + } + } + case 'r': + if (strcmp(op, "f32.reinterpret_i32") == 0) return makeUnary(s, UnaryOp::ReinterpretInt32); goto parse_error; + case 's': { + switch (op[5]) { + case 'q': + if (strcmp(op, "f32.sqrt") == 0) return makeUnary(s, UnaryOp::SqrtFloat32); + goto parse_error; + case 't': + if (strcmp(op, "f32.store") == 0) return makeStore(s, f32, /*isAtomic=*/false); + goto parse_error; + case 'u': + if (strcmp(op, "f32.sub") == 0) return makeBinary(s, BinaryOp::SubFloat32); + goto parse_error; + default: goto parse_error; + } + } case 't': - if (strcmp(op, "f32.lt") == 0) return makeBinary(s, BinaryOp::LtFloat32); + if (strcmp(op, "f32.trunc") == 0) return makeUnary(s, UnaryOp::TruncFloat32); goto parse_error; default: goto parse_error; } } - case 'm': { - switch (op[5]) { - case 'a': - if (strcmp(op, "f32.max") == 0) return makeBinary(s, BinaryOp::MaxFloat32); - goto parse_error; - case 'i': - if (strcmp(op, "f32.min") == 0) return makeBinary(s, BinaryOp::MinFloat32); - goto parse_error; - case 'u': - if (strcmp(op, "f32.mul") == 0) return makeBinary(s, BinaryOp::MulFloat32); - goto parse_error; - default: goto parse_error; - } - } - case 'n': { + case 'x': { switch (op[6]) { - case '\0': - if (strcmp(op, "f32.ne") == 0) return makeBinary(s, BinaryOp::NeFloat32); - goto parse_error; - case 'a': - if (strcmp(op, "f32.nearest") == 0) return makeUnary(s, UnaryOp::NearestFloat32); - goto parse_error; - case 'g': - if (strcmp(op, "f32.neg") == 0) return makeUnary(s, UnaryOp::NegFloat32); - goto parse_error; - default: goto parse_error; - } - } - case 'r': - if (strcmp(op, "f32.reinterpret/i32") == 0) return makeUnary(s, UnaryOp::ReinterpretInt32); - goto parse_error; - case 's': 
{ - switch (op[5]) { - case 'q': - if (strcmp(op, "f32.sqrt") == 0) return makeUnary(s, UnaryOp::SqrtFloat32); - goto parse_error; - case 't': - if (strcmp(op, "f32.store") == 0) return makeStore(s, f32, /*isAtomic=*/false); + case 'a': { + switch (op[7]) { + case 'b': + if (strcmp(op, "f32x4.abs") == 0) return makeUnary(s, UnaryOp::AbsVecF32x4); + goto parse_error; + case 'd': + if (strcmp(op, "f32x4.add") == 0) return makeBinary(s, BinaryOp::AddVecF32x4); + goto parse_error; + default: goto parse_error; + } + } + case 'c': { + switch (op[20]) { + case 's': + if (strcmp(op, "f32x4.convert_i32x4_s") == 0) return makeUnary(s, UnaryOp::ConvertSVecI32x4ToVecF32x4); + goto parse_error; + case 'u': + if (strcmp(op, "f32x4.convert_i32x4_u") == 0) return makeUnary(s, UnaryOp::ConvertUVecI32x4ToVecF32x4); + goto parse_error; + default: goto parse_error; + } + } + case 'd': + if (strcmp(op, "f32x4.div") == 0) return makeBinary(s, BinaryOp::DivVecF32x4); goto parse_error; - case 'u': - if (strcmp(op, "f32.sub") == 0) return makeBinary(s, BinaryOp::SubFloat32); + case 'e': { + switch (op[7]) { + case 'q': + if (strcmp(op, "f32x4.eq") == 0) return makeBinary(s, BinaryOp::EqVecF32x4); + goto parse_error; + case 'x': + if (strcmp(op, "f32x4.extract_lane") == 0) return makeSIMDExtract(s, SIMDExtractOp::ExtractLaneVecF32x4, 4); + goto parse_error; + default: goto parse_error; + } + } + case 'g': { + switch (op[7]) { + case 'e': + if (strcmp(op, "f32x4.ge") == 0) return makeBinary(s, BinaryOp::GeVecF32x4); + goto parse_error; + case 't': + if (strcmp(op, "f32x4.gt") == 0) return makeBinary(s, BinaryOp::GtVecF32x4); + goto parse_error; + default: goto parse_error; + } + } + case 'l': { + switch (op[7]) { + case 'e': + if (strcmp(op, "f32x4.le") == 0) return makeBinary(s, BinaryOp::LeVecF32x4); + goto parse_error; + case 't': + if (strcmp(op, "f32x4.lt") == 0) return makeBinary(s, BinaryOp::LtVecF32x4); + goto parse_error; + default: goto parse_error; + } + } + case 'm': { + switch 
(op[7]) { + case 'a': + if (strcmp(op, "f32x4.max") == 0) return makeBinary(s, BinaryOp::MaxVecF32x4); + goto parse_error; + case 'i': + if (strcmp(op, "f32x4.min") == 0) return makeBinary(s, BinaryOp::MinVecF32x4); + goto parse_error; + case 'u': + if (strcmp(op, "f32x4.mul") == 0) return makeBinary(s, BinaryOp::MulVecF32x4); + goto parse_error; + default: goto parse_error; + } + } + case 'n': { + switch (op[8]) { + case '\0': + if (strcmp(op, "f32x4.ne") == 0) return makeBinary(s, BinaryOp::NeVecF32x4); + goto parse_error; + case 'g': + if (strcmp(op, "f32x4.neg") == 0) return makeUnary(s, UnaryOp::NegVecF32x4); + goto parse_error; + default: goto parse_error; + } + } + case 'r': + if (strcmp(op, "f32x4.replace_lane") == 0) return makeSIMDReplace(s, SIMDReplaceOp::ReplaceLaneVecF32x4, 4); goto parse_error; + case 's': { + switch (op[7]) { + case 'p': + if (strcmp(op, "f32x4.splat") == 0) return makeUnary(s, UnaryOp::SplatVecF32x4); + goto parse_error; + case 'q': + if (strcmp(op, "f32x4.sqrt") == 0) return makeUnary(s, UnaryOp::SqrtVecF32x4); + goto parse_error; + case 'u': + if (strcmp(op, "f32x4.sub") == 0) return makeBinary(s, BinaryOp::SubVecF32x4); + goto parse_error; + default: goto parse_error; + } + } default: goto parse_error; } } - case 't': - if (strcmp(op, "f32.trunc") == 0) return makeUnary(s, UnaryOp::TruncFloat32); - goto parse_error; default: goto parse_error; } } case '6': { - switch (op[4]) { - case 'a': { - switch (op[5]) { - case 'b': - if (strcmp(op, "f64.abs") == 0) return makeUnary(s, UnaryOp::AbsFloat64); - goto parse_error; - case 'd': - if (strcmp(op, "f64.add") == 0) return makeBinary(s, BinaryOp::AddFloat64); - goto parse_error; - default: goto parse_error; - } - } - case 'c': { - switch (op[5]) { - case 'e': - if (strcmp(op, "f64.ceil") == 0) return makeUnary(s, UnaryOp::CeilFloat64); - goto parse_error; - case 'o': { - switch (op[6]) { - case 'n': { - switch (op[7]) { - case 's': - if (strcmp(op, "f64.const") == 0) return 
makeConst(s, f64); - goto parse_error; - case 'v': { - switch (op[12]) { - case 's': { - switch (op[15]) { - case '3': - if (strcmp(op, "f64.convert_s/i32") == 0) return makeUnary(s, UnaryOp::ConvertSInt32ToFloat64); - goto parse_error; - case '6': - if (strcmp(op, "f64.convert_s/i64") == 0) return makeUnary(s, UnaryOp::ConvertSInt64ToFloat64); - goto parse_error; - default: goto parse_error; - } - } - case 'u': { - switch (op[15]) { - case '3': - if (strcmp(op, "f64.convert_u/i32") == 0) return makeUnary(s, UnaryOp::ConvertUInt32ToFloat64); - goto parse_error; - case '6': - if (strcmp(op, "f64.convert_u/i64") == 0) return makeUnary(s, UnaryOp::ConvertUInt64ToFloat64); - goto parse_error; + switch (op[3]) { + case '.': { + switch (op[4]) { + case 'a': { + switch (op[5]) { + case 'b': + if (strcmp(op, "f64.abs") == 0) return makeUnary(s, UnaryOp::AbsFloat64); + goto parse_error; + case 'd': + if (strcmp(op, "f64.add") == 0) return makeBinary(s, BinaryOp::AddFloat64); + goto parse_error; + default: goto parse_error; + } + } + case 'c': { + switch (op[5]) { + case 'e': + if (strcmp(op, "f64.ceil") == 0) return makeUnary(s, UnaryOp::CeilFloat64); + goto parse_error; + case 'o': { + switch (op[6]) { + case 'n': { + switch (op[7]) { + case 's': + if (strcmp(op, "f64.const") == 0) return makeConst(s, f64); + goto parse_error; + case 'v': { + switch (op[13]) { + case '3': { + switch (op[16]) { + case 's': + if (strcmp(op, "f64.convert_i32_s") == 0) return makeUnary(s, UnaryOp::ConvertSInt32ToFloat64); + goto parse_error; + case 'u': + if (strcmp(op, "f64.convert_i32_u") == 0) return makeUnary(s, UnaryOp::ConvertUInt32ToFloat64); + goto parse_error; + default: goto parse_error; + } + } + case '6': { + switch (op[16]) { + case 's': + if (strcmp(op, "f64.convert_i64_s") == 0) return makeUnary(s, UnaryOp::ConvertSInt64ToFloat64); + goto parse_error; + case 'u': + if (strcmp(op, "f64.convert_i64_u") == 0) return makeUnary(s, UnaryOp::ConvertUInt64ToFloat64); + goto parse_error; 
+ default: goto parse_error; + } + } default: goto parse_error; } } default: goto parse_error; } } + case 'p': + if (strcmp(op, "f64.copysign") == 0) return makeBinary(s, BinaryOp::CopySignFloat64); + goto parse_error; default: goto parse_error; } } - case 'p': - if (strcmp(op, "f64.copysign") == 0) return makeBinary(s, BinaryOp::CopySignFloat64); - goto parse_error; default: goto parse_error; } } - default: goto parse_error; - } - } - case 'd': - if (strcmp(op, "f64.div") == 0) return makeBinary(s, BinaryOp::DivFloat64); - goto parse_error; - case 'e': - if (strcmp(op, "f64.eq") == 0) return makeBinary(s, BinaryOp::EqFloat64); - goto parse_error; - case 'f': - if (strcmp(op, "f64.floor") == 0) return makeUnary(s, UnaryOp::FloorFloat64); - goto parse_error; - case 'g': { - switch (op[5]) { - case 'e': - if (strcmp(op, "f64.ge") == 0) return makeBinary(s, BinaryOp::GeFloat64); - goto parse_error; - case 't': - if (strcmp(op, "f64.gt") == 0) return makeBinary(s, BinaryOp::GtFloat64); + case 'd': + if (strcmp(op, "f64.div") == 0) return makeBinary(s, BinaryOp::DivFloat64); goto parse_error; - default: goto parse_error; - } - } - case 'l': { - switch (op[5]) { case 'e': - if (strcmp(op, "f64.le") == 0) return makeBinary(s, BinaryOp::LeFloat64); + if (strcmp(op, "f64.eq") == 0) return makeBinary(s, BinaryOp::EqFloat64); goto parse_error; - case 'o': - if (strcmp(op, "f64.load") == 0) return makeLoad(s, f64, /*isAtomic=*/false); + case 'f': + if (strcmp(op, "f64.floor") == 0) return makeUnary(s, UnaryOp::FloorFloat64); goto parse_error; - case 't': - if (strcmp(op, "f64.lt") == 0) return makeBinary(s, BinaryOp::LtFloat64); - goto parse_error; - default: goto parse_error; - } - } - case 'm': { - switch (op[5]) { - case 'a': - if (strcmp(op, "f64.max") == 0) return makeBinary(s, BinaryOp::MaxFloat64); + case 'g': { + switch (op[5]) { + case 'e': + if (strcmp(op, "f64.ge") == 0) return makeBinary(s, BinaryOp::GeFloat64); + goto parse_error; + case 't': + if (strcmp(op, 
"f64.gt") == 0) return makeBinary(s, BinaryOp::GtFloat64); + goto parse_error; + default: goto parse_error; + } + } + case 'l': { + switch (op[5]) { + case 'e': + if (strcmp(op, "f64.le") == 0) return makeBinary(s, BinaryOp::LeFloat64); + goto parse_error; + case 'o': + if (strcmp(op, "f64.load") == 0) return makeLoad(s, f64, /*isAtomic=*/false); + goto parse_error; + case 't': + if (strcmp(op, "f64.lt") == 0) return makeBinary(s, BinaryOp::LtFloat64); + goto parse_error; + default: goto parse_error; + } + } + case 'm': { + switch (op[5]) { + case 'a': + if (strcmp(op, "f64.max") == 0) return makeBinary(s, BinaryOp::MaxFloat64); + goto parse_error; + case 'i': + if (strcmp(op, "f64.min") == 0) return makeBinary(s, BinaryOp::MinFloat64); + goto parse_error; + case 'u': + if (strcmp(op, "f64.mul") == 0) return makeBinary(s, BinaryOp::MulFloat64); + goto parse_error; + default: goto parse_error; + } + } + case 'n': { + switch (op[6]) { + case '\0': + if (strcmp(op, "f64.ne") == 0) return makeBinary(s, BinaryOp::NeFloat64); + goto parse_error; + case 'a': + if (strcmp(op, "f64.nearest") == 0) return makeUnary(s, UnaryOp::NearestFloat64); + goto parse_error; + case 'g': + if (strcmp(op, "f64.neg") == 0) return makeUnary(s, UnaryOp::NegFloat64); + goto parse_error; + default: goto parse_error; + } + } + case 'p': + if (strcmp(op, "f64.promote_f32") == 0) return makeUnary(s, UnaryOp::PromoteFloat32); goto parse_error; - case 'i': - if (strcmp(op, "f64.min") == 0) return makeBinary(s, BinaryOp::MinFloat64); + case 'r': + if (strcmp(op, "f64.reinterpret_i64") == 0) return makeUnary(s, UnaryOp::ReinterpretInt64); goto parse_error; - case 'u': - if (strcmp(op, "f64.mul") == 0) return makeBinary(s, BinaryOp::MulFloat64); + case 's': { + switch (op[5]) { + case 'q': + if (strcmp(op, "f64.sqrt") == 0) return makeUnary(s, UnaryOp::SqrtFloat64); + goto parse_error; + case 't': + if (strcmp(op, "f64.store") == 0) return makeStore(s, f64, /*isAtomic=*/false); + goto parse_error; + 
case 'u': + if (strcmp(op, "f64.sub") == 0) return makeBinary(s, BinaryOp::SubFloat64); + goto parse_error; + default: goto parse_error; + } + } + case 't': + if (strcmp(op, "f64.trunc") == 0) return makeUnary(s, UnaryOp::TruncFloat64); goto parse_error; default: goto parse_error; } } - case 'n': { + case 'x': { switch (op[6]) { - case '\0': - if (strcmp(op, "f64.ne") == 0) return makeBinary(s, BinaryOp::NeFloat64); - goto parse_error; - case 'a': - if (strcmp(op, "f64.nearest") == 0) return makeUnary(s, UnaryOp::NearestFloat64); - goto parse_error; - case 'g': - if (strcmp(op, "f64.neg") == 0) return makeUnary(s, UnaryOp::NegFloat64); - goto parse_error; - default: goto parse_error; - } - } - case 'p': - if (strcmp(op, "f64.promote/f32") == 0) return makeUnary(s, UnaryOp::PromoteFloat32); - goto parse_error; - case 'r': - if (strcmp(op, "f64.reinterpret/i64") == 0) return makeUnary(s, UnaryOp::ReinterpretInt64); - goto parse_error; - case 's': { - switch (op[5]) { - case 'q': - if (strcmp(op, "f64.sqrt") == 0) return makeUnary(s, UnaryOp::SqrtFloat64); - goto parse_error; - case 't': - if (strcmp(op, "f64.store") == 0) return makeStore(s, f64, /*isAtomic=*/false); + case 'a': { + switch (op[7]) { + case 'b': + if (strcmp(op, "f64x2.abs") == 0) return makeUnary(s, UnaryOp::AbsVecF64x2); + goto parse_error; + case 'd': + if (strcmp(op, "f64x2.add") == 0) return makeBinary(s, BinaryOp::AddVecF64x2); + goto parse_error; + default: goto parse_error; + } + } + case 'c': { + switch (op[20]) { + case 's': + if (strcmp(op, "f64x2.convert_i64x2_s") == 0) return makeUnary(s, UnaryOp::ConvertSVecI64x2ToVecF64x2); + goto parse_error; + case 'u': + if (strcmp(op, "f64x2.convert_i64x2_u") == 0) return makeUnary(s, UnaryOp::ConvertUVecI64x2ToVecF64x2); + goto parse_error; + default: goto parse_error; + } + } + case 'd': + if (strcmp(op, "f64x2.div") == 0) return makeBinary(s, BinaryOp::DivVecF64x2); goto parse_error; - case 'u': - if (strcmp(op, "f64.sub") == 0) return 
makeBinary(s, BinaryOp::SubFloat64); + case 'e': { + switch (op[7]) { + case 'q': + if (strcmp(op, "f64x2.eq") == 0) return makeBinary(s, BinaryOp::EqVecF64x2); + goto parse_error; + case 'x': + if (strcmp(op, "f64x2.extract_lane") == 0) return makeSIMDExtract(s, SIMDExtractOp::ExtractLaneVecF64x2, 2); + goto parse_error; + default: goto parse_error; + } + } + case 'g': { + switch (op[7]) { + case 'e': + if (strcmp(op, "f64x2.ge") == 0) return makeBinary(s, BinaryOp::GeVecF64x2); + goto parse_error; + case 't': + if (strcmp(op, "f64x2.gt") == 0) return makeBinary(s, BinaryOp::GtVecF64x2); + goto parse_error; + default: goto parse_error; + } + } + case 'l': { + switch (op[7]) { + case 'e': + if (strcmp(op, "f64x2.le") == 0) return makeBinary(s, BinaryOp::LeVecF64x2); + goto parse_error; + case 't': + if (strcmp(op, "f64x2.lt") == 0) return makeBinary(s, BinaryOp::LtVecF64x2); + goto parse_error; + default: goto parse_error; + } + } + case 'm': { + switch (op[7]) { + case 'a': + if (strcmp(op, "f64x2.max") == 0) return makeBinary(s, BinaryOp::MaxVecF64x2); + goto parse_error; + case 'i': + if (strcmp(op, "f64x2.min") == 0) return makeBinary(s, BinaryOp::MinVecF64x2); + goto parse_error; + case 'u': + if (strcmp(op, "f64x2.mul") == 0) return makeBinary(s, BinaryOp::MulVecF64x2); + goto parse_error; + default: goto parse_error; + } + } + case 'n': { + switch (op[8]) { + case '\0': + if (strcmp(op, "f64x2.ne") == 0) return makeBinary(s, BinaryOp::NeVecF64x2); + goto parse_error; + case 'g': + if (strcmp(op, "f64x2.neg") == 0) return makeUnary(s, UnaryOp::NegVecF64x2); + goto parse_error; + default: goto parse_error; + } + } + case 'r': + if (strcmp(op, "f64x2.replace_lane") == 0) return makeSIMDReplace(s, SIMDReplaceOp::ReplaceLaneVecF64x2, 2); goto parse_error; + case 's': { + switch (op[7]) { + case 'p': + if (strcmp(op, "f64x2.splat") == 0) return makeUnary(s, UnaryOp::SplatVecF64x2); + goto parse_error; + case 'q': + if (strcmp(op, "f64x2.sqrt") == 0) return 
makeUnary(s, UnaryOp::SqrtVecF64x2); + goto parse_error; + case 'u': + if (strcmp(op, "f64x2.sub") == 0) return makeBinary(s, BinaryOp::SubVecF64x2); + goto parse_error; + default: goto parse_error; + } + } default: goto parse_error; } } - case 't': - if (strcmp(op, "f64.trunc") == 0) return makeUnary(s, UnaryOp::TruncFloat64); - goto parse_error; default: goto parse_error; } } @@ -373,13 +593,13 @@ switch (op[0]) { } case 'g': { switch (op[1]) { - case 'e': { - switch (op[4]) { + case 'l': { + switch (op[7]) { case 'g': - if (strcmp(op, "get_global") == 0) return makeGetGlobal(s); + if (strcmp(op, "global.get") == 0) return makeGetGlobal(s); goto parse_error; - case 'l': - if (strcmp(op, "get_local") == 0) return makeGetLocal(s); + case 's': + if (strcmp(op, "global.set") == 0) return makeSetGlobal(s); goto parse_error; default: goto parse_error; } @@ -392,280 +612,410 @@ switch (op[0]) { } case 'i': { switch (op[1]) { - case '3': { - switch (op[4]) { + case '1': { + switch (op[6]) { case 'a': { - switch (op[5]) { - case 'd': - if (strcmp(op, "i32.add") == 0) return makeBinary(s, BinaryOp::AddInt32); + switch (op[7]) { + case 'd': { + switch (op[9]) { + case '\0': + if (strcmp(op, "i16x8.add") == 0) return makeBinary(s, BinaryOp::AddVecI16x8); + goto parse_error; + case '_': { + switch (op[19]) { + case 's': + if (strcmp(op, "i16x8.add_saturate_s") == 0) return makeBinary(s, BinaryOp::AddSatSVecI16x8); + goto parse_error; + case 'u': + if (strcmp(op, "i16x8.add_saturate_u") == 0) return makeBinary(s, BinaryOp::AddSatUVecI16x8); + goto parse_error; + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case 'l': + if (strcmp(op, "i16x8.all_true") == 0) return makeUnary(s, UnaryOp::AllTrueVecI16x8); goto parse_error; case 'n': - if (strcmp(op, "i32.and") == 0) return makeBinary(s, BinaryOp::AndInt32); + if (strcmp(op, "i16x8.any_true") == 0) return makeUnary(s, UnaryOp::AnyTrueVecI16x8); + goto parse_error; + default: goto parse_error; + } + } + 
case 'e': { + switch (op[7]) { + case 'q': + if (strcmp(op, "i16x8.eq") == 0) return makeBinary(s, BinaryOp::EqVecI16x8); goto parse_error; + case 'x': { + switch (op[19]) { + case 's': + if (strcmp(op, "i16x8.extract_lane_s") == 0) return makeSIMDExtract(s, SIMDExtractOp::ExtractLaneSVecI16x8, 8); + goto parse_error; + case 'u': + if (strcmp(op, "i16x8.extract_lane_u") == 0) return makeSIMDExtract(s, SIMDExtractOp::ExtractLaneUVecI16x8, 8); + goto parse_error; + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case 'g': { + switch (op[7]) { + case 'e': { + switch (op[9]) { + case 's': + if (strcmp(op, "i16x8.ge_s") == 0) return makeBinary(s, BinaryOp::GeSVecI16x8); + goto parse_error; + case 'u': + if (strcmp(op, "i16x8.ge_u") == 0) return makeBinary(s, BinaryOp::GeUVecI16x8); + goto parse_error; + default: goto parse_error; + } + } case 't': { - switch (op[11]) { - case 'l': { - switch (op[15]) { - case '\0': - if (strcmp(op, "i32.atomic.load") == 0) return makeLoad(s, i32, /*isAtomic=*/true); + switch (op[9]) { + case 's': + if (strcmp(op, "i16x8.gt_s") == 0) return makeBinary(s, BinaryOp::GtSVecI16x8); + goto parse_error; + case 'u': + if (strcmp(op, "i16x8.gt_u") == 0) return makeBinary(s, BinaryOp::GtUVecI16x8); + goto parse_error; + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case 'l': { + switch (op[7]) { + case 'e': { + switch (op[9]) { + case 's': + if (strcmp(op, "i16x8.le_s") == 0) return makeBinary(s, BinaryOp::LeSVecI16x8); + goto parse_error; + case 'u': + if (strcmp(op, "i16x8.le_u") == 0) return makeBinary(s, BinaryOp::LeUVecI16x8); + goto parse_error; + default: goto parse_error; + } + } + case 't': { + switch (op[9]) { + case 's': + if (strcmp(op, "i16x8.lt_s") == 0) return makeBinary(s, BinaryOp::LtSVecI16x8); + goto parse_error; + case 'u': + if (strcmp(op, "i16x8.lt_u") == 0) return makeBinary(s, BinaryOp::LtUVecI16x8); + goto parse_error; + default: goto parse_error; + } + } + default: 
goto parse_error; + } + } + case 'm': + if (strcmp(op, "i16x8.mul") == 0) return makeBinary(s, BinaryOp::MulVecI16x8); + goto parse_error; + case 'n': { + switch (op[8]) { + case '\0': + if (strcmp(op, "i16x8.ne") == 0) return makeBinary(s, BinaryOp::NeVecI16x8); + goto parse_error; + case 'g': + if (strcmp(op, "i16x8.neg") == 0) return makeUnary(s, UnaryOp::NegVecI16x8); + goto parse_error; + default: goto parse_error; + } + } + case 'r': + if (strcmp(op, "i16x8.replace_lane") == 0) return makeSIMDReplace(s, SIMDReplaceOp::ReplaceLaneVecI16x8, 8); + goto parse_error; + case 's': { + switch (op[7]) { + case 'h': { + switch (op[8]) { + case 'l': + if (strcmp(op, "i16x8.shl") == 0) return makeSIMDShift(s, SIMDShiftOp::ShlVecI16x8); + goto parse_error; + case 'r': { + switch (op[10]) { + case 's': + if (strcmp(op, "i16x8.shr_s") == 0) return makeSIMDShift(s, SIMDShiftOp::ShrSVecI16x8); goto parse_error; - case '1': - if (strcmp(op, "i32.atomic.load16_u") == 0) return makeLoad(s, i32, /*isAtomic=*/true); + case 'u': + if (strcmp(op, "i16x8.shr_u") == 0) return makeSIMDShift(s, SIMDShiftOp::ShrUVecI16x8); goto parse_error; - case '8': - if (strcmp(op, "i32.atomic.load8_u") == 0) return makeLoad(s, i32, /*isAtomic=*/true); + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case 'p': + if (strcmp(op, "i16x8.splat") == 0) return makeUnary(s, UnaryOp::SplatVecI16x8); + goto parse_error; + case 'u': { + switch (op[9]) { + case '\0': + if (strcmp(op, "i16x8.sub") == 0) return makeBinary(s, BinaryOp::SubVecI16x8); + goto parse_error; + case '_': { + switch (op[19]) { + case 's': + if (strcmp(op, "i16x8.sub_saturate_s") == 0) return makeBinary(s, BinaryOp::SubSatSVecI16x8); + goto parse_error; + case 'u': + if (strcmp(op, "i16x8.sub_saturate_u") == 0) return makeBinary(s, BinaryOp::SubSatUVecI16x8); goto parse_error; default: goto parse_error; } } - case 'r': { - switch (op[14]) { - case '.': { + default: goto parse_error; + } + } + default: goto 
parse_error; + } + } + default: goto parse_error; + } + } + case '3': { + switch (op[3]) { + case '.': { + switch (op[4]) { + case 'a': { + switch (op[5]) { + case 'd': + if (strcmp(op, "i32.add") == 0) return makeBinary(s, BinaryOp::AddInt32); + goto parse_error; + case 'n': + if (strcmp(op, "i32.and") == 0) return makeBinary(s, BinaryOp::AndInt32); + goto parse_error; + case 't': { + switch (op[11]) { + case 'l': { switch (op[15]) { - case 'a': { - switch (op[16]) { - case 'd': - if (strcmp(op, "i32.atomic.rmw.add") == 0) return makeAtomicRMWOrCmpxchg(s, i32); - goto parse_error; - case 'n': - if (strcmp(op, "i32.atomic.rmw.and") == 0) return makeAtomicRMWOrCmpxchg(s, i32); - goto parse_error; - default: goto parse_error; - } - } - case 'c': - if (strcmp(op, "i32.atomic.rmw.cmpxchg") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + case '\0': + if (strcmp(op, "i32.atomic.load") == 0) return makeLoad(s, i32, /*isAtomic=*/true); goto parse_error; - case 'o': - if (strcmp(op, "i32.atomic.rmw.or") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + case '1': + if (strcmp(op, "i32.atomic.load16_u") == 0) return makeLoad(s, i32, /*isAtomic=*/true); goto parse_error; - case 's': - if (strcmp(op, "i32.atomic.rmw.sub") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + case '8': + if (strcmp(op, "i32.atomic.load8_u") == 0) return makeLoad(s, i32, /*isAtomic=*/true); goto parse_error; - case 'x': { - switch (op[16]) { + default: goto parse_error; + } + } + case 'r': { + switch (op[14]) { + case '.': { + switch (op[15]) { + case 'a': { + switch (op[16]) { + case 'd': + if (strcmp(op, "i32.atomic.rmw.add") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + goto parse_error; + case 'n': + if (strcmp(op, "i32.atomic.rmw.and") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + goto parse_error; + default: goto parse_error; + } + } case 'c': - if (strcmp(op, "i32.atomic.rmw.xchg") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + if (strcmp(op, "i32.atomic.rmw.cmpxchg") == 0) return 
makeAtomicRMWOrCmpxchg(s, i32); goto parse_error; case 'o': - if (strcmp(op, "i32.atomic.rmw.xor") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + if (strcmp(op, "i32.atomic.rmw.or") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + goto parse_error; + case 's': + if (strcmp(op, "i32.atomic.rmw.sub") == 0) return makeAtomicRMWOrCmpxchg(s, i32); goto parse_error; + case 'x': { + switch (op[16]) { + case 'c': + if (strcmp(op, "i32.atomic.rmw.xchg") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + goto parse_error; + case 'o': + if (strcmp(op, "i32.atomic.rmw.xor") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + goto parse_error; + default: goto parse_error; + } + } default: goto parse_error; } } - default: goto parse_error; - } - } - case '1': { - switch (op[19]) { - case 'a': { - switch (op[20]) { - case 'd': - if (strcmp(op, "i32.atomic.rmw16_u.add") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + case '1': { + switch (op[17]) { + case 'a': { + switch (op[18]) { + case 'd': + if (strcmp(op, "i32.atomic.rmw16.add_u") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + goto parse_error; + case 'n': + if (strcmp(op, "i32.atomic.rmw16.and_u") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + goto parse_error; + default: goto parse_error; + } + } + case 'c': + if (strcmp(op, "i32.atomic.rmw16.cmpxchg_u") == 0) return makeAtomicRMWOrCmpxchg(s, i32); goto parse_error; - case 'n': - if (strcmp(op, "i32.atomic.rmw16_u.and") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + case 'o': + if (strcmp(op, "i32.atomic.rmw16.or_u") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + goto parse_error; + case 's': + if (strcmp(op, "i32.atomic.rmw16.sub_u") == 0) return makeAtomicRMWOrCmpxchg(s, i32); goto parse_error; + case 'x': { + switch (op[18]) { + case 'c': + if (strcmp(op, "i32.atomic.rmw16.xchg_u") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + goto parse_error; + case 'o': + if (strcmp(op, "i32.atomic.rmw16.xor_u") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + goto parse_error; + default: goto 
parse_error; + } + } default: goto parse_error; } } - case 'c': - if (strcmp(op, "i32.atomic.rmw16_u.cmpxchg") == 0) return makeAtomicRMWOrCmpxchg(s, i32); - goto parse_error; - case 'o': - if (strcmp(op, "i32.atomic.rmw16_u.or") == 0) return makeAtomicRMWOrCmpxchg(s, i32); - goto parse_error; - case 's': - if (strcmp(op, "i32.atomic.rmw16_u.sub") == 0) return makeAtomicRMWOrCmpxchg(s, i32); - goto parse_error; - case 'x': { - switch (op[20]) { + case '8': { + switch (op[16]) { + case 'a': { + switch (op[17]) { + case 'd': + if (strcmp(op, "i32.atomic.rmw8.add_u") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + goto parse_error; + case 'n': + if (strcmp(op, "i32.atomic.rmw8.and_u") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + goto parse_error; + default: goto parse_error; + } + } case 'c': - if (strcmp(op, "i32.atomic.rmw16_u.xchg") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + if (strcmp(op, "i32.atomic.rmw8.cmpxchg_u") == 0) return makeAtomicRMWOrCmpxchg(s, i32); goto parse_error; case 'o': - if (strcmp(op, "i32.atomic.rmw16_u.xor") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + if (strcmp(op, "i32.atomic.rmw8.or_u") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + goto parse_error; + case 's': + if (strcmp(op, "i32.atomic.rmw8.sub_u") == 0) return makeAtomicRMWOrCmpxchg(s, i32); goto parse_error; + case 'x': { + switch (op[17]) { + case 'c': + if (strcmp(op, "i32.atomic.rmw8.xchg_u") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + goto parse_error; + case 'o': + if (strcmp(op, "i32.atomic.rmw8.xor_u") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + goto parse_error; + default: goto parse_error; + } + } default: goto parse_error; } } default: goto parse_error; } } - case '8': { - switch (op[18]) { - case 'a': { - switch (op[19]) { - case 'd': - if (strcmp(op, "i32.atomic.rmw8_u.add") == 0) return makeAtomicRMWOrCmpxchg(s, i32); - goto parse_error; - case 'n': - if (strcmp(op, "i32.atomic.rmw8_u.and") == 0) return makeAtomicRMWOrCmpxchg(s, i32); - goto parse_error; 
- default: goto parse_error; - } - } - case 'c': - if (strcmp(op, "i32.atomic.rmw8_u.cmpxchg") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + case 's': { + switch (op[16]) { + case '\0': + if (strcmp(op, "i32.atomic.store") == 0) return makeStore(s, i32, /*isAtomic=*/true); goto parse_error; - case 'o': - if (strcmp(op, "i32.atomic.rmw8_u.or") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + case '1': + if (strcmp(op, "i32.atomic.store16") == 0) return makeStore(s, i32, /*isAtomic=*/true); goto parse_error; - case 's': - if (strcmp(op, "i32.atomic.rmw8_u.sub") == 0) return makeAtomicRMWOrCmpxchg(s, i32); + case '8': + if (strcmp(op, "i32.atomic.store8") == 0) return makeStore(s, i32, /*isAtomic=*/true); goto parse_error; - case 'x': { - switch (op[19]) { - case 'c': - if (strcmp(op, "i32.atomic.rmw8_u.xchg") == 0) return makeAtomicRMWOrCmpxchg(s, i32); - goto parse_error; - case 'o': - if (strcmp(op, "i32.atomic.rmw8_u.xor") == 0) return makeAtomicRMWOrCmpxchg(s, i32); - goto parse_error; - default: goto parse_error; - } - } default: goto parse_error; } } default: goto parse_error; } } - case 's': { - switch (op[16]) { - case '\0': - if (strcmp(op, "i32.atomic.store") == 0) return makeStore(s, i32, /*isAtomic=*/true); - goto parse_error; - case '1': - if (strcmp(op, "i32.atomic.store16") == 0) return makeStore(s, i32, /*isAtomic=*/true); - goto parse_error; - case '8': - if (strcmp(op, "i32.atomic.store8") == 0) return makeStore(s, i32, /*isAtomic=*/true); - goto parse_error; - default: goto parse_error; - } - } default: goto parse_error; } } - default: goto parse_error; - } - } - case 'c': { - switch (op[5]) { - case 'l': - if (strcmp(op, "i32.clz") == 0) return makeUnary(s, UnaryOp::ClzInt32); - goto parse_error; - case 'o': - if (strcmp(op, "i32.const") == 0) return makeConst(s, i32); - goto parse_error; - case 't': - if (strcmp(op, "i32.ctz") == 0) return makeUnary(s, UnaryOp::CtzInt32); - goto parse_error; - default: goto parse_error; - } - } - case 'd': { - 
switch (op[8]) { - case 's': - if (strcmp(op, "i32.div_s") == 0) return makeBinary(s, BinaryOp::DivSInt32); - goto parse_error; - case 'u': - if (strcmp(op, "i32.div_u") == 0) return makeBinary(s, BinaryOp::DivUInt32); - goto parse_error; - default: goto parse_error; - } - } - case 'e': { - switch (op[5]) { - case 'q': { - switch (op[6]) { - case '\0': - if (strcmp(op, "i32.eq") == 0) return makeBinary(s, BinaryOp::EqInt32); - goto parse_error; - case 'z': - if (strcmp(op, "i32.eqz") == 0) return makeUnary(s, UnaryOp::EqZInt32); - goto parse_error; - default: goto parse_error; - } - } - case 'x': { - switch (op[10]) { - case '1': - if (strcmp(op, "i32.extend16_s") == 0) return makeUnary(s, UnaryOp::ExtendS16Int32); - goto parse_error; - case '8': - if (strcmp(op, "i32.extend8_s") == 0) return makeUnary(s, UnaryOp::ExtendS8Int32); + case 'c': { + switch (op[5]) { + case 'l': + if (strcmp(op, "i32.clz") == 0) return makeUnary(s, UnaryOp::ClzInt32); goto parse_error; - default: goto parse_error; - } - } - default: goto parse_error; - } - } - case 'g': { - switch (op[5]) { - case 'e': { - switch (op[7]) { - case 's': - if (strcmp(op, "i32.ge_s") == 0) return makeBinary(s, BinaryOp::GeSInt32); + case 'o': + if (strcmp(op, "i32.const") == 0) return makeConst(s, i32); goto parse_error; - case 'u': - if (strcmp(op, "i32.ge_u") == 0) return makeBinary(s, BinaryOp::GeUInt32); + case 't': + if (strcmp(op, "i32.ctz") == 0) return makeUnary(s, UnaryOp::CtzInt32); goto parse_error; default: goto parse_error; } } - case 't': { - switch (op[7]) { + case 'd': { + switch (op[8]) { case 's': - if (strcmp(op, "i32.gt_s") == 0) return makeBinary(s, BinaryOp::GtSInt32); + if (strcmp(op, "i32.div_s") == 0) return makeBinary(s, BinaryOp::DivSInt32); goto parse_error; case 'u': - if (strcmp(op, "i32.gt_u") == 0) return makeBinary(s, BinaryOp::GtUInt32); + if (strcmp(op, "i32.div_u") == 0) return makeBinary(s, BinaryOp::DivUInt32); goto parse_error; default: goto parse_error; } } - default: 
goto parse_error; - } - } - case 'l': { - switch (op[5]) { case 'e': { - switch (op[7]) { - case 's': - if (strcmp(op, "i32.le_s") == 0) return makeBinary(s, BinaryOp::LeSInt32); - goto parse_error; - case 'u': - if (strcmp(op, "i32.le_u") == 0) return makeBinary(s, BinaryOp::LeUInt32); - goto parse_error; + switch (op[5]) { + case 'q': { + switch (op[6]) { + case '\0': + if (strcmp(op, "i32.eq") == 0) return makeBinary(s, BinaryOp::EqInt32); + goto parse_error; + case 'z': + if (strcmp(op, "i32.eqz") == 0) return makeUnary(s, UnaryOp::EqZInt32); + goto parse_error; + default: goto parse_error; + } + } + case 'x': { + switch (op[10]) { + case '1': + if (strcmp(op, "i32.extend16_s") == 0) return makeUnary(s, UnaryOp::ExtendS16Int32); + goto parse_error; + case '8': + if (strcmp(op, "i32.extend8_s") == 0) return makeUnary(s, UnaryOp::ExtendS8Int32); + goto parse_error; + default: goto parse_error; + } + } default: goto parse_error; } } - case 'o': { - switch (op[8]) { - case '\0': - if (strcmp(op, "i32.load") == 0) return makeLoad(s, i32, /*isAtomic=*/false); - goto parse_error; - case '1': { - switch (op[11]) { + case 'g': { + switch (op[5]) { + case 'e': { + switch (op[7]) { case 's': - if (strcmp(op, "i32.load16_s") == 0) return makeLoad(s, i32, /*isAtomic=*/false); + if (strcmp(op, "i32.ge_s") == 0) return makeBinary(s, BinaryOp::GeSInt32); goto parse_error; case 'u': - if (strcmp(op, "i32.load16_u") == 0) return makeLoad(s, i32, /*isAtomic=*/false); + if (strcmp(op, "i32.ge_u") == 0) return makeBinary(s, BinaryOp::GeUInt32); goto parse_error; default: goto parse_error; } } - case '8': { - switch (op[10]) { + case 't': { + switch (op[7]) { case 's': - if (strcmp(op, "i32.load8_s") == 0) return makeLoad(s, i32, /*isAtomic=*/false); + if (strcmp(op, "i32.gt_s") == 0) return makeBinary(s, BinaryOp::GtSInt32); goto parse_error; case 'u': - if (strcmp(op, "i32.load8_u") == 0) return makeLoad(s, i32, /*isAtomic=*/false); + if (strcmp(op, "i32.gt_u") == 0) return 
makeBinary(s, BinaryOp::GtUInt32); goto parse_error; default: goto parse_error; } @@ -673,46 +1023,56 @@ switch (op[0]) { default: goto parse_error; } } - case 't': { - switch (op[7]) { - case 's': - if (strcmp(op, "i32.lt_s") == 0) return makeBinary(s, BinaryOp::LtSInt32); - goto parse_error; - case 'u': - if (strcmp(op, "i32.lt_u") == 0) return makeBinary(s, BinaryOp::LtUInt32); - goto parse_error; - default: goto parse_error; - } - } - default: goto parse_error; - } - } - case 'm': - if (strcmp(op, "i32.mul") == 0) return makeBinary(s, BinaryOp::MulInt32); - goto parse_error; - case 'n': - if (strcmp(op, "i32.ne") == 0) return makeBinary(s, BinaryOp::NeInt32); - goto parse_error; - case 'o': - if (strcmp(op, "i32.or") == 0) return makeBinary(s, BinaryOp::OrInt32); - goto parse_error; - case 'p': - if (strcmp(op, "i32.popcnt") == 0) return makeUnary(s, UnaryOp::PopcntInt32); - goto parse_error; - case 'r': { - switch (op[5]) { - case 'e': { - switch (op[6]) { - case 'i': - if (strcmp(op, "i32.reinterpret/f32") == 0) return makeUnary(s, UnaryOp::ReinterpretFloat32); - goto parse_error; - case 'm': { + case 'l': { + switch (op[5]) { + case 'e': { + switch (op[7]) { + case 's': + if (strcmp(op, "i32.le_s") == 0) return makeBinary(s, BinaryOp::LeSInt32); + goto parse_error; + case 'u': + if (strcmp(op, "i32.le_u") == 0) return makeBinary(s, BinaryOp::LeUInt32); + goto parse_error; + default: goto parse_error; + } + } + case 'o': { switch (op[8]) { + case '\0': + if (strcmp(op, "i32.load") == 0) return makeLoad(s, i32, /*isAtomic=*/false); + goto parse_error; + case '1': { + switch (op[11]) { + case 's': + if (strcmp(op, "i32.load16_s") == 0) return makeLoad(s, i32, /*isAtomic=*/false); + goto parse_error; + case 'u': + if (strcmp(op, "i32.load16_u") == 0) return makeLoad(s, i32, /*isAtomic=*/false); + goto parse_error; + default: goto parse_error; + } + } + case '8': { + switch (op[10]) { + case 's': + if (strcmp(op, "i32.load8_s") == 0) return makeLoad(s, i32, 
/*isAtomic=*/false); + goto parse_error; + case 'u': + if (strcmp(op, "i32.load8_u") == 0) return makeLoad(s, i32, /*isAtomic=*/false); + goto parse_error; + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case 't': { + switch (op[7]) { case 's': - if (strcmp(op, "i32.rem_s") == 0) return makeBinary(s, BinaryOp::RemSInt32); + if (strcmp(op, "i32.lt_s") == 0) return makeBinary(s, BinaryOp::LtSInt32); goto parse_error; case 'u': - if (strcmp(op, "i32.rem_u") == 0) return makeBinary(s, BinaryOp::RemUInt32); + if (strcmp(op, "i32.lt_u") == 0) return makeBinary(s, BinaryOp::LtUInt32); goto parse_error; default: goto parse_error; } @@ -720,83 +1080,217 @@ switch (op[0]) { default: goto parse_error; } } - case 'o': { - switch (op[7]) { - case 'l': - if (strcmp(op, "i32.rotl") == 0) return makeBinary(s, BinaryOp::RotLInt32); - goto parse_error; - case 'r': - if (strcmp(op, "i32.rotr") == 0) return makeBinary(s, BinaryOp::RotRInt32); - goto parse_error; + case 'm': + if (strcmp(op, "i32.mul") == 0) return makeBinary(s, BinaryOp::MulInt32); + goto parse_error; + case 'n': + if (strcmp(op, "i32.ne") == 0) return makeBinary(s, BinaryOp::NeInt32); + goto parse_error; + case 'o': + if (strcmp(op, "i32.or") == 0) return makeBinary(s, BinaryOp::OrInt32); + goto parse_error; + case 'p': + if (strcmp(op, "i32.popcnt") == 0) return makeUnary(s, UnaryOp::PopcntInt32); + goto parse_error; + case 'r': { + switch (op[5]) { + case 'e': { + switch (op[6]) { + case 'i': + if (strcmp(op, "i32.reinterpret_f32") == 0) return makeUnary(s, UnaryOp::ReinterpretFloat32); + goto parse_error; + case 'm': { + switch (op[8]) { + case 's': + if (strcmp(op, "i32.rem_s") == 0) return makeBinary(s, BinaryOp::RemSInt32); + goto parse_error; + case 'u': + if (strcmp(op, "i32.rem_u") == 0) return makeBinary(s, BinaryOp::RemUInt32); + goto parse_error; + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case 'o': { + switch (op[7]) { + case 'l': + if 
(strcmp(op, "i32.rotl") == 0) return makeBinary(s, BinaryOp::RotLInt32); + goto parse_error; + case 'r': + if (strcmp(op, "i32.rotr") == 0) return makeBinary(s, BinaryOp::RotRInt32); + goto parse_error; + default: goto parse_error; + } + } default: goto parse_error; } } - default: goto parse_error; - } - } - case 's': { - switch (op[5]) { - case 'h': { - switch (op[6]) { - case 'l': - if (strcmp(op, "i32.shl") == 0) return makeBinary(s, BinaryOp::ShlInt32); - goto parse_error; - case 'r': { - switch (op[8]) { - case 's': - if (strcmp(op, "i32.shr_s") == 0) return makeBinary(s, BinaryOp::ShrSInt32); + case 's': { + switch (op[5]) { + case 'h': { + switch (op[6]) { + case 'l': + if (strcmp(op, "i32.shl") == 0) return makeBinary(s, BinaryOp::ShlInt32); goto parse_error; - case 'u': - if (strcmp(op, "i32.shr_u") == 0) return makeBinary(s, BinaryOp::ShrUInt32); + case 'r': { + switch (op[8]) { + case 's': + if (strcmp(op, "i32.shr_s") == 0) return makeBinary(s, BinaryOp::ShrSInt32); + goto parse_error; + case 'u': + if (strcmp(op, "i32.shr_u") == 0) return makeBinary(s, BinaryOp::ShrUInt32); + goto parse_error; + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case 't': { + switch (op[9]) { + case '\0': + if (strcmp(op, "i32.store") == 0) return makeStore(s, i32, /*isAtomic=*/false); + goto parse_error; + case '1': + if (strcmp(op, "i32.store16") == 0) return makeStore(s, i32, /*isAtomic=*/false); + goto parse_error; + case '8': + if (strcmp(op, "i32.store8") == 0) return makeStore(s, i32, /*isAtomic=*/false); goto parse_error; default: goto parse_error; } } + case 'u': + if (strcmp(op, "i32.sub") == 0) return makeBinary(s, BinaryOp::SubInt32); + goto parse_error; default: goto parse_error; } } case 't': { - switch (op[9]) { - case '\0': - if (strcmp(op, "i32.store") == 0) return makeStore(s, i32, /*isAtomic=*/false); - goto parse_error; - case '1': - if (strcmp(op, "i32.store16") == 0) return makeStore(s, i32, /*isAtomic=*/false); + switch 
(op[10]) { + case 'f': { + switch (op[11]) { + case '3': { + switch (op[14]) { + case 's': + if (strcmp(op, "i32.trunc_f32_s") == 0) return makeUnary(s, UnaryOp::TruncSFloat32ToInt32); + goto parse_error; + case 'u': + if (strcmp(op, "i32.trunc_f32_u") == 0) return makeUnary(s, UnaryOp::TruncUFloat32ToInt32); + goto parse_error; + default: goto parse_error; + } + } + case '6': { + switch (op[14]) { + case 's': + if (strcmp(op, "i32.trunc_f64_s") == 0) return makeUnary(s, UnaryOp::TruncSFloat64ToInt32); + goto parse_error; + case 'u': + if (strcmp(op, "i32.trunc_f64_u") == 0) return makeUnary(s, UnaryOp::TruncUFloat64ToInt32); + goto parse_error; + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case 's': { + switch (op[15]) { + case '3': { + switch (op[18]) { + case 's': + if (strcmp(op, "i32.trunc_sat_f32_s") == 0) return makeUnary(s, UnaryOp::TruncSatSFloat32ToInt32); + goto parse_error; + case 'u': + if (strcmp(op, "i32.trunc_sat_f32_u") == 0) return makeUnary(s, UnaryOp::TruncSatUFloat32ToInt32); + goto parse_error; + default: goto parse_error; + } + } + case '6': { + switch (op[18]) { + case 's': + if (strcmp(op, "i32.trunc_sat_f64_s") == 0) return makeUnary(s, UnaryOp::TruncSatSFloat64ToInt32); + goto parse_error; + case 'u': + if (strcmp(op, "i32.trunc_sat_f64_u") == 0) return makeUnary(s, UnaryOp::TruncSatUFloat64ToInt32); + goto parse_error; + default: goto parse_error; + } + } + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case 'w': { + switch (op[5]) { + case 'a': + if (strcmp(op, "i32.wait") == 0) return makeAtomicWait(s, i32); goto parse_error; - case '8': - if (strcmp(op, "i32.store8") == 0) return makeStore(s, i32, /*isAtomic=*/false); + case 'r': + if (strcmp(op, "i32.wrap_i64") == 0) return makeUnary(s, UnaryOp::WrapInt64); goto parse_error; default: goto parse_error; } } - case 'u': - if (strcmp(op, "i32.sub") == 0) return makeBinary(s, BinaryOp::SubInt32); + case 'x': + if (strcmp(op, 
"i32.xor") == 0) return makeBinary(s, BinaryOp::XorInt32); goto parse_error; default: goto parse_error; } } - case 't': { - switch (op[10]) { - case 's': { - switch (op[11]) { - case '/': { - switch (op[13]) { - case '3': - if (strcmp(op, "i32.trunc_s/f32") == 0) return makeUnary(s, UnaryOp::TruncSFloat32ToInt32); + case 'x': { + switch (op[6]) { + case 'a': { + switch (op[7]) { + case 'd': + if (strcmp(op, "i32x4.add") == 0) return makeBinary(s, BinaryOp::AddVecI32x4); + goto parse_error; + case 'l': + if (strcmp(op, "i32x4.all_true") == 0) return makeUnary(s, UnaryOp::AllTrueVecI32x4); + goto parse_error; + case 'n': + if (strcmp(op, "i32x4.any_true") == 0) return makeUnary(s, UnaryOp::AnyTrueVecI32x4); + goto parse_error; + default: goto parse_error; + } + } + case 'e': { + switch (op[7]) { + case 'q': + if (strcmp(op, "i32x4.eq") == 0) return makeBinary(s, BinaryOp::EqVecI32x4); + goto parse_error; + case 'x': + if (strcmp(op, "i32x4.extract_lane") == 0) return makeSIMDExtract(s, SIMDExtractOp::ExtractLaneVecI32x4, 4); + goto parse_error; + default: goto parse_error; + } + } + case 'g': { + switch (op[7]) { + case 'e': { + switch (op[9]) { + case 's': + if (strcmp(op, "i32x4.ge_s") == 0) return makeBinary(s, BinaryOp::GeSVecI32x4); goto parse_error; - case '6': - if (strcmp(op, "i32.trunc_s/f64") == 0) return makeUnary(s, UnaryOp::TruncSFloat64ToInt32); + case 'u': + if (strcmp(op, "i32x4.ge_u") == 0) return makeBinary(s, BinaryOp::GeUVecI32x4); goto parse_error; default: goto parse_error; } } - case ':': { - switch (op[17]) { - case '3': - if (strcmp(op, "i32.trunc_s:sat/f32") == 0) return makeUnary(s, UnaryOp::TruncSatSFloat32ToInt32); + case 't': { + switch (op[9]) { + case 's': + if (strcmp(op, "i32x4.gt_s") == 0) return makeBinary(s, BinaryOp::GtSVecI32x4); goto parse_error; - case '6': - if (strcmp(op, "i32.trunc_s:sat/f64") == 0) return makeUnary(s, UnaryOp::TruncSatSFloat64ToInt32); + case 'u': + if (strcmp(op, "i32x4.gt_u") == 0) return makeBinary(s, 
BinaryOp::GtUVecI32x4); goto parse_error; default: goto parse_error; } @@ -804,26 +1298,26 @@ switch (op[0]) { default: goto parse_error; } } - case 'u': { - switch (op[11]) { - case '/': { - switch (op[13]) { - case '3': - if (strcmp(op, "i32.trunc_u/f32") == 0) return makeUnary(s, UnaryOp::TruncUFloat32ToInt32); + case 'l': { + switch (op[7]) { + case 'e': { + switch (op[9]) { + case 's': + if (strcmp(op, "i32x4.le_s") == 0) return makeBinary(s, BinaryOp::LeSVecI32x4); goto parse_error; - case '6': - if (strcmp(op, "i32.trunc_u/f64") == 0) return makeUnary(s, UnaryOp::TruncUFloat64ToInt32); + case 'u': + if (strcmp(op, "i32x4.le_u") == 0) return makeBinary(s, BinaryOp::LeUVecI32x4); goto parse_error; default: goto parse_error; } } - case ':': { - switch (op[17]) { - case '3': - if (strcmp(op, "i32.trunc_u:sat/f32") == 0) return makeUnary(s, UnaryOp::TruncSatUFloat32ToInt32); + case 't': { + switch (op[9]) { + case 's': + if (strcmp(op, "i32x4.lt_s") == 0) return makeBinary(s, BinaryOp::LtSVecI32x4); goto parse_error; - case '6': - if (strcmp(op, "i32.trunc_u:sat/f64") == 0) return makeUnary(s, UnaryOp::TruncSatUFloat64ToInt32); + case 'u': + if (strcmp(op, "i32x4.lt_u") == 0) return makeBinary(s, BinaryOp::LtUVecI32x4); goto parse_error; default: goto parse_error; } @@ -831,367 +1325,681 @@ switch (op[0]) { default: goto parse_error; } } - default: goto parse_error; - } - } - case 'w': { - switch (op[5]) { - case 'a': - if (strcmp(op, "i32.wait") == 0) return makeAtomicWait(s, i32); + case 'm': + if (strcmp(op, "i32x4.mul") == 0) return makeBinary(s, BinaryOp::MulVecI32x4); goto parse_error; + case 'n': { + switch (op[8]) { + case '\0': + if (strcmp(op, "i32x4.ne") == 0) return makeBinary(s, BinaryOp::NeVecI32x4); + goto parse_error; + case 'g': + if (strcmp(op, "i32x4.neg") == 0) return makeUnary(s, UnaryOp::NegVecI32x4); + goto parse_error; + default: goto parse_error; + } + } case 'r': - if (strcmp(op, "i32.wrap/i64") == 0) return makeUnary(s, 
UnaryOp::WrapInt64); + if (strcmp(op, "i32x4.replace_lane") == 0) return makeSIMDReplace(s, SIMDReplaceOp::ReplaceLaneVecI32x4, 4); goto parse_error; + case 's': { + switch (op[7]) { + case 'h': { + switch (op[8]) { + case 'l': + if (strcmp(op, "i32x4.shl") == 0) return makeSIMDShift(s, SIMDShiftOp::ShlVecI32x4); + goto parse_error; + case 'r': { + switch (op[10]) { + case 's': + if (strcmp(op, "i32x4.shr_s") == 0) return makeSIMDShift(s, SIMDShiftOp::ShrSVecI32x4); + goto parse_error; + case 'u': + if (strcmp(op, "i32x4.shr_u") == 0) return makeSIMDShift(s, SIMDShiftOp::ShrUVecI32x4); + goto parse_error; + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case 'p': + if (strcmp(op, "i32x4.splat") == 0) return makeUnary(s, UnaryOp::SplatVecI32x4); + goto parse_error; + case 'u': + if (strcmp(op, "i32x4.sub") == 0) return makeBinary(s, BinaryOp::SubVecI32x4); + goto parse_error; + default: goto parse_error; + } + } + case 't': { + switch (op[22]) { + case 's': + if (strcmp(op, "i32x4.trunc_sat_f32x4_s") == 0) return makeUnary(s, UnaryOp::TruncSatSVecF32x4ToVecI32x4); + goto parse_error; + case 'u': + if (strcmp(op, "i32x4.trunc_sat_f32x4_u") == 0) return makeUnary(s, UnaryOp::TruncSatUVecF32x4ToVecI32x4); + goto parse_error; + default: goto parse_error; + } + } default: goto parse_error; } } - case 'x': - if (strcmp(op, "i32.xor") == 0) return makeBinary(s, BinaryOp::XorInt32); - goto parse_error; default: goto parse_error; } } case '6': { - switch (op[4]) { - case 'a': { - switch (op[5]) { - case 'd': - if (strcmp(op, "i64.add") == 0) return makeBinary(s, BinaryOp::AddInt64); - goto parse_error; - case 'n': - if (strcmp(op, "i64.and") == 0) return makeBinary(s, BinaryOp::AndInt64); - goto parse_error; - case 't': { - switch (op[11]) { - case 'l': { - switch (op[15]) { - case '\0': - if (strcmp(op, "i64.atomic.load") == 0) return makeLoad(s, i64, /*isAtomic=*/true); - goto parse_error; - case '1': - if (strcmp(op, "i64.atomic.load16_u") == 0) 
return makeLoad(s, i64, /*isAtomic=*/true); - goto parse_error; - case '3': - if (strcmp(op, "i64.atomic.load32_u") == 0) return makeLoad(s, i64, /*isAtomic=*/true); - goto parse_error; - case '8': - if (strcmp(op, "i64.atomic.load8_u") == 0) return makeLoad(s, i64, /*isAtomic=*/true); - goto parse_error; - default: goto parse_error; - } - } - case 'r': { - switch (op[14]) { - case '.': { + switch (op[3]) { + case '.': { + switch (op[4]) { + case 'a': { + switch (op[5]) { + case 'd': + if (strcmp(op, "i64.add") == 0) return makeBinary(s, BinaryOp::AddInt64); + goto parse_error; + case 'n': + if (strcmp(op, "i64.and") == 0) return makeBinary(s, BinaryOp::AndInt64); + goto parse_error; + case 't': { + switch (op[11]) { + case 'l': { switch (op[15]) { - case 'a': { - switch (op[16]) { - case 'd': - if (strcmp(op, "i64.atomic.rmw.add") == 0) return makeAtomicRMWOrCmpxchg(s, i64); - goto parse_error; - case 'n': - if (strcmp(op, "i64.atomic.rmw.and") == 0) return makeAtomicRMWOrCmpxchg(s, i64); - goto parse_error; - default: goto parse_error; - } - } - case 'c': - if (strcmp(op, "i64.atomic.rmw.cmpxchg") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + case '\0': + if (strcmp(op, "i64.atomic.load") == 0) return makeLoad(s, i64, /*isAtomic=*/true); goto parse_error; - case 'o': - if (strcmp(op, "i64.atomic.rmw.or") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + case '1': + if (strcmp(op, "i64.atomic.load16_u") == 0) return makeLoad(s, i64, /*isAtomic=*/true); goto parse_error; - case 's': - if (strcmp(op, "i64.atomic.rmw.sub") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + case '3': + if (strcmp(op, "i64.atomic.load32_u") == 0) return makeLoad(s, i64, /*isAtomic=*/true); + goto parse_error; + case '8': + if (strcmp(op, "i64.atomic.load8_u") == 0) return makeLoad(s, i64, /*isAtomic=*/true); goto parse_error; - case 'x': { - switch (op[16]) { - case 'c': - if (strcmp(op, "i64.atomic.rmw.xchg") == 0) return makeAtomicRMWOrCmpxchg(s, i64); - goto parse_error; - case 'o': - if 
(strcmp(op, "i64.atomic.rmw.xor") == 0) return makeAtomicRMWOrCmpxchg(s, i64); - goto parse_error; - default: goto parse_error; - } - } default: goto parse_error; } } - case '1': { - switch (op[19]) { - case 'a': { - switch (op[20]) { - case 'd': - if (strcmp(op, "i64.atomic.rmw16_u.add") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + case 'r': { + switch (op[14]) { + case '.': { + switch (op[15]) { + case 'a': { + switch (op[16]) { + case 'd': + if (strcmp(op, "i64.atomic.rmw.add") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + case 'n': + if (strcmp(op, "i64.atomic.rmw.and") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + default: goto parse_error; + } + } + case 'c': + if (strcmp(op, "i64.atomic.rmw.cmpxchg") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + case 'o': + if (strcmp(op, "i64.atomic.rmw.or") == 0) return makeAtomicRMWOrCmpxchg(s, i64); goto parse_error; - case 'n': - if (strcmp(op, "i64.atomic.rmw16_u.and") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + case 's': + if (strcmp(op, "i64.atomic.rmw.sub") == 0) return makeAtomicRMWOrCmpxchg(s, i64); goto parse_error; + case 'x': { + switch (op[16]) { + case 'c': + if (strcmp(op, "i64.atomic.rmw.xchg") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + case 'o': + if (strcmp(op, "i64.atomic.rmw.xor") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + default: goto parse_error; + } + } default: goto parse_error; } } - case 'c': - if (strcmp(op, "i64.atomic.rmw16_u.cmpxchg") == 0) return makeAtomicRMWOrCmpxchg(s, i64); - goto parse_error; - case 'o': - if (strcmp(op, "i64.atomic.rmw16_u.or") == 0) return makeAtomicRMWOrCmpxchg(s, i64); - goto parse_error; - case 's': - if (strcmp(op, "i64.atomic.rmw16_u.sub") == 0) return makeAtomicRMWOrCmpxchg(s, i64); - goto parse_error; - case 'x': { - switch (op[20]) { + case '1': { + switch (op[17]) { + case 'a': { + switch (op[18]) { + case 'd': + if (strcmp(op, 
"i64.atomic.rmw16.add_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + case 'n': + if (strcmp(op, "i64.atomic.rmw16.and_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + default: goto parse_error; + } + } case 'c': - if (strcmp(op, "i64.atomic.rmw16_u.xchg") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + if (strcmp(op, "i64.atomic.rmw16.cmpxchg_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); goto parse_error; case 'o': - if (strcmp(op, "i64.atomic.rmw16_u.xor") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + if (strcmp(op, "i64.atomic.rmw16.or_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + case 's': + if (strcmp(op, "i64.atomic.rmw16.sub_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); goto parse_error; + case 'x': { + switch (op[18]) { + case 'c': + if (strcmp(op, "i64.atomic.rmw16.xchg_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + case 'o': + if (strcmp(op, "i64.atomic.rmw16.xor_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + default: goto parse_error; + } + } default: goto parse_error; } } - default: goto parse_error; - } - } - case '3': { - switch (op[19]) { - case 'a': { - switch (op[20]) { - case 'd': - if (strcmp(op, "i64.atomic.rmw32_u.add") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + case '3': { + switch (op[17]) { + case 'a': { + switch (op[18]) { + case 'd': + if (strcmp(op, "i64.atomic.rmw32.add_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + case 'n': + if (strcmp(op, "i64.atomic.rmw32.and_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + default: goto parse_error; + } + } + case 'c': + if (strcmp(op, "i64.atomic.rmw32.cmpxchg_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); goto parse_error; - case 'n': - if (strcmp(op, "i64.atomic.rmw32_u.and") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + case 'o': + if (strcmp(op, "i64.atomic.rmw32.or_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); goto 
parse_error; + case 's': + if (strcmp(op, "i64.atomic.rmw32.sub_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + case 'x': { + switch (op[18]) { + case 'c': + if (strcmp(op, "i64.atomic.rmw32.xchg_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + case 'o': + if (strcmp(op, "i64.atomic.rmw32.xor_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + default: goto parse_error; + } + } default: goto parse_error; } } - case 'c': - if (strcmp(op, "i64.atomic.rmw32_u.cmpxchg") == 0) return makeAtomicRMWOrCmpxchg(s, i64); - goto parse_error; - case 'o': - if (strcmp(op, "i64.atomic.rmw32_u.or") == 0) return makeAtomicRMWOrCmpxchg(s, i64); - goto parse_error; - case 's': - if (strcmp(op, "i64.atomic.rmw32_u.sub") == 0) return makeAtomicRMWOrCmpxchg(s, i64); - goto parse_error; - case 'x': { - switch (op[20]) { + case '8': { + switch (op[16]) { + case 'a': { + switch (op[17]) { + case 'd': + if (strcmp(op, "i64.atomic.rmw8.add_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + case 'n': + if (strcmp(op, "i64.atomic.rmw8.and_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + default: goto parse_error; + } + } case 'c': - if (strcmp(op, "i64.atomic.rmw32_u.xchg") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + if (strcmp(op, "i64.atomic.rmw8.cmpxchg_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); goto parse_error; case 'o': - if (strcmp(op, "i64.atomic.rmw32_u.xor") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + if (strcmp(op, "i64.atomic.rmw8.or_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + case 's': + if (strcmp(op, "i64.atomic.rmw8.sub_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); goto parse_error; + case 'x': { + switch (op[17]) { + case 'c': + if (strcmp(op, "i64.atomic.rmw8.xchg_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + goto parse_error; + case 'o': + if (strcmp(op, "i64.atomic.rmw8.xor_u") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + 
goto parse_error; + default: goto parse_error; + } + } default: goto parse_error; } } default: goto parse_error; } } - case '8': { - switch (op[18]) { - case 'a': { - switch (op[19]) { - case 'd': - if (strcmp(op, "i64.atomic.rmw8_u.add") == 0) return makeAtomicRMWOrCmpxchg(s, i64); - goto parse_error; - case 'n': - if (strcmp(op, "i64.atomic.rmw8_u.and") == 0) return makeAtomicRMWOrCmpxchg(s, i64); - goto parse_error; - default: goto parse_error; - } - } - case 'c': - if (strcmp(op, "i64.atomic.rmw8_u.cmpxchg") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + case 's': { + switch (op[16]) { + case '\0': + if (strcmp(op, "i64.atomic.store") == 0) return makeStore(s, i64, /*isAtomic=*/true); goto parse_error; - case 'o': - if (strcmp(op, "i64.atomic.rmw8_u.or") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + case '1': + if (strcmp(op, "i64.atomic.store16") == 0) return makeStore(s, i64, /*isAtomic=*/true); goto parse_error; - case 's': - if (strcmp(op, "i64.atomic.rmw8_u.sub") == 0) return makeAtomicRMWOrCmpxchg(s, i64); + case '3': + if (strcmp(op, "i64.atomic.store32") == 0) return makeStore(s, i64, /*isAtomic=*/true); + goto parse_error; + case '8': + if (strcmp(op, "i64.atomic.store8") == 0) return makeStore(s, i64, /*isAtomic=*/true); goto parse_error; - case 'x': { - switch (op[19]) { - case 'c': - if (strcmp(op, "i64.atomic.rmw8_u.xchg") == 0) return makeAtomicRMWOrCmpxchg(s, i64); - goto parse_error; - case 'o': - if (strcmp(op, "i64.atomic.rmw8_u.xor") == 0) return makeAtomicRMWOrCmpxchg(s, i64); - goto parse_error; - default: goto parse_error; - } - } default: goto parse_error; } } default: goto parse_error; } } - case 's': { - switch (op[16]) { + default: goto parse_error; + } + } + case 'c': { + switch (op[5]) { + case 'l': + if (strcmp(op, "i64.clz") == 0) return makeUnary(s, UnaryOp::ClzInt64); + goto parse_error; + case 'o': + if (strcmp(op, "i64.const") == 0) return makeConst(s, i64); + goto parse_error; + case 't': + if (strcmp(op, "i64.ctz") == 0) 
return makeUnary(s, UnaryOp::CtzInt64); + goto parse_error; + default: goto parse_error; + } + } + case 'd': { + switch (op[8]) { + case 's': + if (strcmp(op, "i64.div_s") == 0) return makeBinary(s, BinaryOp::DivSInt64); + goto parse_error; + case 'u': + if (strcmp(op, "i64.div_u") == 0) return makeBinary(s, BinaryOp::DivUInt64); + goto parse_error; + default: goto parse_error; + } + } + case 'e': { + switch (op[5]) { + case 'q': { + switch (op[6]) { case '\0': - if (strcmp(op, "i64.atomic.store") == 0) return makeStore(s, i64, /*isAtomic=*/true); + if (strcmp(op, "i64.eq") == 0) return makeBinary(s, BinaryOp::EqInt64); goto parse_error; + case 'z': + if (strcmp(op, "i64.eqz") == 0) return makeUnary(s, UnaryOp::EqZInt64); + goto parse_error; + default: goto parse_error; + } + } + case 'x': { + switch (op[10]) { case '1': - if (strcmp(op, "i64.atomic.store16") == 0) return makeStore(s, i64, /*isAtomic=*/true); + if (strcmp(op, "i64.extend16_s") == 0) return makeUnary(s, UnaryOp::ExtendS16Int64); goto parse_error; case '3': - if (strcmp(op, "i64.atomic.store32") == 0) return makeStore(s, i64, /*isAtomic=*/true); + if (strcmp(op, "i64.extend32_s") == 0) return makeUnary(s, UnaryOp::ExtendS32Int64); goto parse_error; case '8': - if (strcmp(op, "i64.atomic.store8") == 0) return makeStore(s, i64, /*isAtomic=*/true); + if (strcmp(op, "i64.extend8_s") == 0) return makeUnary(s, UnaryOp::ExtendS8Int64); goto parse_error; + case '_': { + switch (op[15]) { + case 's': + if (strcmp(op, "i64.extend_i32_s") == 0) return makeUnary(s, UnaryOp::ExtendSInt32); + goto parse_error; + case 'u': + if (strcmp(op, "i64.extend_i32_u") == 0) return makeUnary(s, UnaryOp::ExtendUInt32); + goto parse_error; + default: goto parse_error; + } + } default: goto parse_error; } } default: goto parse_error; } } - default: goto parse_error; - } - } - case 'c': { - switch (op[5]) { - case 'l': - if (strcmp(op, "i64.clz") == 0) return makeUnary(s, UnaryOp::ClzInt64); - goto parse_error; - case 'o': - if 
(strcmp(op, "i64.const") == 0) return makeConst(s, i64); + case 'g': { + switch (op[5]) { + case 'e': { + switch (op[7]) { + case 's': + if (strcmp(op, "i64.ge_s") == 0) return makeBinary(s, BinaryOp::GeSInt64); + goto parse_error; + case 'u': + if (strcmp(op, "i64.ge_u") == 0) return makeBinary(s, BinaryOp::GeUInt64); + goto parse_error; + default: goto parse_error; + } + } + case 't': { + switch (op[7]) { + case 's': + if (strcmp(op, "i64.gt_s") == 0) return makeBinary(s, BinaryOp::GtSInt64); + goto parse_error; + case 'u': + if (strcmp(op, "i64.gt_u") == 0) return makeBinary(s, BinaryOp::GtUInt64); + goto parse_error; + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case 'l': { + switch (op[5]) { + case 'e': { + switch (op[7]) { + case 's': + if (strcmp(op, "i64.le_s") == 0) return makeBinary(s, BinaryOp::LeSInt64); + goto parse_error; + case 'u': + if (strcmp(op, "i64.le_u") == 0) return makeBinary(s, BinaryOp::LeUInt64); + goto parse_error; + default: goto parse_error; + } + } + case 'o': { + switch (op[8]) { + case '\0': + if (strcmp(op, "i64.load") == 0) return makeLoad(s, i64, /*isAtomic=*/false); + goto parse_error; + case '1': { + switch (op[11]) { + case 's': + if (strcmp(op, "i64.load16_s") == 0) return makeLoad(s, i64, /*isAtomic=*/false); + goto parse_error; + case 'u': + if (strcmp(op, "i64.load16_u") == 0) return makeLoad(s, i64, /*isAtomic=*/false); + goto parse_error; + default: goto parse_error; + } + } + case '3': { + switch (op[11]) { + case 's': + if (strcmp(op, "i64.load32_s") == 0) return makeLoad(s, i64, /*isAtomic=*/false); + goto parse_error; + case 'u': + if (strcmp(op, "i64.load32_u") == 0) return makeLoad(s, i64, /*isAtomic=*/false); + goto parse_error; + default: goto parse_error; + } + } + case '8': { + switch (op[10]) { + case 's': + if (strcmp(op, "i64.load8_s") == 0) return makeLoad(s, i64, /*isAtomic=*/false); + goto parse_error; + case 'u': + if (strcmp(op, "i64.load8_u") == 0) return makeLoad(s, i64, 
/*isAtomic=*/false); + goto parse_error; + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case 't': { + switch (op[7]) { + case 's': + if (strcmp(op, "i64.lt_s") == 0) return makeBinary(s, BinaryOp::LtSInt64); + goto parse_error; + case 'u': + if (strcmp(op, "i64.lt_u") == 0) return makeBinary(s, BinaryOp::LtUInt64); + goto parse_error; + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case 'm': + if (strcmp(op, "i64.mul") == 0) return makeBinary(s, BinaryOp::MulInt64); goto parse_error; - case 't': - if (strcmp(op, "i64.ctz") == 0) return makeUnary(s, UnaryOp::CtzInt64); + case 'n': + if (strcmp(op, "i64.ne") == 0) return makeBinary(s, BinaryOp::NeInt64); goto parse_error; - default: goto parse_error; - } - } - case 'd': { - switch (op[8]) { - case 's': - if (strcmp(op, "i64.div_s") == 0) return makeBinary(s, BinaryOp::DivSInt64); + case 'o': + if (strcmp(op, "i64.or") == 0) return makeBinary(s, BinaryOp::OrInt64); goto parse_error; - case 'u': - if (strcmp(op, "i64.div_u") == 0) return makeBinary(s, BinaryOp::DivUInt64); + case 'p': + if (strcmp(op, "i64.popcnt") == 0) return makeUnary(s, UnaryOp::PopcntInt64); goto parse_error; - default: goto parse_error; - } - } - case 'e': { - switch (op[5]) { - case 'q': { - switch (op[6]) { - case '\0': - if (strcmp(op, "i64.eq") == 0) return makeBinary(s, BinaryOp::EqInt64); - goto parse_error; - case 'z': - if (strcmp(op, "i64.eqz") == 0) return makeUnary(s, UnaryOp::EqZInt64); + case 'r': { + switch (op[5]) { + case 'e': { + switch (op[6]) { + case 'i': + if (strcmp(op, "i64.reinterpret_f64") == 0) return makeUnary(s, UnaryOp::ReinterpretFloat64); + goto parse_error; + case 'm': { + switch (op[8]) { + case 's': + if (strcmp(op, "i64.rem_s") == 0) return makeBinary(s, BinaryOp::RemSInt64); + goto parse_error; + case 'u': + if (strcmp(op, "i64.rem_u") == 0) return makeBinary(s, BinaryOp::RemUInt64); + goto parse_error; + default: goto parse_error; + } + } + default: goto 
parse_error; + } + } + case 'o': { + switch (op[7]) { + case 'l': + if (strcmp(op, "i64.rotl") == 0) return makeBinary(s, BinaryOp::RotLInt64); + goto parse_error; + case 'r': + if (strcmp(op, "i64.rotr") == 0) return makeBinary(s, BinaryOp::RotRInt64); + goto parse_error; + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case 's': { + switch (op[5]) { + case 'h': { + switch (op[6]) { + case 'l': + if (strcmp(op, "i64.shl") == 0) return makeBinary(s, BinaryOp::ShlInt64); + goto parse_error; + case 'r': { + switch (op[8]) { + case 's': + if (strcmp(op, "i64.shr_s") == 0) return makeBinary(s, BinaryOp::ShrSInt64); + goto parse_error; + case 'u': + if (strcmp(op, "i64.shr_u") == 0) return makeBinary(s, BinaryOp::ShrUInt64); + goto parse_error; + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case 't': { + switch (op[9]) { + case '\0': + if (strcmp(op, "i64.store") == 0) return makeStore(s, i64, /*isAtomic=*/false); + goto parse_error; + case '1': + if (strcmp(op, "i64.store16") == 0) return makeStore(s, i64, /*isAtomic=*/false); + goto parse_error; + case '3': + if (strcmp(op, "i64.store32") == 0) return makeStore(s, i64, /*isAtomic=*/false); + goto parse_error; + case '8': + if (strcmp(op, "i64.store8") == 0) return makeStore(s, i64, /*isAtomic=*/false); + goto parse_error; + default: goto parse_error; + } + } + case 'u': + if (strcmp(op, "i64.sub") == 0) return makeBinary(s, BinaryOp::SubInt64); goto parse_error; default: goto parse_error; } } - case 'x': { + case 't': { switch (op[10]) { - case '1': - if (strcmp(op, "i64.extend16_s") == 0) return makeUnary(s, UnaryOp::ExtendS16Int64); - goto parse_error; - case '3': - if (strcmp(op, "i64.extend32_s") == 0) return makeUnary(s, UnaryOp::ExtendS32Int64); - goto parse_error; - case '8': - if (strcmp(op, "i64.extend8_s") == 0) return makeUnary(s, UnaryOp::ExtendS8Int64); - goto parse_error; - case '_': { + case 'f': { switch (op[11]) { - case 's': - if (strcmp(op, 
"i64.extend_s/i32") == 0) return makeUnary(s, UnaryOp::ExtendSInt32); - goto parse_error; - case 'u': - if (strcmp(op, "i64.extend_u/i32") == 0) return makeUnary(s, UnaryOp::ExtendUInt32); - goto parse_error; + case '3': { + switch (op[14]) { + case 's': + if (strcmp(op, "i64.trunc_f32_s") == 0) return makeUnary(s, UnaryOp::TruncSFloat32ToInt64); + goto parse_error; + case 'u': + if (strcmp(op, "i64.trunc_f32_u") == 0) return makeUnary(s, UnaryOp::TruncUFloat32ToInt64); + goto parse_error; + default: goto parse_error; + } + } + case '6': { + switch (op[14]) { + case 's': + if (strcmp(op, "i64.trunc_f64_s") == 0) return makeUnary(s, UnaryOp::TruncSFloat64ToInt64); + goto parse_error; + case 'u': + if (strcmp(op, "i64.trunc_f64_u") == 0) return makeUnary(s, UnaryOp::TruncUFloat64ToInt64); + goto parse_error; + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case 's': { + switch (op[15]) { + case '3': { + switch (op[18]) { + case 's': + if (strcmp(op, "i64.trunc_sat_f32_s") == 0) return makeUnary(s, UnaryOp::TruncSatSFloat32ToInt64); + goto parse_error; + case 'u': + if (strcmp(op, "i64.trunc_sat_f32_u") == 0) return makeUnary(s, UnaryOp::TruncSatUFloat32ToInt64); + goto parse_error; + default: goto parse_error; + } + } + case '6': { + switch (op[18]) { + case 's': + if (strcmp(op, "i64.trunc_sat_f64_s") == 0) return makeUnary(s, UnaryOp::TruncSatSFloat64ToInt64); + goto parse_error; + case 'u': + if (strcmp(op, "i64.trunc_sat_f64_u") == 0) return makeUnary(s, UnaryOp::TruncSatUFloat64ToInt64); + goto parse_error; + default: goto parse_error; + } + } default: goto parse_error; } } default: goto parse_error; } } + case 'w': + if (strcmp(op, "i64.wait") == 0) return makeAtomicWait(s, i64); + goto parse_error; + case 'x': + if (strcmp(op, "i64.xor") == 0) return makeBinary(s, BinaryOp::XorInt64); + goto parse_error; default: goto parse_error; } } - case 'g': { - switch (op[5]) { - case 'e': { + case 'x': { + switch (op[6]) { + case 'a': { switch 
(op[7]) { - case 's': - if (strcmp(op, "i64.ge_s") == 0) return makeBinary(s, BinaryOp::GeSInt64); + case 'd': + if (strcmp(op, "i64x2.add") == 0) return makeBinary(s, BinaryOp::AddVecI64x2); goto parse_error; - case 'u': - if (strcmp(op, "i64.ge_u") == 0) return makeBinary(s, BinaryOp::GeUInt64); + case 'l': + if (strcmp(op, "i64x2.all_true") == 0) return makeUnary(s, UnaryOp::AllTrueVecI64x2); + goto parse_error; + case 'n': + if (strcmp(op, "i64x2.any_true") == 0) return makeUnary(s, UnaryOp::AnyTrueVecI64x2); goto parse_error; default: goto parse_error; } } - case 't': { + case 'e': + if (strcmp(op, "i64x2.extract_lane") == 0) return makeSIMDExtract(s, SIMDExtractOp::ExtractLaneVecI64x2, 2); + goto parse_error; + case 'n': + if (strcmp(op, "i64x2.neg") == 0) return makeUnary(s, UnaryOp::NegVecI64x2); + goto parse_error; + case 'r': + if (strcmp(op, "i64x2.replace_lane") == 0) return makeSIMDReplace(s, SIMDReplaceOp::ReplaceLaneVecI64x2, 2); + goto parse_error; + case 's': { switch (op[7]) { - case 's': - if (strcmp(op, "i64.gt_s") == 0) return makeBinary(s, BinaryOp::GtSInt64); + case 'h': { + switch (op[8]) { + case 'l': + if (strcmp(op, "i64x2.shl") == 0) return makeSIMDShift(s, SIMDShiftOp::ShlVecI64x2); + goto parse_error; + case 'r': { + switch (op[10]) { + case 's': + if (strcmp(op, "i64x2.shr_s") == 0) return makeSIMDShift(s, SIMDShiftOp::ShrSVecI64x2); + goto parse_error; + case 'u': + if (strcmp(op, "i64x2.shr_u") == 0) return makeSIMDShift(s, SIMDShiftOp::ShrUVecI64x2); + goto parse_error; + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case 'p': + if (strcmp(op, "i64x2.splat") == 0) return makeUnary(s, UnaryOp::SplatVecI64x2); goto parse_error; case 'u': - if (strcmp(op, "i64.gt_u") == 0) return makeBinary(s, BinaryOp::GtUInt64); + if (strcmp(op, "i64x2.sub") == 0) return makeBinary(s, BinaryOp::SubVecI64x2); goto parse_error; default: goto parse_error; } } - default: goto parse_error; - } - } - case 'l': { - switch 
(op[5]) { - case 'e': { - switch (op[7]) { + case 't': { + switch (op[22]) { case 's': - if (strcmp(op, "i64.le_s") == 0) return makeBinary(s, BinaryOp::LeSInt64); + if (strcmp(op, "i64x2.trunc_sat_f64x2_s") == 0) return makeUnary(s, UnaryOp::TruncSatSVecF64x2ToVecI64x2); goto parse_error; case 'u': - if (strcmp(op, "i64.le_u") == 0) return makeBinary(s, BinaryOp::LeUInt64); + if (strcmp(op, "i64x2.trunc_sat_f64x2_u") == 0) return makeUnary(s, UnaryOp::TruncSatUVecF64x2ToVecI64x2); goto parse_error; default: goto parse_error; } } - case 'o': { - switch (op[8]) { + default: goto parse_error; + } + } + default: goto parse_error; + } + } + case '8': { + switch (op[6]) { + case 'a': { + switch (op[7]) { + case 'd': { + switch (op[9]) { case '\0': - if (strcmp(op, "i64.load") == 0) return makeLoad(s, i64, /*isAtomic=*/false); + if (strcmp(op, "i8x16.add") == 0) return makeBinary(s, BinaryOp::AddVecI8x16); goto parse_error; - case '1': { - switch (op[11]) { - case 's': - if (strcmp(op, "i64.load16_s") == 0) return makeLoad(s, i64, /*isAtomic=*/false); - goto parse_error; - case 'u': - if (strcmp(op, "i64.load16_u") == 0) return makeLoad(s, i64, /*isAtomic=*/false); - goto parse_error; - default: goto parse_error; - } - } - case '3': { - switch (op[11]) { - case 's': - if (strcmp(op, "i64.load32_s") == 0) return makeLoad(s, i64, /*isAtomic=*/false); - goto parse_error; - case 'u': - if (strcmp(op, "i64.load32_u") == 0) return makeLoad(s, i64, /*isAtomic=*/false); - goto parse_error; - default: goto parse_error; - } - } - case '8': { - switch (op[10]) { + case '_': { + switch (op[19]) { case 's': - if (strcmp(op, "i64.load8_s") == 0) return makeLoad(s, i64, /*isAtomic=*/false); + if (strcmp(op, "i8x16.add_saturate_s") == 0) return makeBinary(s, BinaryOp::AddSatSVecI8x16); goto parse_error; case 'u': - if (strcmp(op, "i64.load8_u") == 0) return makeLoad(s, i64, /*isAtomic=*/false); + if (strcmp(op, "i8x16.add_saturate_u") == 0) return makeBinary(s, 
BinaryOp::AddSatUVecI8x16); goto parse_error; default: goto parse_error; } @@ -1199,13 +2007,27 @@ switch (op[0]) { default: goto parse_error; } } - case 't': { - switch (op[7]) { + case 'l': + if (strcmp(op, "i8x16.all_true") == 0) return makeUnary(s, UnaryOp::AllTrueVecI8x16); + goto parse_error; + case 'n': + if (strcmp(op, "i8x16.any_true") == 0) return makeUnary(s, UnaryOp::AnyTrueVecI8x16); + goto parse_error; + default: goto parse_error; + } + } + case 'e': { + switch (op[7]) { + case 'q': + if (strcmp(op, "i8x16.eq") == 0) return makeBinary(s, BinaryOp::EqVecI8x16); + goto parse_error; + case 'x': { + switch (op[19]) { case 's': - if (strcmp(op, "i64.lt_s") == 0) return makeBinary(s, BinaryOp::LtSInt64); + if (strcmp(op, "i8x16.extract_lane_s") == 0) return makeSIMDExtract(s, SIMDExtractOp::ExtractLaneSVecI8x16, 16); goto parse_error; case 'u': - if (strcmp(op, "i64.lt_u") == 0) return makeBinary(s, BinaryOp::LtUInt64); + if (strcmp(op, "i8x16.extract_lane_u") == 0) return makeSIMDExtract(s, SIMDExtractOp::ExtractLaneUVecI8x16, 16); goto parse_error; default: goto parse_error; } @@ -1213,46 +2035,26 @@ switch (op[0]) { default: goto parse_error; } } - case 'm': - if (strcmp(op, "i64.mul") == 0) return makeBinary(s, BinaryOp::MulInt64); - goto parse_error; - case 'n': - if (strcmp(op, "i64.ne") == 0) return makeBinary(s, BinaryOp::NeInt64); - goto parse_error; - case 'o': - if (strcmp(op, "i64.or") == 0) return makeBinary(s, BinaryOp::OrInt64); - goto parse_error; - case 'p': - if (strcmp(op, "i64.popcnt") == 0) return makeUnary(s, UnaryOp::PopcntInt64); - goto parse_error; - case 'r': { - switch (op[5]) { + case 'g': { + switch (op[7]) { case 'e': { - switch (op[6]) { - case 'i': - if (strcmp(op, "i64.reinterpret/f64") == 0) return makeUnary(s, UnaryOp::ReinterpretFloat64); + switch (op[9]) { + case 's': + if (strcmp(op, "i8x16.ge_s") == 0) return makeBinary(s, BinaryOp::GeSVecI8x16); + goto parse_error; + case 'u': + if (strcmp(op, "i8x16.ge_u") == 0) 
return makeBinary(s, BinaryOp::GeUVecI8x16); goto parse_error; - case 'm': { - switch (op[8]) { - case 's': - if (strcmp(op, "i64.rem_s") == 0) return makeBinary(s, BinaryOp::RemSInt64); - goto parse_error; - case 'u': - if (strcmp(op, "i64.rem_u") == 0) return makeBinary(s, BinaryOp::RemUInt64); - goto parse_error; - default: goto parse_error; - } - } default: goto parse_error; } } - case 'o': { - switch (op[7]) { - case 'l': - if (strcmp(op, "i64.rotl") == 0) return makeBinary(s, BinaryOp::RotLInt64); + case 't': { + switch (op[9]) { + case 's': + if (strcmp(op, "i8x16.gt_s") == 0) return makeBinary(s, BinaryOp::GtSVecI8x16); goto parse_error; - case 'r': - if (strcmp(op, "i64.rotr") == 0) return makeBinary(s, BinaryOp::RotRInt64); + case 'u': + if (strcmp(op, "i8x16.gt_u") == 0) return makeBinary(s, BinaryOp::GtUVecI8x16); goto parse_error; default: goto parse_error; } @@ -1260,72 +2062,64 @@ switch (op[0]) { default: goto parse_error; } } - case 's': { - switch (op[5]) { - case 'h': { - switch (op[6]) { - case 'l': - if (strcmp(op, "i64.shl") == 0) return makeBinary(s, BinaryOp::ShlInt64); + case 'l': { + switch (op[7]) { + case 'e': { + switch (op[9]) { + case 's': + if (strcmp(op, "i8x16.le_s") == 0) return makeBinary(s, BinaryOp::LeSVecI8x16); + goto parse_error; + case 'u': + if (strcmp(op, "i8x16.le_u") == 0) return makeBinary(s, BinaryOp::LeUVecI8x16); goto parse_error; - case 'r': { - switch (op[8]) { - case 's': - if (strcmp(op, "i64.shr_s") == 0) return makeBinary(s, BinaryOp::ShrSInt64); - goto parse_error; - case 'u': - if (strcmp(op, "i64.shr_u") == 0) return makeBinary(s, BinaryOp::ShrUInt64); - goto parse_error; - default: goto parse_error; - } - } default: goto parse_error; } } case 't': { switch (op[9]) { - case '\0': - if (strcmp(op, "i64.store") == 0) return makeStore(s, i64, /*isAtomic=*/false); - goto parse_error; - case '1': - if (strcmp(op, "i64.store16") == 0) return makeStore(s, i64, /*isAtomic=*/false); - goto parse_error; - case '3': - 
if (strcmp(op, "i64.store32") == 0) return makeStore(s, i64, /*isAtomic=*/false); + case 's': + if (strcmp(op, "i8x16.lt_s") == 0) return makeBinary(s, BinaryOp::LtSVecI8x16); goto parse_error; - case '8': - if (strcmp(op, "i64.store8") == 0) return makeStore(s, i64, /*isAtomic=*/false); + case 'u': + if (strcmp(op, "i8x16.lt_u") == 0) return makeBinary(s, BinaryOp::LtUVecI8x16); goto parse_error; default: goto parse_error; } } - case 'u': - if (strcmp(op, "i64.sub") == 0) return makeBinary(s, BinaryOp::SubInt64); + default: goto parse_error; + } + } + case 'm': + if (strcmp(op, "i8x16.mul") == 0) return makeBinary(s, BinaryOp::MulVecI8x16); + goto parse_error; + case 'n': { + switch (op[8]) { + case '\0': + if (strcmp(op, "i8x16.ne") == 0) return makeBinary(s, BinaryOp::NeVecI8x16); + goto parse_error; + case 'g': + if (strcmp(op, "i8x16.neg") == 0) return makeUnary(s, UnaryOp::NegVecI8x16); goto parse_error; default: goto parse_error; } } - case 't': { - switch (op[10]) { - case 's': { - switch (op[11]) { - case '/': { - switch (op[13]) { - case '3': - if (strcmp(op, "i64.trunc_s/f32") == 0) return makeUnary(s, UnaryOp::TruncSFloat32ToInt64); - goto parse_error; - case '6': - if (strcmp(op, "i64.trunc_s/f64") == 0) return makeUnary(s, UnaryOp::TruncSFloat64ToInt64); - goto parse_error; - default: goto parse_error; - } - } - case ':': { - switch (op[17]) { - case '3': - if (strcmp(op, "i64.trunc_s:sat/f32") == 0) return makeUnary(s, UnaryOp::TruncSatSFloat32ToInt64); + case 'r': + if (strcmp(op, "i8x16.replace_lane") == 0) return makeSIMDReplace(s, SIMDReplaceOp::ReplaceLaneVecI8x16, 16); + goto parse_error; + case 's': { + switch (op[7]) { + case 'h': { + switch (op[8]) { + case 'l': + if (strcmp(op, "i8x16.shl") == 0) return makeSIMDShift(s, SIMDShiftOp::ShlVecI8x16); + goto parse_error; + case 'r': { + switch (op[10]) { + case 's': + if (strcmp(op, "i8x16.shr_s") == 0) return makeSIMDShift(s, SIMDShiftOp::ShrSVecI8x16); goto parse_error; - case '6': - if 
(strcmp(op, "i64.trunc_s:sat/f64") == 0) return makeUnary(s, UnaryOp::TruncSatSFloat64ToInt64); + case 'u': + if (strcmp(op, "i8x16.shr_u") == 0) return makeSIMDShift(s, SIMDShiftOp::ShrUVecI8x16); goto parse_error; default: goto parse_error; } @@ -1333,26 +2127,21 @@ switch (op[0]) { default: goto parse_error; } } + case 'p': + if (strcmp(op, "i8x16.splat") == 0) return makeUnary(s, UnaryOp::SplatVecI8x16); + goto parse_error; case 'u': { - switch (op[11]) { - case '/': { - switch (op[13]) { - case '3': - if (strcmp(op, "i64.trunc_u/f32") == 0) return makeUnary(s, UnaryOp::TruncUFloat32ToInt64); - goto parse_error; - case '6': - if (strcmp(op, "i64.trunc_u/f64") == 0) return makeUnary(s, UnaryOp::TruncUFloat64ToInt64); - goto parse_error; - default: goto parse_error; - } - } - case ':': { - switch (op[17]) { - case '3': - if (strcmp(op, "i64.trunc_u:sat/f32") == 0) return makeUnary(s, UnaryOp::TruncSatUFloat32ToInt64); + switch (op[9]) { + case '\0': + if (strcmp(op, "i8x16.sub") == 0) return makeBinary(s, BinaryOp::SubVecI8x16); + goto parse_error; + case '_': { + switch (op[19]) { + case 's': + if (strcmp(op, "i8x16.sub_saturate_s") == 0) return makeBinary(s, BinaryOp::SubSatSVecI8x16); goto parse_error; - case '6': - if (strcmp(op, "i64.trunc_u:sat/f64") == 0) return makeUnary(s, UnaryOp::TruncSatUFloat64ToInt64); + case 'u': + if (strcmp(op, "i8x16.sub_saturate_u") == 0) return makeBinary(s, BinaryOp::SubSatUVecI8x16); goto parse_error; default: goto parse_error; } @@ -1363,12 +2152,6 @@ switch (op[0]) { default: goto parse_error; } } - case 'w': - if (strcmp(op, "i64.wait") == 0) return makeAtomicWait(s, i64); - goto parse_error; - case 'x': - if (strcmp(op, "i64.xor") == 0) return makeBinary(s, BinaryOp::XorInt64); - goto parse_error; default: goto parse_error; } } @@ -1378,48 +2161,80 @@ switch (op[0]) { default: goto parse_error; } } - case 'l': - if (strcmp(op, "loop") == 0) return makeLoop(s); - goto parse_error; + case 'l': { + switch (op[2]) { + case 
'c': { + switch (op[6]) { + case 'g': + if (strcmp(op, "local.get") == 0) return makeGetLocal(s); + goto parse_error; + case 's': + if (strcmp(op, "local.set") == 0) return makeSetLocal(s); + goto parse_error; + case 't': + if (strcmp(op, "local.tee") == 0) return makeTeeLocal(s); + goto parse_error; + default: goto parse_error; + } + } + case 'o': + if (strcmp(op, "loop") == 0) return makeLoop(s); + goto parse_error; + default: goto parse_error; + } + } case 'n': if (strcmp(op, "nop") == 0) return makeNop(); goto parse_error; case 'r': if (strcmp(op, "return") == 0) return makeReturn(s); goto parse_error; - case 's': { - switch (op[2]) { - case 'l': - if (strcmp(op, "select") == 0) return makeSelect(s); - goto parse_error; - case 't': { - switch (op[4]) { - case 'g': - if (strcmp(op, "set_global") == 0) return makeSetGlobal(s); + case 's': + if (strcmp(op, "select") == 0) return makeSelect(s); + goto parse_error; + case 't': + if (strcmp(op, "then") == 0) return makeThenOrElse(s); + goto parse_error; + case 'u': + if (strcmp(op, "unreachable") == 0) return makeUnreachable(); + goto parse_error; + case 'v': { + switch (op[1]) { + case '1': { + switch (op[5]) { + case 'a': + if (strcmp(op, "v128.and") == 0) return makeBinary(s, BinaryOp::AndVec128); + goto parse_error; + case 'b': + if (strcmp(op, "v128.bitselect") == 0) return makeSIMDBitselect(s); + goto parse_error; + case 'c': + if (strcmp(op, "v128.const") == 0) return makeConst(s, v128); goto parse_error; case 'l': - if (strcmp(op, "set_local") == 0) return makeSetLocal(s); + if (strcmp(op, "v128.load") == 0) return makeLoad(s, v128, /*isAtomic=*/false); + goto parse_error; + case 'n': + if (strcmp(op, "v128.not") == 0) return makeUnary(s, UnaryOp::NotVec128); + goto parse_error; + case 'o': + if (strcmp(op, "v128.or") == 0) return makeBinary(s, BinaryOp::OrVec128); + goto parse_error; + case 's': + if (strcmp(op, "v128.store") == 0) return makeStore(s, v128, /*isAtomic=*/false); + goto parse_error; + case 
'x': + if (strcmp(op, "v128.xor") == 0) return makeBinary(s, BinaryOp::XorVec128); goto parse_error; default: goto parse_error; } } - default: goto parse_error; - } - } - case 't': { - switch (op[1]) { - case 'e': - if (strcmp(op, "tee_local") == 0) return makeTeeLocal(s); - goto parse_error; - case 'h': - if (strcmp(op, "then") == 0) return makeThenOrElse(s); + case '8': + if (strcmp(op, "v8x16.shuffle") == 0) return makeSIMDShuffle(s); goto parse_error; default: goto parse_error; } } - case 'u': - if (strcmp(op, "unreachable") == 0) return makeUnreachable(); - goto parse_error; case 'w': if (strcmp(op, "wake") == 0) return makeAtomicWake(s); goto parse_error; diff --git a/src/ir/ExpressionAnalyzer.cpp b/src/ir/ExpressionAnalyzer.cpp index 7788f7cde..0efc7b888 100644 --- a/src/ir/ExpressionAnalyzer.cpp +++ b/src/ir/ExpressionAnalyzer.cpp @@ -19,6 +19,7 @@ #include "ir/load-utils.h" namespace wasm { + // Given a stack of expressions, checks if the topmost is used as a result. // For example, if the parent is a block and the node is before the last position, // it is not used. 
@@ -248,6 +249,37 @@ bool ExpressionAnalyzer::flexibleEqual(Expression* left, Expression* right, Expr PUSH(AtomicWake, wakeCount); break; } + case Expression::Id::SIMDExtractId: { + CHECK(SIMDExtract, op); + CHECK(SIMDExtract, index); + PUSH(SIMDExtract, vec); + break; + } + case Expression::Id::SIMDReplaceId: { + CHECK(SIMDReplace, op); + CHECK(SIMDReplace, index); + PUSH(SIMDReplace, vec); + PUSH(SIMDReplace, value); + break; + } + case Expression::Id::SIMDShuffleId: { + CHECK(SIMDShuffle, mask); + PUSH(SIMDShuffle, left); + PUSH(SIMDShuffle, right); + break; + } + case Expression::Id::SIMDBitselectId: { + PUSH(SIMDBitselect, left); + PUSH(SIMDBitselect, right); + PUSH(SIMDBitselect, cond); + break; + } + case Expression::Id::SIMDShiftId: { + CHECK(SIMDShift, op); + PUSH(SIMDShift, vec); + PUSH(SIMDShift, shift); + break; + } case Expression::Id::ConstId: { if (left->cast<Const>()->value != right->cast<Const>()->value) { return false; @@ -356,7 +388,7 @@ HashType ExpressionAnalyzer::hash(Expression* curr) { hash(curr->_id); // we often don't need to hash the type, as it is tied to other values // we are hashing anyhow, but there are exceptions: for example, a - // get_local's type is determined by the function, so if we are + // local.get's type is determined by the function, so if we are // hashing only expression fragments, then two from different // functions may turn out the same even if the type differs. 
Likewise, // if we hash between modules, then we need to take int account @@ -496,15 +528,43 @@ HashType ExpressionAnalyzer::hash(Expression* curr) { PUSH(AtomicWake, wakeCount); break; } + case Expression::Id::SIMDExtractId: { + HASH(SIMDExtract, op); + HASH(SIMDExtract, index); + PUSH(SIMDExtract, vec); + break; + } + case Expression::Id::SIMDReplaceId: { + HASH(SIMDReplace, op); + HASH(SIMDReplace, index); + PUSH(SIMDReplace, vec); + PUSH(SIMDReplace, value); + break; + } + case Expression::Id::SIMDShuffleId: { + for (size_t i = 0; i < 16; ++i) { + HASH(SIMDShuffle, mask[i]); + } + PUSH(SIMDShuffle, left); + PUSH(SIMDShuffle, right); + break; + } + case Expression::Id::SIMDBitselectId: { + PUSH(SIMDBitselect, left); + PUSH(SIMDBitselect, right); + PUSH(SIMDBitselect, cond); + break; + } + case Expression::Id::SIMDShiftId: { + HASH(SIMDShift, op); + PUSH(SIMDShift, vec); + PUSH(SIMDShift, shift); + break; + } case Expression::Id::ConstId: { auto* c = curr->cast<Const>(); hash(c->type); - auto bits = c->value.getBits(); - if (getTypeSize(c->type) == 4) { - hash(HashType(bits)); - } else { - hash64(bits); - } + hash(std::hash<Literal>()(c->value)); break; } case Expression::Id::UnaryId: { @@ -557,4 +617,5 @@ HashType ExpressionAnalyzer::hash(Expression* curr) { } return digest; } + } // namespace wasm diff --git a/src/ir/ExpressionManipulator.cpp b/src/ir/ExpressionManipulator.cpp index d65509c52..700f7fdb8 100644 --- a/src/ir/ExpressionManipulator.cpp +++ b/src/ir/ExpressionManipulator.cpp @@ -114,6 +114,21 @@ Expression* flexibleCopy(Expression* original, Module& wasm, CustomCopier custom Expression* visitAtomicWake(AtomicWake* curr) { return builder.makeAtomicWake(copy(curr->ptr), copy(curr->wakeCount), curr->offset); } + Expression* visitSIMDExtract(SIMDExtract* curr) { + return builder.makeSIMDExtract(curr->op, copy(curr->vec), curr->index); + } + Expression* visitSIMDReplace(SIMDReplace* curr) { + return builder.makeSIMDReplace(curr->op, copy(curr->vec), 
curr->index, copy(curr->value)); + } + Expression* visitSIMDShuffle(SIMDShuffle* curr) { + return builder.makeSIMDShuffle(copy(curr->left), copy(curr->right), curr->mask); + } + Expression* visitSIMDBitselect(SIMDBitselect* curr) { + return builder.makeSIMDBitselect(copy(curr->left), copy(curr->right), copy(curr->cond)); + } + Expression* visitSIMDShift(SIMDShift* curr) { + return builder.makeSIMDShift(curr->op, copy(curr->vec), copy(curr->shift)); + } Expression* visitConst(Const *curr) { return builder.makeConst(curr->value); } diff --git a/src/ir/LocalGraph.cpp b/src/ir/LocalGraph.cpp index e0105693a..6a99ed44e 100644 --- a/src/ir/LocalGraph.cpp +++ b/src/ir/LocalGraph.cpp @@ -28,8 +28,8 @@ namespace LocalGraphInternal { // Information about a basic block. struct Info { - std::vector<Expression*> actions; // actions occurring in this block: get_locals and set_locals - std::unordered_map<Index, SetLocal*> lastSets; // for each index, the last set_local for it + std::vector<Expression*> actions; // actions occurring in this block: local.gets and local.sets + std::unordered_map<Index, SetLocal*> lastSets; // for each index, the last local.set for it }; // flow helper class. 
flows the gets to their sets @@ -78,7 +78,7 @@ struct Flower : public CFGWalker<Flower, Visitor<Flower>, Info> { size_t lastTraversedIteration; std::vector<Expression*> actions; std::vector<FlowBlock*> in; - // Sor each index, the last set_local for it + // Sor each index, the last local.set for it // The unordered_map from BasicBlock.Info is converted into a vector // This speeds up search as there are usually few sets in a block, so just scanning // them linearly is efficient, avoiding hash computations (while in Info, diff --git a/src/ir/ReFinalize.cpp b/src/ir/ReFinalize.cpp index 31140837f..68526678a 100644 --- a/src/ir/ReFinalize.cpp +++ b/src/ir/ReFinalize.cpp @@ -137,6 +137,11 @@ void ReFinalize::visitAtomicRMW(AtomicRMW* curr) { curr->finalize(); } void ReFinalize::visitAtomicCmpxchg(AtomicCmpxchg* curr) { curr->finalize(); } void ReFinalize::visitAtomicWait(AtomicWait* curr) { curr->finalize(); } void ReFinalize::visitAtomicWake(AtomicWake* curr) { curr->finalize(); } +void ReFinalize::visitSIMDExtract(SIMDExtract* curr) { curr->finalize(); } +void ReFinalize::visitSIMDReplace(SIMDReplace* curr) { curr->finalize(); } +void ReFinalize::visitSIMDShuffle(SIMDShuffle* curr) { curr->finalize(); } +void ReFinalize::visitSIMDBitselect(SIMDBitselect* curr) { curr->finalize(); } +void ReFinalize::visitSIMDShift(SIMDShift* curr) { curr->finalize(); } void ReFinalize::visitConst(Const* curr) { curr->finalize(); } void ReFinalize::visitUnary(Unary* curr) { curr->finalize(); } void ReFinalize::visitBinary(Binary* curr) { curr->finalize(); } @@ -195,4 +200,3 @@ void ReFinalize::replaceUntaken(Expression* value, Expression* condition) { } } // namespace wasm - diff --git a/src/ir/cost.h b/src/ir/cost.h index e28f535e7..5179f80b1 100644 --- a/src/ir/cost.h +++ b/src/ir/cost.h @@ -17,13 +17,15 @@ #ifndef wasm_ir_cost_h #define wasm_ir_cost_h +#include <wasm.h> +#include <wasm-traversal.h> + namespace wasm { // Measure the execution cost of an AST. 
Very handwave-ey struct CostAnalyzer : public Visitor<CostAnalyzer, Index> { - CostAnalyzer(Expression *ast) { - assert(ast); + CostAnalyzer(Expression* ast) { cost = visit(ast); } @@ -33,63 +35,63 @@ struct CostAnalyzer : public Visitor<CostAnalyzer, Index> { return curr ? visit(curr) : 0; } - Index visitBlock(Block *curr) { + Index visitBlock(Block* curr) { Index ret = 0; for (auto* child : curr->list) ret += visit(child); return ret; } - Index visitIf(If *curr) { + Index visitIf(If* curr) { return 1 + visit(curr->condition) + std::max(visit(curr->ifTrue), maybeVisit(curr->ifFalse)); } - Index visitLoop(Loop *curr) { + Index visitLoop(Loop* curr) { return 5 * visit(curr->body); } - Index visitBreak(Break *curr) { + Index visitBreak(Break* curr) { return 1 + maybeVisit(curr->value) + maybeVisit(curr->condition); } - Index visitSwitch(Switch *curr) { + Index visitSwitch(Switch* curr) { return 2 + visit(curr->condition) + maybeVisit(curr->value); } - Index visitCall(Call *curr) { + Index visitCall(Call* curr) { // XXX this does not take into account if the call is to an import, which // may be costlier in general Index ret = 4; for (auto* child : curr->operands) ret += visit(child); return ret; } - Index visitCallIndirect(CallIndirect *curr) { + Index visitCallIndirect(CallIndirect* curr) { Index ret = 6 + visit(curr->target); for (auto* child : curr->operands) ret += visit(child); return ret; } - Index visitGetLocal(GetLocal *curr) { + Index visitGetLocal(GetLocal* curr) { return 0; } - Index visitSetLocal(SetLocal *curr) { + Index visitSetLocal(SetLocal* curr) { return 1; } - Index visitGetGlobal(GetGlobal *curr) { + Index visitGetGlobal(GetGlobal* curr) { return 1; } - Index visitSetGlobal(SetGlobal *curr) { + Index visitSetGlobal(SetGlobal* curr) { return 2; } - Index visitLoad(Load *curr) { + Index visitLoad(Load* curr) { return 1 + visit(curr->ptr) + 10 * curr->isAtomic; } - Index visitStore(Store *curr) { + Index visitStore(Store* curr) { return 2 + 
visit(curr->ptr) + visit(curr->value) + 10 * curr->isAtomic; } - Index visitAtomicRMW(AtomicRMW *curr) { + Index visitAtomicRMW(AtomicRMW* curr) { return 100; } Index visitAtomicCmpxchg(AtomicCmpxchg* curr) { return 100; } - Index visitConst(Const *curr) { + Index visitConst(Const* curr) { return 1; } - Index visitUnary(Unary *curr) { + Index visitUnary(Unary* curr) { Index ret = 0; switch (curr->op) { case ClzInt32: @@ -152,11 +154,44 @@ struct CostAnalyzer : public Visitor<CostAnalyzer, Index> { case TruncSatUFloat64ToInt64: ret = 1; break; case SqrtFloat32: case SqrtFloat64: ret = 2; break; + case SplatVecI8x16: + case SplatVecI16x8: + case SplatVecI32x4: + case SplatVecI64x2: + case SplatVecF32x4: + case SplatVecF64x2: + case NotVec128: + case NegVecI8x16: + case AnyTrueVecI8x16: + case AllTrueVecI8x16: + case NegVecI16x8: + case AnyTrueVecI16x8: + case AllTrueVecI16x8: + case NegVecI32x4: + case AnyTrueVecI32x4: + case AllTrueVecI32x4: + case NegVecI64x2: + case AnyTrueVecI64x2: + case AllTrueVecI64x2: + case AbsVecF32x4: + case NegVecF32x4: + case SqrtVecF32x4: + case AbsVecF64x2: + case NegVecF64x2: + case SqrtVecF64x2: + case TruncSatSVecF32x4ToVecI32x4: + case TruncSatUVecF32x4ToVecI32x4: + case TruncSatSVecF64x2ToVecI64x2: + case TruncSatUVecF64x2ToVecI64x2: + case ConvertSVecI32x4ToVecF32x4: + case ConvertUVecI32x4ToVecF32x4: + case ConvertSVecI64x2ToVecF64x2: + case ConvertUVecI64x2ToVecF64x2: return 1; case InvalidUnary: WASM_UNREACHABLE(); } return ret + visit(curr->value); } - Index visitBinary(Binary *curr) { + Index visitBinary(Binary* curr) { Index ret = 0; switch (curr->op) { case AddInt32: ret = 1; break; @@ -235,26 +270,102 @@ struct CostAnalyzer : public Visitor<CostAnalyzer, Index> { case NeFloat32: ret = 1; break; case EqFloat64: ret = 1; break; case NeFloat64: ret = 1; break; + case EqVecI8x16: ret = 1; break; + case NeVecI8x16: ret = 1; break; + case LtSVecI8x16: ret = 1; break; + case LtUVecI8x16: ret = 1; break; + case LeSVecI8x16: ret = 
1; break; + case LeUVecI8x16: ret = 1; break; + case GtSVecI8x16: ret = 1; break; + case GtUVecI8x16: ret = 1; break; + case GeSVecI8x16: ret = 1; break; + case GeUVecI8x16: ret = 1; break; + case EqVecI16x8: ret = 1; break; + case NeVecI16x8: ret = 1; break; + case LtSVecI16x8: ret = 1; break; + case LtUVecI16x8: ret = 1; break; + case LeSVecI16x8: ret = 1; break; + case LeUVecI16x8: ret = 1; break; + case GtSVecI16x8: ret = 1; break; + case GtUVecI16x8: ret = 1; break; + case GeSVecI16x8: ret = 1; break; + case GeUVecI16x8: ret = 1; break; + case EqVecI32x4: ret = 1; break; + case NeVecI32x4: ret = 1; break; + case LtSVecI32x4: ret = 1; break; + case LtUVecI32x4: ret = 1; break; + case LeSVecI32x4: ret = 1; break; + case LeUVecI32x4: ret = 1; break; + case GtSVecI32x4: ret = 1; break; + case GtUVecI32x4: ret = 1; break; + case GeSVecI32x4: ret = 1; break; + case GeUVecI32x4: ret = 1; break; + case EqVecF32x4: ret = 1; break; + case NeVecF32x4: ret = 1; break; + case LtVecF32x4: ret = 1; break; + case LeVecF32x4: ret = 1; break; + case GtVecF32x4: ret = 1; break; + case GeVecF32x4: ret = 1; break; + case EqVecF64x2: ret = 1; break; + case NeVecF64x2: ret = 1; break; + case LtVecF64x2: ret = 1; break; + case LeVecF64x2: ret = 1; break; + case GtVecF64x2: ret = 1; break; + case GeVecF64x2: ret = 1; break; + case AndVec128: ret = 1; break; + case OrVec128: ret = 1; break; + case XorVec128: ret = 1; break; + case AddVecI8x16: ret = 1; break; + case AddSatSVecI8x16: ret = 1; break; + case AddSatUVecI8x16: ret = 1; break; + case SubVecI8x16: ret = 1; break; + case SubSatSVecI8x16: ret = 1; break; + case SubSatUVecI8x16: ret = 1; break; + case MulVecI8x16: ret = 2; break; + case AddVecI16x8: ret = 1; break; + case AddSatSVecI16x8: ret = 1; break; + case AddSatUVecI16x8: ret = 1; break; + case SubVecI16x8: ret = 1; break; + case SubSatSVecI16x8: ret = 1; break; + case SubSatUVecI16x8: ret = 1; break; + case MulVecI16x8: ret = 2; break; + case AddVecI32x4: ret = 1; break; 
+ case SubVecI32x4: ret = 1; break; + case MulVecI32x4: ret = 2; break; + case AddVecI64x2: ret = 1; break; + case SubVecI64x2: ret = 1; break; + case AddVecF32x4: ret = 1; break; + case SubVecF32x4: ret = 1; break; + case MulVecF32x4: ret = 2; break; + case DivVecF32x4: ret = 3; break; + case MinVecF32x4: ret = 1; break; + case MaxVecF32x4: ret = 1; break; + case AddVecF64x2: ret = 1; break; + case SubVecF64x2: ret = 1; break; + case MulVecF64x2: ret = 2; break; + case DivVecF64x2: ret = 3; break; + case MinVecF64x2: ret = 1; break; + case MaxVecF64x2: ret = 1; break; case InvalidBinary: WASM_UNREACHABLE(); } return ret + visit(curr->left) + visit(curr->right); } - Index visitSelect(Select *curr) { + Index visitSelect(Select* curr) { return 2 + visit(curr->condition) + visit(curr->ifTrue) + visit(curr->ifFalse); } - Index visitDrop(Drop *curr) { + Index visitDrop(Drop* curr) { return visit(curr->value); } - Index visitReturn(Return *curr) { + Index visitReturn(Return* curr) { return maybeVisit(curr->value); } - Index visitHost(Host *curr) { + Index visitHost(Host* curr) { return 100; } - Index visitNop(Nop *curr) { + Index visitNop(Nop* curr) { return 0; } - Index visitUnreachable(Unreachable *curr) { + Index visitUnreachable(Unreachable* curr) { return 0; } }; diff --git a/src/ir/features.h b/src/ir/features.h new file mode 100644 index 000000000..ed7fb6ff5 --- /dev/null +++ b/src/ir/features.h @@ -0,0 +1,175 @@ +/* + * Copyright 2018 WebAssembly Community Group participants + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef wasm_ir_features_h +#define wasm_ir_features_h + +#include <wasm.h> +#include <wasm-binary.h> +#include <wasm-traversal.h> +#include <ir/iteration.h> + +namespace wasm { + +namespace Features { + +inline FeatureSet get(UnaryOp op) { + FeatureSet ret; + switch (op) { + case TruncSatSFloat32ToInt32: + case TruncSatUFloat32ToInt32: + case TruncSatSFloat64ToInt32: + case TruncSatUFloat64ToInt32: + case TruncSatSFloat32ToInt64: + case TruncSatUFloat32ToInt64: + case TruncSatSFloat64ToInt64: + case TruncSatUFloat64ToInt64: { + ret.setTruncSat(); + break; + } + case SplatVecI8x16: + case SplatVecI16x8: + case SplatVecI32x4: + case SplatVecI64x2: + case SplatVecF32x4: + case SplatVecF64x2: + case NotVec128: + case NegVecI8x16: + case AnyTrueVecI8x16: + case AllTrueVecI8x16: + case NegVecI16x8: + case AnyTrueVecI16x8: + case AllTrueVecI16x8: + case NegVecI32x4: + case AnyTrueVecI32x4: + case AllTrueVecI32x4: + case NegVecI64x2: + case AnyTrueVecI64x2: + case AllTrueVecI64x2: + case AbsVecF32x4: + case NegVecF32x4: + case SqrtVecF32x4: + case AbsVecF64x2: + case NegVecF64x2: + case SqrtVecF64x2: + case TruncSatSVecF32x4ToVecI32x4: + case TruncSatUVecF32x4ToVecI32x4: + case TruncSatSVecF64x2ToVecI64x2: + case TruncSatUVecF64x2ToVecI64x2: + case ConvertSVecI32x4ToVecF32x4: + case ConvertUVecI32x4ToVecF32x4: + case ConvertSVecI64x2ToVecF64x2: + case ConvertUVecI64x2ToVecF64x2: { + ret.setSIMD(); + break; + } + default: {} + } + return ret; +} + +inline FeatureSet get(BinaryOp op) { + FeatureSet ret; + switch (op) { + case EqVecI8x16: + case NeVecI8x16: + case LtSVecI8x16: + case LtUVecI8x16: + case GtSVecI8x16: + case GtUVecI8x16: + case LeSVecI8x16: + case LeUVecI8x16: + case GeSVecI8x16: + case GeUVecI8x16: + case EqVecI16x8: + case NeVecI16x8: + case LtSVecI16x8: + case LtUVecI16x8: + case GtSVecI16x8: + case GtUVecI16x8: + case LeSVecI16x8: + case 
LeUVecI16x8: + case GeSVecI16x8: + case GeUVecI16x8: + case EqVecI32x4: + case NeVecI32x4: + case LtSVecI32x4: + case LtUVecI32x4: + case GtSVecI32x4: + case GtUVecI32x4: + case LeSVecI32x4: + case LeUVecI32x4: + case GeSVecI32x4: + case GeUVecI32x4: + case EqVecF32x4: + case NeVecF32x4: + case LtVecF32x4: + case GtVecF32x4: + case LeVecF32x4: + case GeVecF32x4: + case EqVecF64x2: + case NeVecF64x2: + case LtVecF64x2: + case GtVecF64x2: + case LeVecF64x2: + case GeVecF64x2: + case AndVec128: + case OrVec128: + case XorVec128: + case AddVecI8x16: + case AddSatSVecI8x16: + case AddSatUVecI8x16: + case SubVecI8x16: + case SubSatSVecI8x16: + case SubSatUVecI8x16: + case MulVecI8x16: + case AddVecI16x8: + case AddSatSVecI16x8: + case AddSatUVecI16x8: + case SubVecI16x8: + case SubSatSVecI16x8: + case SubSatUVecI16x8: + case MulVecI16x8: + case AddVecI32x4: + case SubVecI32x4: + case MulVecI32x4: + case AddVecI64x2: + case SubVecI64x2: + case AddVecF32x4: + case SubVecF32x4: + case MulVecF32x4: + case DivVecF32x4: + case MinVecF32x4: + case MaxVecF32x4: + case AddVecF64x2: + case SubVecF64x2: + case MulVecF64x2: + case DivVecF64x2: + case MinVecF64x2: + case MaxVecF64x2: { + ret.setSIMD(); + break; + } + default: {} + } + return ret; +} + +} // namespace Features + +} // namespace wasm + +#endif // wasm_ir_features_h diff --git a/src/ir/literal-utils.h b/src/ir/literal-utils.h index e00f05c52..543c34e9f 100644 --- a/src/ir/literal-utils.h +++ b/src/ir/literal-utils.h @@ -23,26 +23,9 @@ namespace wasm { namespace LiteralUtils { -inline Literal makeLiteralFromInt32(int32_t x, Type type) { - switch (type) { - case i32: return Literal(int32_t(x)); break; - case i64: return Literal(int64_t(x)); break; - case f32: return Literal(float(x)); break; - case f64: return Literal(double(x)); break; - case v128: assert(false && "v128 not implemented yet"); - case none: - case unreachable: WASM_UNREACHABLE(); - } - WASM_UNREACHABLE(); -} - -inline Literal makeLiteralZero(Type type) { - 
return makeLiteralFromInt32(0, type); -} - inline Expression* makeFromInt32(int32_t x, Type type, Module& wasm) { auto* ret = wasm.allocator.alloc<Const>(); - ret->value = makeLiteralFromInt32(x, type); + ret->value = Literal::makeFromInt32(x, type); ret->type = type; return ret; } diff --git a/src/ir/local-graph.h b/src/ir/local-graph.h index 84be2a4c2..725be0536 100644 --- a/src/ir/local-graph.h +++ b/src/ir/local-graph.h @@ -20,7 +20,7 @@ namespace wasm { // -// Finds the connections between get_locals and set_locals, creating +// Finds the connections between local.gets and local.sets, creating // a graph of those ties. This is useful for "ssa-style" optimization, // in which you want to know exactly which sets are relevant for a // a get, so it is as if each get has just one set, logically speaking @@ -33,7 +33,7 @@ struct LocalGraph { // the constructor computes getSetses, the sets affecting each get LocalGraph(Function* func); - // the set_locals relevant for an index or a get. + // the local.sets relevant for an index or a get. typedef std::set<SetLocal*> Sets; typedef std::map<GetLocal*, Sets> GetSetses; diff --git a/src/ir/properties.h b/src/ir/properties.h index 6848e9481..4afe3e909 100644 --- a/src/ir/properties.h +++ b/src/ir/properties.h @@ -146,7 +146,7 @@ inline Index getZeroExtBits(Expression* curr) { return Bits::getMaskedBits(curr->cast<Binary>()->right->cast<Const>()->value.geti32()); } -// Returns a falling-through value, that is, it looks through a tee_local +// Returns a falling-through value, that is, it looks through a local.tee // and other operations that receive a value and let it flow through them. 
inline Expression* getFallthrough(Expression* curr) { // If the current node is unreachable, there is no value diff --git a/src/ir/utils.h b/src/ir/utils.h index a4082b6bc..afb63b01c 100644 --- a/src/ir/utils.h +++ b/src/ir/utils.h @@ -129,6 +129,11 @@ struct ReFinalize : public WalkerPass<PostWalker<ReFinalize, OverriddenVisitor<R void visitAtomicCmpxchg(AtomicCmpxchg* curr); void visitAtomicWait(AtomicWait* curr); void visitAtomicWake(AtomicWake* curr); + void visitSIMDExtract(SIMDExtract* curr); + void visitSIMDReplace(SIMDReplace* curr); + void visitSIMDShuffle(SIMDShuffle* curr); + void visitSIMDBitselect(SIMDBitselect* curr); + void visitSIMDShift(SIMDShift* curr); void visitConst(Const* curr); void visitUnary(Unary* curr); void visitBinary(Binary* curr); @@ -176,6 +181,11 @@ struct ReFinalizeNode : public OverriddenVisitor<ReFinalizeNode> { void visitAtomicCmpxchg(AtomicCmpxchg* curr) { curr->finalize(); } void visitAtomicWait(AtomicWait* curr) { curr->finalize(); } void visitAtomicWake(AtomicWake* curr) { curr->finalize(); } + void visitSIMDExtract(SIMDExtract* curr) { curr->finalize(); } + void visitSIMDReplace(SIMDReplace* curr) { curr->finalize(); } + void visitSIMDShuffle(SIMDShuffle* curr) { curr->finalize(); } + void visitSIMDBitselect(SIMDBitselect* curr) { curr->finalize(); } + void visitSIMDShift(SIMDShift* curr) { curr->finalize(); } void visitConst(Const* curr) { curr->finalize(); } void visitUnary(Unary* curr) { curr->finalize(); } void visitBinary(Binary* curr) { curr->finalize(); } diff --git a/src/js/binaryen.js-post.js b/src/js/binaryen.js-post.js index b63427935..aa2e613ce 100644 --- a/src/js/binaryen.js-post.js +++ b/src/js/binaryen.js-post.js @@ -22,6 +22,14 @@ function i32sToStack(i32s) { return ret; } +function i8sToStack(i8s) { + var ret = stackAlloc(i8s.length); + for (var i = 0; i < i8s.length; i++) { + HEAP8[ret + i] = i8s[i]; + } + return ret; +} + // Types Module['none'] = Module['_BinaryenTypeNone'](); Module['i32'] = 
Module['_BinaryenTypeInt32'](); @@ -60,6 +68,11 @@ Module['AtomicCmpxchgId'] = Module['_BinaryenAtomicCmpxchgId'](); Module['AtomicRMWId'] = Module['_BinaryenAtomicRMWId'](); Module['AtomicWaitId'] = Module['_BinaryenAtomicWaitId'](); Module['AtomicWakeId'] = Module['_BinaryenAtomicWakeId'](); +Module['SIMDExtractId'] = Module['_BinaryenSIMDExtractId'](); +Module['SIMDReplaceId'] = Module['_BinaryenSIMDReplaceId'](); +Module['SIMDShuffleId'] = Module['_BinaryenSIMDShuffleId'](); +Module['SIMDBitselectId'] = Module['_BinaryenSIMDBitselectId'](); +Module['SIMDShiftId'] = Module['_BinaryenSIMDShiftId'](); // External kinds Module['ExternalFunction'] = Module['_BinaryenExternalFunction'](); @@ -212,6 +225,141 @@ Module['AtomicRMWAnd'] = Module['_BinaryenAtomicRMWAnd'](); Module['AtomicRMWOr'] = Module['_BinaryenAtomicRMWOr'](); Module['AtomicRMWXor'] = Module['_BinaryenAtomicRMWXor'](); Module['AtomicRMWXchg'] = Module['_BinaryenAtomicRMWXchg'](); +Module['SplatVecI8x16'] = Module['_BinaryenSplatVecI8x16'](); +Module['ExtractLaneSVecI8x16'] = Module['_BinaryenExtractLaneSVecI8x16'](); +Module['ExtractLaneUVecI8x16'] = Module['_BinaryenExtractLaneUVecI8x16'](); +Module['ReplaceLaneVecI8x16'] = Module['_BinaryenReplaceLaneVecI8x16'](); +Module['SplatVecI16x8'] = Module['_BinaryenSplatVecI16x8'](); +Module['ExtractLaneSVecI16x8'] = Module['_BinaryenExtractLaneSVecI16x8'](); +Module['ExtractLaneUVecI16x8'] = Module['_BinaryenExtractLaneUVecI16x8'](); +Module['ReplaceLaneVecI16x8'] = Module['_BinaryenReplaceLaneVecI16x8'](); +Module['SplatVecI32x4'] = Module['_BinaryenSplatVecI32x4'](); +Module['ExtractLaneVecI32x4'] = Module['_BinaryenExtractLaneVecI32x4'](); +Module['ReplaceLaneVecI32x4'] = Module['_BinaryenReplaceLaneVecI32x4'](); +Module['SplatVecI64x2'] = Module['_BinaryenSplatVecI64x2'](); +Module['ExtractLaneVecI64x2'] = Module['_BinaryenExtractLaneVecI64x2'](); +Module['ReplaceLaneVecI64x2'] = Module['_BinaryenReplaceLaneVecI64x2'](); +Module['SplatVecF32x4'] = 
Module['_BinaryenSplatVecF32x4'](); +Module['ExtractLaneVecF32x4'] = Module['_BinaryenExtractLaneVecF32x4'](); +Module['ReplaceLaneVecF32x4'] = Module['_BinaryenReplaceLaneVecF32x4'](); +Module['SplatVecF64x2'] = Module['_BinaryenSplatVecF64x2'](); +Module['ExtractLaneVecF64x2'] = Module['_BinaryenExtractLaneVecF64x2'](); +Module['ReplaceLaneVecF64x2'] = Module['_BinaryenReplaceLaneVecF64x2'](); +Module['EqVecI8x16'] = Module['_BinaryenEqVecI8x16'](); +Module['NeVecI8x16'] = Module['_BinaryenNeVecI8x16'](); +Module['LtSVecI8x16'] = Module['_BinaryenLtSVecI8x16'](); +Module['LtUVecI8x16'] = Module['_BinaryenLtUVecI8x16'](); +Module['GtSVecI8x16'] = Module['_BinaryenGtSVecI8x16'](); +Module['GtUVecI8x16'] = Module['_BinaryenGtUVecI8x16'](); +Module['LeSVecI8x16'] = Module['_BinaryenLeSVecI8x16'](); +Module['LeUVecI8x16'] = Module['_BinaryenLeUVecI8x16'](); +Module['GeSVecI8x16'] = Module['_BinaryenGeSVecI8x16'](); +Module['GeUVecI8x16'] = Module['_BinaryenGeUVecI8x16'](); +Module['EqVecI16x8'] = Module['_BinaryenEqVecI16x8'](); +Module['NeVecI16x8'] = Module['_BinaryenNeVecI16x8'](); +Module['LtSVecI16x8'] = Module['_BinaryenLtSVecI16x8'](); +Module['LtUVecI16x8'] = Module['_BinaryenLtUVecI16x8'](); +Module['GtSVecI16x8'] = Module['_BinaryenGtSVecI16x8'](); +Module['GtUVecI16x8'] = Module['_BinaryenGtUVecI16x8'](); +Module['LeSVecI16x8'] = Module['_BinaryenLeSVecI16x8'](); +Module['LeUVecI16x8'] = Module['_BinaryenLeUVecI16x8'](); +Module['GeSVecI16x8'] = Module['_BinaryenGeSVecI16x8'](); +Module['GeUVecI16x8'] = Module['_BinaryenGeUVecI16x8'](); +Module['EqVecI32x4'] = Module['_BinaryenEqVecI32x4'](); +Module['NeVecI32x4'] = Module['_BinaryenNeVecI32x4'](); +Module['LtSVecI32x4'] = Module['_BinaryenLtSVecI32x4'](); +Module['LtUVecI32x4'] = Module['_BinaryenLtUVecI32x4'](); +Module['GtSVecI32x4'] = Module['_BinaryenGtSVecI32x4'](); +Module['GtUVecI32x4'] = Module['_BinaryenGtUVecI32x4'](); +Module['LeSVecI32x4'] = Module['_BinaryenLeSVecI32x4'](); 
+Module['LeUVecI32x4'] = Module['_BinaryenLeUVecI32x4'](); +Module['GeSVecI32x4'] = Module['_BinaryenGeSVecI32x4'](); +Module['GeUVecI32x4'] = Module['_BinaryenGeUVecI32x4'](); +Module['EqVecF32x4'] = Module['_BinaryenEqVecF32x4'](); +Module['NeVecF32x4'] = Module['_BinaryenNeVecF32x4'](); +Module['LtVecF32x4'] = Module['_BinaryenLtVecF32x4'](); +Module['GtVecF32x4'] = Module['_BinaryenGtVecF32x4'](); +Module['LeVecF32x4'] = Module['_BinaryenLeVecF32x4'](); +Module['GeVecF32x4'] = Module['_BinaryenGeVecF32x4'](); +Module['EqVecF64x2'] = Module['_BinaryenGeVecF32x4'](); +Module['NeVecF64x2'] = Module['_BinaryenNeVecF64x2'](); +Module['LtVecF64x2'] = Module['_BinaryenLtVecF64x2'](); +Module['GtVecF64x2'] = Module['_BinaryenGtVecF64x2'](); +Module['LeVecF64x2'] = Module['_BinaryenLeVecF64x2'](); +Module['GeVecF64x2'] = Module['_BinaryenGeVecF64x2'](); +Module['NotVec128'] = Module['_BinaryenNotVec128'](); +Module['AndVec128'] = Module['_BinaryenAndVec128'](); +Module['OrVec128'] = Module['_BinaryenOrVec128'](); +Module['XorVec128'] = Module['_BinaryenXorVec128'](); +Module['NegVecI8x16'] = Module['_BinaryenNegVecI8x16'](); +Module['AnyTrueVecI8x16'] = Module['_BinaryenAnyTrueVecI8x16'](); +Module['AllTrueVecI8x16'] = Module['_BinaryenAllTrueVecI8x16'](); +Module['ShlVecI8x16'] = Module['_BinaryenShlVecI8x16'](); +Module['ShrSVecI8x16'] = Module['_BinaryenShrSVecI8x16'](); +Module['ShrUVecI8x16'] = Module['_BinaryenShrUVecI8x16'](); +Module['AddVecI8x16'] = Module['_BinaryenAddVecI8x16'](); +Module['AddSatSVecI8x16'] = Module['_BinaryenAddSatSVecI8x16'](); +Module['AddSatUVecI8x16'] = Module['_BinaryenAddSatUVecI8x16'](); +Module['SubVecI8x16'] = Module['_BinaryenSubVecI8x16'](); +Module['SubSatSVecI8x16'] = Module['_BinaryenSubSatSVecI8x16'](); +Module['SubSatUVecI8x16'] = Module['_BinaryenSubSatUVecI8x16'](); +Module['MulVecI8x16'] = Module['_BinaryenMulVecI8x16'](); +Module['NegVecI16x8'] = Module['_BinaryenNegVecI16x8'](); +Module['AnyTrueVecI16x8'] = 
Module['_BinaryenAnyTrueVecI16x8'](); +Module['AllTrueVecI16x8'] = Module['_BinaryenAllTrueVecI16x8'](); +Module['ShlVecI16x8'] = Module['_BinaryenShlVecI16x8'](); +Module['ShrSVecI16x8'] = Module['_BinaryenShrSVecI16x8'](); +Module['ShrUVecI16x8'] = Module['_BinaryenShrUVecI16x8'](); +Module['AddVecI16x8'] = Module['_BinaryenAddVecI16x8'](); +Module['AddSatSVecI16x8'] = Module['_BinaryenAddSatSVecI16x8'](); +Module['AddSatUVecI16x8'] = Module['_BinaryenAddSatUVecI16x8'](); +Module['SubVecI16x8'] = Module['_BinaryenSubVecI16x8'](); +Module['SubSatSVecI16x8'] = Module['_BinaryenSubSatSVecI16x8'](); +Module['SubSatUVecI16x8'] = Module['_BinaryenSubSatUVecI16x8'](); +Module['MulVecI16x8'] = Module['_BinaryenMulVecI16x8'](); +Module['NegVecI32x4'] = Module['_BinaryenNegVecI32x4'](); +Module['AnyTrueVecI32x4'] = Module['_BinaryenAnyTrueVecI32x4'](); +Module['AllTrueVecI32x4'] = Module['_BinaryenAllTrueVecI32x4'](); +Module['ShlVecI32x4'] = Module['_BinaryenShlVecI32x4'](); +Module['ShrSVecI32x4'] = Module['_BinaryenShrSVecI32x4'](); +Module['ShrUVecI32x4'] = Module['_BinaryenShrUVecI32x4'](); +Module['AddVecI32x4'] = Module['_BinaryenAddVecI32x4'](); +Module['SubVecI32x4'] = Module['_BinaryenSubVecI32x4'](); +Module['MulVecI32x4'] = Module['_BinaryenMulVecI32x4'](); +Module['NegVecI64x2'] = Module['_BinaryenNegVecI64x2'](); +Module['AnyTrueVecI64x2'] = Module['_BinaryenAnyTrueVecI64x2'](); +Module['AllTrueVecI64x2'] = Module['_BinaryenAllTrueVecI64x2'](); +Module['ShlVecI64x2'] = Module['_BinaryenShlVecI64x2'](); +Module['ShrSVecI64x2'] = Module['_BinaryenShrSVecI64x2'](); +Module['ShrUVecI64x2'] = Module['_BinaryenShrUVecI64x2'](); +Module['AddVecI64x2'] = Module['_BinaryenAddVecI64x2'](); +Module['SubVecI64x2'] = Module['_BinaryenSubVecI64x2'](); +Module['AbsVecF32x4'] = Module['_BinaryenAbsVecF32x4'](); +Module['NegVecF32x4'] = Module['_BinaryenNegVecF32x4'](); +Module['SqrtVecF32x4'] = Module['_BinaryenSqrtVecF32x4'](); +Module['AddVecF32x4'] = 
Module['_BinaryenAddVecF32x4'](); +Module['SubVecF32x4'] = Module['_BinaryenSubVecF32x4'](); +Module['MulVecF32x4'] = Module['_BinaryenMulVecF32x4'](); +Module['DivVecF32x4'] = Module['_BinaryenDivVecF32x4'](); +Module['MinVecF32x4'] = Module['_BinaryenMinVecF32x4'](); +Module['MaxVecF32x4'] = Module['_BinaryenMaxVecF32x4'](); +Module['AbsVecF64x2'] = Module['_BinaryenAbsVecF64x2'](); +Module['NegVecF64x2'] = Module['_BinaryenNegVecF64x2'](); +Module['SqrtVecF64x2'] = Module['_BinaryenSqrtVecF64x2'](); +Module['AddVecF64x2'] = Module['_BinaryenAddVecF64x2'](); +Module['SubVecF64x2'] = Module['_BinaryenSubVecF64x2'](); +Module['MulVecF64x2'] = Module['_BinaryenMulVecF64x2'](); +Module['DivVecF64x2'] = Module['_BinaryenDivVecF64x2'](); +Module['MinVecF64x2'] = Module['_BinaryenMinVecF64x2'](); +Module['MaxVecF64x2'] = Module['_BinaryenMaxVecF64x2'](); +Module['TruncSatSVecF32x4ToVecI32x4'] = Module['_BinaryenTruncSatSVecF32x4ToVecI32x4'](); +Module['TruncSatUVecF32x4ToVecI32x4'] = Module['_BinaryenTruncSatUVecF32x4ToVecI32x4'](); +Module['TruncSatSVecF64x2ToVecI64x2'] = Module['_BinaryenTruncSatSVecF64x2ToVecI64x2'](); +Module['TruncSatUVecF64x2ToVecI64x2'] = Module['_BinaryenTruncSatUVecF64x2ToVecI64x2'](); +Module['ConvertSVecI32x4ToVecF32x4'] = Module['_BinaryenConvertSVecI32x4ToVecF32x4'](); +Module['ConvertUVecI32x4ToVecF32x4'] = Module['_BinaryenConvertUVecI32x4ToVecF32x4'](); +Module['ConvertSVecI64x2ToVecF64x2'] = Module['_BinaryenConvertSVecI64x2ToVecF64x2'](); +Module['ConvertUVecI64x2ToVecF64x2'] = Module['_BinaryenConvertUVecI64x2ToVecF64x2'](); // 'Module' interface Module['Module'] = function(module) { @@ -276,21 +424,35 @@ function wrapModule(module, self) { return Module['_BinaryenCallIndirect'](module, target, i32sToStack(operands), operands.length, strToStack(type)); }); }; - self['getLocal'] = self['get_local'] = function(index, type) { - return Module['_BinaryenGetLocal'](module, index, type); - }; - self['setLocal'] = self['set_local'] = 
self['set_local'] = function(index, value) { - return Module['_BinaryenSetLocal'](module, index, value); - }; - self['teeLocal'] = self['tee_local'] = function(index, value) { - return Module['_BinaryenTeeLocal'](module, index, value); - }; - self['getGlobal'] = self['get_global'] = function(name, type) { - return Module['_BinaryenGetGlobal'](module, strToStack(name), type); + + self['local'] = { + 'get': function(index, type) { + return Module['_BinaryenGetLocal'](module, index, type); + }, + 'set': function(index, value) { + return Module['_BinaryenSetLocal'](module, index, value); + }, + 'tee': function(index, value) { + return Module['_BinaryenTeeLocal'](module, index, value); + } } - self['setGlobal'] = self['set_global'] = function(name, value) { - return Module['_BinaryenSetGlobal'](module, strToStack(name), value); + + self['getLocal'] = self['local']['get']; + self['setLocal'] = self['local']['set']; + self['teeLocal'] = self['local']['tee']; + + self['global'] = { + 'get': function(name, type) { + return Module['_BinaryenGetGlobal'](module, strToStack(name), type); + }, + 'set': function(name, value) { + return Module['_BinaryenSetGlobal'](module, strToStack(name), value); + } } + + self['getGlobal'] = self['global']['get']; + self['setGlobal'] = self['global']['set']; + self['currentMemory'] = self['current_memory'] = function() { return Module['_BinaryenHost'](module, Module['CurrentMemory']); } @@ -1055,6 +1217,455 @@ function wrapModule(module, self) { }, }; + self['v128'] = { + 'load': function(offset, align, ptr) { + return Module['_BinaryenLoad'](module, 16, false, offset, align, Module['v128'], ptr); + }, + 'store': function(offset, align, ptr, value) { + return Module['_BinaryenStore'](module, 16, offset, align, ptr, value, Module['v128']); + }, + 'const': function(i8s) { + return preserveStack(function() { + Module['_BinaryenLiteralVec128'](temp, i8sToStack(i8s)); + return Module['_BinaryenConst'](module, temp); + }); + }, + 'not': 
function(value) { + return Module['_BinaryenUnary'](module, Module['NotVec128'], value); + }, + 'and': function(left, right) { + return Module['_BinaryenBinary'](module, Module['AndVec128'], left, right); + }, + 'or': function(left, right) { + return Module['_BinaryenBinary'](module, Module['OrVec128'], left, right); + }, + 'xor': function(left, right) { + return Module['_BinaryenBinary'](module, Module['XorVec128'], left, right); + }, + 'bitselect': function(left, right, cond) { + return Module['_BinaryenSIMDBitselect'](module, left, right, cond); + } + }; + + self['v8x16'] = { + 'shuffle': function(left, right, mask) { + return preserveStack(function() { + return Module['_BinaryenSIMDShuffle'](module, left, right, i8sToStack(mask)); + }); + }, + }; + + self['i8x16'] = { + 'splat': function(value) { + return Module['_BinaryenUnary'](module, Module['SplatVecI8x16'], value); + }, + 'extract_lane_s': function(vec, index) { + return Module['_BinaryenSIMDExtract'](module, Module['ExtractLaneSVecI8x16'], vec, index); + }, + 'extract_lane_u': function(vec, index) { + return Module['_BinaryenSIMDExtract'](module, Module['ExtractLaneUVecI8x16'], vec, index); + }, + 'replace_lane': function(vec, index, value) { + return Module['_BinaryenSIMDReplace'](module, Module['ReplaceLaneVecI8x16'], vec, index, value); + }, + 'eq': function(left, right) { + return Module['_BinaryenBinary'](module, Module['EqVecI8x16'], left, right); + }, + 'ne': function(left, right) { + return Module['_BinaryenBinary'](module, Module['NeVecI8x16'], left, right); + }, + 'lt_s': function(left, right) { + return Module['_BinaryenBinary'](module, Module['LtSVecI8x16'], left, right); + }, + 'lt_u': function(left, right) { + return Module['_BinaryenBinary'](module, Module['LtUVecI8x16'], left, right); + }, + 'gt_s': function(left, right) { + return Module['_BinaryenBinary'](module, Module['GtSVecI8x16'], left, right); + }, + 'gt_u': function(left, right) { + return Module['_BinaryenBinary'](module, Module['GtUVecI8x16'], left, right); + }, + 
'le_s': function(left, right) { + return Module['_BinaryenBinary'](module, Module['LeSVecI8x16'], left, right); + }, + 'le_u': function(left, right) { + return Module['_BinaryenBinary'](module, Module['LeUVecI8x16'], left, right); + }, + 'ge_s': function(left, right) { + return Module['_BinaryenBinary'](module, Module['GeSVecI8x16'], left, right); + }, + 'ge_u': function(left, right) { + return Module['_BinaryenBinary'](module, Module['GeUVecI8x16'], left, right); + }, + 'neg': function(value) { + return Module['_BinaryenUnary'](module, Module['NegVecI8x16'], value); + }, + 'any_true': function(value) { + return Module['_BinaryenUnary'](module, Module['AnyTrueVecI8x16'], value); + }, + 'all_true': function(value) { + return Module['_BinaryenUnary'](module, Module['AllTrueVecI8x16'], value); + }, + 'shl': function(vec, shift) { + return Module['_BinaryenSIMDShift'](module, Module['ShlVecI8x16'], vec, shift); + }, + 'shr_s': function(vec, shift) { + return Module['_BinaryenSIMDShift'](module, Module['ShrSVecI8x16'], vec, shift); + }, + 'shr_u': function(vec, shift) { + return Module['_BinaryenSIMDShift'](module, Module['ShrUVecI8x16'], vec, shift); + }, + 'add': function(left, right) { + return Module['_BinaryenBinary'](module, Module['AddVecI8x16'], left, right); + }, + 'add_saturate_s': function(left, right) { + return Module['_BinaryenBinary'](module, Module['AddSatSVecI8x16'], left, right); + }, + 'add_saturate_u': function(left, right) { + return Module['_BinaryenBinary'](module, Module['AddSatUVecI8x16'], left, right); + }, + 'sub': function(left, right) { + return Module['_BinaryenBinary'](module, Module['SubVecI8x16'], left, right); + }, + 'sub_saturate_s': function(left, right) { + return Module['_BinaryenBinary'](module, Module['SubSatSVecI8x16'], left, right); + }, + 'sub_saturate_u': function(left, right) { + return Module['_BinaryenBinary'](module, Module['SubSatUVecI8x16'], left, right); + }, + 'mul': function(left, right) { + return 
Module['_BinaryenBinary'](module, Module['MulVecI8x16'], left, right); + }, + }; + + self['i16x8'] = { + 'splat': function(value) { + return Module['_BinaryenUnary'](module, Module['SplatVecI16x8'], value); + }, + 'extract_lane_s': function(vec, index) { + return Module['_BinaryenSIMDExtract'](module, Module['ExtractLaneSVecI16x8'], vec, index); + }, + 'extract_lane_u': function(vec, index) { + return Module['_BinaryenSIMDExtract'](module, Module['ExtractLaneUVecI16x8'], vec, index); + }, + 'replace_lane': function(vec, index, value) { + return Module['_BinaryenSIMDReplace'](module, Module['ReplaceLaneVecI16x8'], vec, index, value); + }, + 'eq': function(left, right) { + return Module['_BinaryenBinary'](module, Module['EqVecI16x8'], left, right); + }, + 'ne': function(left, right) { + return Module['_BinaryenBinary'](module, Module['NeVecI16x8'], left, right); + }, + 'lt_s': function(left, right) { + return Module['_BinaryenBinary'](module, Module['LtSVecI16x8'], left, right); + }, + 'lt_u': function(left, right) { + return Module['_BinaryenBinary'](module, Module['LtUVecI16x8'], left, right); + }, + 'gt_s': function(left, right) { + return Module['_BinaryenBinary'](module, Module['GtSVecI16x8'], left, right); + }, + 'gt_u': function(left, right) { + return Module['_BinaryenBinary'](module, Module['GtUVecI16x8'], left, right); + }, + 'le_s': function(left, right) { + return Module['_BinaryenBinary'](module, Module['LeSVecI16x8'], left, right); + }, + 'le_u': function(left, right) { + return Module['_BinaryenBinary'](module, Module['LeUVecI16x8'], left, right); + }, + 'ge_s': function(left, right) { + return Module['_BinaryenBinary'](module, Module['GeSVecI16x8'], left, right); + }, + 'ge_u': function(left, right) { + return Module['_BinaryenBinary'](module, Module['GeUVecI16x8'], left, right); + }, + 'neg': function(value) { + return Module['_BinaryenUnary'](module, Module['NegVecI16x8'], value); + }, + 'any_true': function(value) { + return 
Module['_BinaryenUnary'](module, Module['AnyTrueVecI16x8'], value); + }, + 'all_true': function(value) { + return Module['_BinaryenUnary'](module, Module['AllTrueVecI16x8'], value); + }, + 'shl': function(vec, shift) { + return Module['_BinaryenSIMDShift'](module, Module['ShlVecI16x8'], vec, shift); + }, + 'shr_s': function(vec, shift) { + return Module['_BinaryenSIMDShift'](module, Module['ShrSVecI16x8'], vec, shift); + }, + 'shr_u': function(vec, shift) { + return Module['_BinaryenSIMDShift'](module, Module['ShrUVecI16x8'], vec, shift); + }, + 'add': function(left, right) { + return Module['_BinaryenBinary'](module, Module['AddVecI16x8'], left, right); + }, + 'add_saturate_s': function(left, right) { + return Module['_BinaryenBinary'](module, Module['AddSatSVecI16x8'], left, right); + }, + 'add_saturate_u': function(left, right) { + return Module['_BinaryenBinary'](module, Module['AddSatUVecI16x8'], left, right); + }, + 'sub': function(left, right) { + return Module['_BinaryenBinary'](module, Module['SubVecI16x8'], left, right); + }, + 'sub_saturate_s': function(left, right) { + return Module['_BinaryenBinary'](module, Module['SubSatSVecI16x8'], left, right); + }, + 'sub_saturate_u': function(left, right) { + return Module['_BinaryenBinary'](module, Module['SubSatUVecI16x8'], left, right); + }, + 'mul': function(left, right) { + return Module['_BinaryenBinary'](module, Module['MulVecI16x8'], left, right); + }, + }; + + self['i32x4'] = { + 'splat': function(value) { + return Module['_BinaryenUnary'](module, Module['SplatVecI32x4'], value); + }, + 'extract_lane': function(vec, index) { + return Module['_BinaryenSIMDExtract'](module, Module['ExtractLaneVecI32x4'], vec, index); + }, + 'replace_lane': function(vec, index, value) { + return Module['_BinaryenSIMDReplace'](module, Module['ReplaceLaneVecI32x4'], vec, index, value); + }, + 'eq': function(left, right) { + return Module['_BinaryenBinary'](module, Module['EqVecI32x4'], left, right); + }, + 'ne': 
function(left, right) { + return Module['_BinaryenBinary'](module, Module['NeVecI32x4'], left, right); + }, + 'lt_s': function(left, right) { + return Module['_BinaryenBinary'](module, Module['LtSVecI32x4'], left, right); + }, + 'lt_u': function(left, right) { + return Module['_BinaryenBinary'](module, Module['LtUVecI32x4'], left, right); + }, + 'gt_s': function(left, right) { + return Module['_BinaryenBinary'](module, Module['GtSVecI32x4'], left, right); + }, + 'gt_u': function(left, right) { + return Module['_BinaryenBinary'](module, Module['GtUVecI32x4'], left, right); + }, + 'le_s': function(left, right) { + return Module['_BinaryenBinary'](module, Module['LeSVecI32x4'], left, right); + }, + 'le_u': function(left, right) { + return Module['_BinaryenBinary'](module, Module['LeUVecI32x4'], left, right); + }, + 'ge_s': function(left, right) { + return Module['_BinaryenBinary'](module, Module['GeSVecI32x4'], left, right); + }, + 'ge_u': function(left, right) { + return Module['_BinaryenBinary'](module, Module['GeUVecI32x4'], left, right); + }, + 'neg': function(value) { + return Module['_BinaryenUnary'](module, Module['NegVecI32x4'], value); + }, + 'any_true': function(value) { + return Module['_BinaryenUnary'](module, Module['AnyTrueVecI32x4'], value); + }, + 'all_true': function(value) { + return Module['_BinaryenUnary'](module, Module['AllTrueVecI32x4'], value); + }, + 'shl': function(vec, shift) { + return Module['_BinaryenSIMDShift'](module, Module['ShlVecI32x4'], vec, shift); + }, + 'shr_s': function(vec, shift) { + return Module['_BinaryenSIMDShift'](module, Module['ShrSVecI32x4'], vec, shift); + }, + 'shr_u': function(vec, shift) { + return Module['_BinaryenSIMDShift'](module, Module['ShrUVecI32x4'], vec, shift); + }, + 'add': function(left, right) { + return Module['_BinaryenBinary'](module, Module['AddVecI32x4'], left, right); + }, + 'sub': function(left, right) { + return Module['_BinaryenBinary'](module, Module['SubVecI32x4'], left, right); + }, + 
'mul': function(left, right) { + return Module['_BinaryenBinary'](module, Module['MulVecI32x4'], left, right); + }, + 'trunc_s/f32x4:sat': function(value) { + return Module['_BinaryenUnary'](module, Module['TruncSatSVecF32x4ToVecI32x4'], value); + }, + 'trunc_u/f32x4:sat': function(value) { + return Module['_BinaryenUnary'](module, Module['TruncSatUVecF32x4ToVecI32x4'], value); + }, + }; + + self['i64x2'] = { + 'splat': function(value) { + return Module['_BinaryenUnary'](module, Module['SplatVecI64x2'], value); + }, + 'extract_lane': function(vec, index) { + return Module['_BinaryenSIMDExtract'](module, Module['ExtractLaneVecI64x2'], vec, index); + }, + 'replace_lane': function(vec, index, value) { + return Module['_BinaryenSIMDReplace'](module, Module['ReplaceLaneVecI64x2'], vec, index, value); + }, + 'neg': function(value) { + return Module['_BinaryenUnary'](module, Module['NegVecI64x2'], value); + }, + 'any_true': function(value) { + return Module['_BinaryenUnary'](module, Module['AnyTrueVecI64x2'], value); + }, + 'all_true': function(value) { + return Module['_BinaryenUnary'](module, Module['AllTrueVecI64x2'], value); + }, + 'shl': function(vec, shift) { + return Module['_BinaryenSIMDShift'](module, Module['ShlVecI64x2'], vec, shift); + }, + 'shr_s': function(vec, shift) { + return Module['_BinaryenSIMDShift'](module, Module['ShrSVecI64x2'], vec, shift); + }, + 'shr_u': function(vec, shift) { + return Module['_BinaryenSIMDShift'](module, Module['ShrUVecI64x2'], vec, shift); + }, + 'add': function(left, right) { + return Module['_BinaryenBinary'](module, Module['AddVecI64x2'], left, right); + }, + 'sub': function(left, right) { + return Module['_BinaryenBinary'](module, Module['SubVecI64x2'], left, right); + }, + 'trunc_s/f64x2:sat': function(value) { + return Module['_BinaryenUnary'](module, Module['TruncSatSVecF64x2ToVecI64x2'], value); + }, + 'trunc_u/f64x2:sat': function(value) { + return Module['_BinaryenUnary'](module, 
Module['TruncSatUVecF64x2ToVecI64x2'], value); + }, + }; + + self['f32x4'] = { + 'splat': function(value) { + return Module['_BinaryenUnary'](module, Module['SplatVecF32x4'], value); + }, + 'extract_lane': function(vec, index) { + return Module['_BinaryenSIMDExtract'](module, Module['ExtractLaneVecF32x4'], vec, index); + }, + 'replace_lane': function(vec, index, value) { + return Module['_BinaryenSIMDReplace'](module, Module['ReplaceLaneVecF32x4'], vec, index, value); + }, + 'eq': function(left, right) { + return Module['_BinaryenBinary'](module, Module['EqVecF32x4'], left, right); + }, + 'ne': function(left, right) { + return Module['_BinaryenBinary'](module, Module['NeVecF32x4'], left, right); + }, + 'lt': function(left, right) { + return Module['_BinaryenBinary'](module, Module['LtVecF32x4'], left, right); + }, + 'gt': function(left, right) { + return Module['_BinaryenBinary'](module, Module['GtVecF32x4'], left, right); + }, + 'le': function(left, right) { + return Module['_BinaryenBinary'](module, Module['LeVecF32x4'], left, right); + }, + 'ge': function(left, right) { + return Module['_BinaryenBinary'](module, Module['GeVecF32x4'], left, right); + }, + 'abs': function(value) { + return Module['_BinaryenUnary'](module, Module['AbsVecF32x4'], value); + }, + 'neg': function(value) { + return Module['_BinaryenUnary'](module, Module['NegVecF32x4'], value); + }, + 'sqrt': function(value) { + return Module['_BinaryenUnary'](module, Module['SqrtVecF32x4'], value); + }, + 'add': function(left, right) { + return Module['_BinaryenBinary'](module, Module['AddVecF32x4'], left, right); + }, + 'sub': function(left, right) { + return Module['_BinaryenBinary'](module, Module['SubVecF32x4'], left, right); + }, + 'mul': function(left, right) { + return Module['_BinaryenBinary'](module, Module['MulVecF32x4'], left, right); + }, + 'div': function(left, right) { + return Module['_BinaryenBinary'](module, Module['DivVecF32x4'], left, right); + }, + 'min': function(left, right) { + 
return Module['_BinaryenBinary'](module, Module['MinVecF32x4'], left, right); + }, + 'max': function(left, right) { + return Module['_BinaryenBinary'](module, Module['MaxVecF32x4'], left, right); + }, + 'convert_s/i32x4': function(value) { + return Module['_BinaryenUnary'](module, Module['ConvertSVecI32x4ToVecF32x4'], value); + }, + 'convert_u/i32x4': function(value) { + return Module['_BinaryenUnary'](module, Module['ConvertUVecI32x4ToVecF32x4'], value); + }, + }; + + self['f64x2'] = { + 'splat': function(value) { + return Module['_BinaryenUnary'](module, Module['SplatVecF64x2'], value); + }, + 'extract_lane': function(vec, index) { + return Module['_BinaryenSIMDExtract'](module, Module['ExtractLaneVecF64x2'], vec, index); + }, + 'replace_lane': function(vec, index, value) { + return Module['_BinaryenSIMDReplace'](module, Module['ReplaceLaneVecF64x2'], vec, index, value); + }, + 'eq': function(left, right) { + return Module['_BinaryenBinary'](module, Module['EqVecF64x2'], left, right); + }, + 'ne': function(left, right) { + return Module['_BinaryenBinary'](module, Module['NeVecF64x2'], left, right); + }, + 'lt': function(left, right) { + return Module['_BinaryenBinary'](module, Module['LtVecF64x2'], left, right); + }, + 'gt': function(left, right) { + return Module['_BinaryenBinary'](module, Module['GtVecF64x2'], left, right); + }, + 'le': function(left, right) { + return Module['_BinaryenBinary'](module, Module['LeVecF64x2'], left, right); + }, + 'ge': function(left, right) { + return Module['_BinaryenBinary'](module, Module['GeVecF64x2'], left, right); + }, + 'abs': function(value) { + return Module['_BinaryenUnary'](module, Module['AbsVecF64x2'], value); + }, + 'neg': function(value) { + return Module['_BinaryenUnary'](module, Module['NegVecF64x2'], value); + }, + 'sqrt': function(value) { + return Module['_BinaryenUnary'](module, Module['SqrtVecF64x2'], value); + }, + 'add': function(left, right) { + return Module['_BinaryenBinary'](module, 
Module['AddVecF64x2'], left, right); + }, + 'sub': function(left, right) { + return Module['_BinaryenBinary'](module, Module['SubVecF64x2'], left, right); + }, + 'mul': function(left, right) { + return Module['_BinaryenBinary'](module, Module['MulVecF64x2'], left, right); + }, + 'div': function(left, right) { + return Module['_BinaryenBinary'](module, Module['DivVecF64x2'], left, right); + }, + 'min': function(left, right) { + return Module['_BinaryenBinary'](module, Module['MinVecF64x2'], left, right); + }, + 'max': function(left, right) { + return Module['_BinaryenBinary'](module, Module['MaxVecF64x2'], left, right); + }, + 'convert_s/i64x2': function(value) { + return Module['_BinaryenUnary'](module, Module['ConvertSVecI64x2ToVecF64x2'], value); + }, + 'convert_u/i64x2': function(value) { + return Module['_BinaryenUnary'](module, Module['ConvertUVecI64x2ToVecF64x2'], value); + }, + }; + self['select'] = function(condition, ifTrue, ifFalse) { return Module['_BinaryenSelect'](module, condition, ifTrue, ifFalse); }; @@ -1551,6 +2162,55 @@ Module['getExpressionInfo'] = function(expr) { 'ptr': Module['_BinaryenAtomicWakeGetPtr'](expr), 'wakeCount': Module['_BinaryenAtomicWakeGetWakeCount'](expr) }; + case Module['SIMDExtractId']: + return { + 'id': id, + 'type': type, + 'op': Module['_BinaryenSIMDExtractGetOp'](expr), + 'vec': Module['_BinaryenSIMDExtractGetVec'](expr), + 'index': Module['_BinaryenSIMDExtractGetIndex'](expr) + }; + case Module['SIMDReplaceId']: + return { + 'id': id, + 'type': type, + 'op': Module['_BinaryenSIMDReplaceGetOp'](expr), + 'vec': Module['_BinaryenSIMDReplaceGetVec'](expr), + 'index': Module['_BinaryenSIMDReplaceGetIndex'](expr), + 'value': Module['_BinaryenSIMDReplaceGetValue'](expr) + }; + case Module['SIMDShuffleId']: + return preserveStack(function() { + var ret = stackAlloc(16); + Module['_BinaryenSIMDShuffleGetMask'](expr, ret); + var mask = []; + for (var i = 0 ; i < 16; i++) { + mask[i] = HEAP8[ret + i]; + } + return { + 'id': id, 
+ 'type': type, + 'left': Module['_BinaryenSIMDShuffleGetLeft'](expr), + 'right': Module['_BinaryenSIMDShuffleGetRight'](expr), + 'mask': mask + }; + }); + case Module['SIMDBitselectId']: + return { + 'id': id, + 'type': type, + 'left': Module['_BinaryenSIMDBitselectGetLeft'](expr), + 'right': Module['_BinaryenSIMDBitselectGetRight'](expr), + 'cond': Module['_BinaryenSIMDBitselectGetCond'](expr) + }; + case Module['SIMDShiftId']: + return { + 'id': id, + 'type': type, + 'op': Module['_BinaryenSIMDShiftGetOp'](expr), + 'vec': Module['_BinaryenSIMDShiftGetVec'](expr), + 'shift': Module['_BinaryenSIMDShiftGetShift'](expr) + }; default: throw Error('unexpected id: ' + id); } diff --git a/src/literal.h b/src/literal.h index 4ed23b80d..0c9f9da96 100644 --- a/src/literal.h +++ b/src/literal.h @@ -18,6 +18,7 @@ #define wasm_literal_h #include <iostream> +#include <array> #include "support/hash.h" #include "support/utilities.h" @@ -36,21 +37,50 @@ private: union { int32_t i32; int64_t i64; + uint8_t v128[16]; }; public: - Literal() : type(Type::none), i64(0) {} - explicit Literal(Type type) : type(type), i64(0) {} + Literal() : type(Type::none), v128() {} + explicit Literal(Type type) : type(type), v128() {} explicit Literal(int32_t init) : type(Type::i32), i32(init) {} explicit Literal(uint32_t init) : type(Type::i32), i32(init) {} explicit Literal(int64_t init) : type(Type::i64), i64(init) {} explicit Literal(uint64_t init) : type(Type::i64), i64(init) {} explicit Literal(float init) : type(Type::f32), i32(bit_cast<int32_t>(init)) {} explicit Literal(double init) : type(Type::f64), i64(bit_cast<int64_t>(init)) {} + // v128 literal from bytes + explicit Literal(const uint8_t init[16]); + // v128 literal from lane value literals + explicit Literal(const std::array<Literal, 16>&); + explicit Literal(const std::array<Literal, 8>&); + explicit Literal(const std::array<Literal, 4>&); + explicit Literal(const std::array<Literal, 2>&); bool isConcrete() { return type != none; } 
bool isNull() { return type == none; } + inline static Literal makeFromInt32(int32_t x, Type type) { + switch (type) { + case Type::i32: return Literal(int32_t(x)); break; + case Type::i64: return Literal(int64_t(x)); break; + case Type::f32: return Literal(float(x)); break; + case Type::f64: return Literal(double(x)); break; + case Type::v128: return Literal( + std::array<Literal, 4>{{ + Literal(x), Literal(int32_t(0)), Literal(int32_t(0)), Literal(int32_t(0)) + }} + ); + case none: + case unreachable: WASM_UNREACHABLE(); + } + WASM_UNREACHABLE(); + } + + inline static Literal makeZero(Type type) { + return makeFromInt32(0, type); + } + Literal castToF32(); Literal castToF64(); Literal castToI32(); @@ -60,8 +90,12 @@ public: int64_t geti64() const { assert(type == Type::i64); return i64; } float getf32() const { assert(type == Type::f32); return bit_cast<float>(i32); } double getf64() const { assert(type == Type::f64); return bit_cast<double>(i64); } + std::array<uint8_t, 16> getv128() const; - int32_t* geti32Ptr() { assert(type == Type::i32); return &i32; } // careful! + // careful! + int32_t* geti32Ptr() { assert(type == Type::i32); return &i32; } + uint8_t* getv128Ptr() { assert(type == Type::v128); return v128; } + const uint8_t* getv128Ptr() const { assert(type == Type::v128); return v128; } int32_t reinterpreti32() const { assert(type == Type::f32); return i32; } int64_t reinterpreti64() const { assert(type == Type::f64); return i64; } @@ -70,7 +104,7 @@ public: int64_t getInteger() const; double getFloat() const; - int64_t getBits() const; + void getBits(uint8_t (&buf)[16]) const; // Equality checks for the type and the bits, so a nan float would // be compared bitwise (which means that a Literal containing a nan // would be equal to itself, if the bits are equal). 
@@ -84,6 +118,7 @@ public: static void printFloat(std::ostream &o, float f); static void printDouble(std::ostream& o, double d); + static void printVec128(std::ostream& o, const std::array<uint8_t, 16>& v); friend std::ostream& operator<<(std::ostream& o, Literal literal); @@ -158,6 +193,163 @@ public: Literal min(const Literal& other) const; Literal max(const Literal& other) const; Literal copysign(const Literal& other) const; + + std::array<Literal, 16> getLanesSI8x16() const; + std::array<Literal, 16> getLanesUI8x16() const; + std::array<Literal, 8> getLanesSI16x8() const; + std::array<Literal, 8> getLanesUI16x8() const; + std::array<Literal, 4> getLanesI32x4() const; + std::array<Literal, 2> getLanesI64x2() const; + std::array<Literal, 4> getLanesF32x4() const; + std::array<Literal, 2> getLanesF64x2() const; + + Literal shuffleV8x16(const Literal& other, const std::array<uint8_t, 16>& mask) const; + Literal splatI8x16() const; + Literal extractLaneSI8x16(uint8_t index) const; + Literal extractLaneUI8x16(uint8_t index) const; + Literal replaceLaneI8x16(const Literal& other, uint8_t index) const; + Literal splatI16x8() const; + Literal extractLaneSI16x8(uint8_t index) const; + Literal extractLaneUI16x8(uint8_t index) const; + Literal replaceLaneI16x8(const Literal& other, uint8_t index) const; + Literal splatI32x4() const; + Literal extractLaneI32x4(uint8_t index) const; + Literal replaceLaneI32x4(const Literal& other, uint8_t index) const; + Literal splatI64x2() const; + Literal extractLaneI64x2(uint8_t index) const; + Literal replaceLaneI64x2(const Literal& other, uint8_t index) const; + Literal splatF32x4() const; + Literal extractLaneF32x4(uint8_t index) const; + Literal replaceLaneF32x4(const Literal& other, uint8_t index) const; + Literal splatF64x2() const; + Literal extractLaneF64x2(uint8_t index) const; + Literal replaceLaneF64x2(const Literal& other, uint8_t index) const; + Literal eqI8x16(const Literal& other) const; + Literal neI8x16(const Literal& 
other) const; + Literal ltSI8x16(const Literal& other) const; + Literal ltUI8x16(const Literal& other) const; + Literal gtSI8x16(const Literal& other) const; + Literal gtUI8x16(const Literal& other) const; + Literal leSI8x16(const Literal& other) const; + Literal leUI8x16(const Literal& other) const; + Literal geSI8x16(const Literal& other) const; + Literal geUI8x16(const Literal& other) const; + Literal eqI16x8(const Literal& other) const; + Literal neI16x8(const Literal& other) const; + Literal ltSI16x8(const Literal& other) const; + Literal ltUI16x8(const Literal& other) const; + Literal gtSI16x8(const Literal& other) const; + Literal gtUI16x8(const Literal& other) const; + Literal leSI16x8(const Literal& other) const; + Literal leUI16x8(const Literal& other) const; + Literal geSI16x8(const Literal& other) const; + Literal geUI16x8(const Literal& other) const; + Literal eqI32x4(const Literal& other) const; + Literal neI32x4(const Literal& other) const; + Literal ltSI32x4(const Literal& other) const; + Literal ltUI32x4(const Literal& other) const; + Literal gtSI32x4(const Literal& other) const; + Literal gtUI32x4(const Literal& other) const; + Literal leSI32x4(const Literal& other) const; + Literal leUI32x4(const Literal& other) const; + Literal geSI32x4(const Literal& other) const; + Literal geUI32x4(const Literal& other) const; + Literal eqF32x4(const Literal& other) const; + Literal neF32x4(const Literal& other) const; + Literal ltF32x4(const Literal& other) const; + Literal gtF32x4(const Literal& other) const; + Literal leF32x4(const Literal& other) const; + Literal geF32x4(const Literal& other) const; + Literal eqF64x2(const Literal& other) const; + Literal neF64x2(const Literal& other) const; + Literal ltF64x2(const Literal& other) const; + Literal gtF64x2(const Literal& other) const; + Literal leF64x2(const Literal& other) const; + Literal geF64x2(const Literal& other) const; + Literal notV128() const; + Literal andV128(const Literal& other) const; + 
Literal orV128(const Literal& other) const; + Literal xorV128(const Literal& other) const; + Literal bitselectV128(const Literal& left, const Literal& right) const; + Literal negI8x16() const; + Literal anyTrueI8x16() const; + Literal allTrueI8x16() const; + Literal shlI8x16(const Literal& other) const; + Literal shrSI8x16(const Literal& other) const; + Literal shrUI8x16(const Literal& other) const; + Literal addI8x16(const Literal& other) const; + Literal addSaturateSI8x16(const Literal& other) const; + Literal addSaturateUI8x16(const Literal& other) const; + Literal subI8x16(const Literal& other) const; + Literal subSaturateSI8x16(const Literal& other) const; + Literal subSaturateUI8x16(const Literal& other) const; + Literal mulI8x16(const Literal& other) const; + Literal negI16x8() const; + Literal anyTrueI16x8() const; + Literal allTrueI16x8() const; + Literal shlI16x8(const Literal& other) const; + Literal shrSI16x8(const Literal& other) const; + Literal shrUI16x8(const Literal& other) const; + Literal addI16x8(const Literal& other) const; + Literal addSaturateSI16x8(const Literal& other) const; + Literal addSaturateUI16x8(const Literal& other) const; + Literal subI16x8(const Literal& other) const; + Literal subSaturateSI16x8(const Literal& other) const; + Literal subSaturateUI16x8(const Literal& other) const; + Literal mulI16x8(const Literal& other) const; + Literal negI32x4() const; + Literal anyTrueI32x4() const; + Literal allTrueI32x4() const; + Literal shlI32x4(const Literal& other) const; + Literal shrSI32x4(const Literal& other) const; + Literal shrUI32x4(const Literal& other) const; + Literal addI32x4(const Literal& other) const; + Literal subI32x4(const Literal& other) const; + Literal mulI32x4(const Literal& other) const; + Literal negI64x2() const; + Literal anyTrueI64x2() const; + Literal allTrueI64x2() const; + Literal shlI64x2(const Literal& other) const; + Literal shrSI64x2(const Literal& other) const; + Literal shrUI64x2(const Literal& other) 
const; + Literal addI64x2(const Literal& other) const; + Literal subI64x2(const Literal& other) const; + Literal absF32x4() const; + Literal negF32x4() const; + Literal sqrtF32x4() const; + Literal addF32x4(const Literal& other) const; + Literal subF32x4(const Literal& other) const; + Literal mulF32x4(const Literal& other) const; + Literal divF32x4(const Literal& other) const; + Literal minF32x4(const Literal& other) const; + Literal maxF32x4(const Literal& other) const; + Literal absF64x2() const; + Literal negF64x2() const; + Literal sqrtF64x2() const; + Literal addF64x2(const Literal& other) const; + Literal subF64x2(const Literal& other) const; + Literal mulF64x2(const Literal& other) const; + Literal divF64x2(const Literal& other) const; + Literal minF64x2(const Literal& other) const; + Literal maxF64x2(const Literal& other) const; + Literal truncSatToSI32x4() const; + Literal truncSatToUI32x4() const; + Literal truncSatToSI64x2() const; + Literal truncSatToUI64x2() const; + Literal convertSToF32x4() const; + Literal convertUToF32x4() const; + Literal convertSToF64x2() const; + Literal convertUToF64x2() const; + + private: + Literal addSatSI8(const Literal& other) const; + Literal addSatUI8(const Literal& other) const; + Literal addSatSI16(const Literal& other) const; + Literal addSatUI16(const Literal& other) const; + Literal subSatSI8(const Literal& other) const; + Literal subSatUI8(const Literal& other) const; + Literal subSatSI16(const Literal& other) const; + Literal subSatUI16(const Literal& other) const; }; } // namespace wasm @@ -165,9 +357,16 @@ public: namespace std { template<> struct hash<wasm::Literal> { size_t operator()(const wasm::Literal& a) const { + uint8_t bytes[16]; + a.getBits(bytes); + int64_t chunks[2]; + memcpy(chunks, bytes, sizeof(chunks)); return wasm::rehash( - uint64_t(hash<size_t>()(size_t(a.type))), - uint64_t(hash<int64_t>()(a.getBits())) + wasm::rehash( + uint64_t(hash<size_t>()(size_t(a.type))), + 
uint64_t(hash<int64_t>()(chunks[0])) + ), + uint64_t(hash<int64_t>()(chunks[1])) ); } }; @@ -175,7 +374,16 @@ template<> struct less<wasm::Literal> { bool operator()(const wasm::Literal& a, const wasm::Literal& b) const { if (a.type < b.type) return true; if (a.type > b.type) return false; - return a.getBits() < b.getBits(); + switch (a.type) { + case wasm::Type::i32: return a.geti32() < b.geti32(); + case wasm::Type::f32: return a.reinterpreti32() < b.reinterpreti32(); + case wasm::Type::i64: return a.geti64() < b.geti64(); + case wasm::Type::f64: return a.reinterpreti64() < b.reinterpreti64(); + case wasm::Type::v128: return memcmp(a.getv128Ptr(), b.getv128Ptr(), 16) < 0; + case wasm::Type::none: + case wasm::Type::unreachable: return false; + } + WASM_UNREACHABLE(); } }; } diff --git a/src/mixed_arena.h b/src/mixed_arena.h index 4c62514d1..46487b7fc 100644 --- a/src/mixed_arena.h +++ b/src/mixed_arena.h @@ -19,13 +19,14 @@ #include <atomic> #include <cassert> -#include <cstdlib> #include <memory> #include <mutex> #include <thread> #include <type_traits> #include <vector> +#include <support/alloc.h> + // // Arena allocation for mixed-type data. // @@ -63,11 +64,9 @@ struct MixedArena { static const size_t CHUNK_SIZE = 32768; static const size_t MAX_ALIGN = 16; // allow 128bit SIMD - typedef std::aligned_storage<CHUNK_SIZE, MAX_ALIGN>::type Chunk; - - // Each pointer in chunks is to an array of Chunk structs; typically 1, + // Each pointer in chunks is to a multiple of CHUNK_SIZE - typically 1, // but possibly more. - std::vector<Chunk*> chunks; + std::vector<void*> chunks; size_t index = 0; // in last chunk @@ -122,10 +121,12 @@ struct MixedArena { // Allocate a new chunk. 
auto numChunks = (size + CHUNK_SIZE - 1) / CHUNK_SIZE; assert(size <= numChunks * CHUNK_SIZE); - chunks.push_back(new Chunk[numChunks]); + auto* allocation = wasm::aligned_malloc(MAX_ALIGN, numChunks * CHUNK_SIZE); + if (!allocation) abort(); + chunks.push_back(allocation); index = 0; } - uint8_t* ret = static_cast<uint8_t*>(static_cast<void*>(chunks.back())); + uint8_t* ret = static_cast<uint8_t*>(chunks.back()); ret += index; index += size; // TODO: if we allocated more than 1 chunk, reuse the remainder, right now we allocate another next time return static_cast<void*>(ret); @@ -141,7 +142,7 @@ struct MixedArena { void clear() { for (auto* chunk : chunks) { - delete[] chunk; + wasm::aligned_free(chunk); } chunks.clear(); } diff --git a/src/passes/CMakeLists.txt b/src/passes/CMakeLists.txt index b4e396750..0ad9dbf05 100644 --- a/src/passes/CMakeLists.txt +++ b/src/passes/CMakeLists.txt @@ -30,6 +30,7 @@ SET(passes_SOURCES Metrics.cpp MinifyImportsAndExports.cpp NameList.cpp + NoExitRuntime.cpp OptimizeInstructions.cpp PickLoadSigns.cpp PostEmscripten.cpp diff --git a/src/passes/CodeFolding.cpp b/src/passes/CodeFolding.cpp index b639c7681..a79980cfe 100644 --- a/src/passes/CodeFolding.cpp +++ b/src/passes/CodeFolding.cpp @@ -524,13 +524,21 @@ private: // if we have enough to investigate, do so if (next.size() >= 2) { // now we want to find a mergeable item - any item that is equal among a subset - std::map<uint32_t, std::vector<Expression*>> hashed; // hash value => expressions with that hash + std::map<Expression*, HashType> hashes; // expression => hash value + std::map<HashType, std::vector<Expression*>> hashed; // hash value => expressions with that hash for (auto& tail : next) { auto* item = getItem(tail, num); - hashed[ExpressionAnalyzer::hash(item)].push_back(item); + auto hash = hashes[item] = ExpressionAnalyzer::hash(item); + hashed[hash].push_back(item); } - for (auto& iter : hashed) { - auto& items = iter.second; + // look at each hash value exactly 
once. we do this in a deterministic order. + std::set<HashType> seen; + for (auto& tail : next) { + auto* item = getItem(tail, num); + auto hash = hashes[item]; + if (seen.count(hash)) continue; + seen.insert(hash); + auto& items = hashed[hash]; if (items.size() == 1) continue; assert(items.size() > 0); // look for an item that has another match. diff --git a/src/passes/CodePushing.cpp b/src/passes/CodePushing.cpp index fefceb6ec..931df140d 100644 --- a/src/passes/CodePushing.cpp +++ b/src/passes/CodePushing.cpp @@ -29,8 +29,8 @@ namespace wasm { // // Analyzers some useful local properties: # of sets and gets, and SFA. // -// Single First Assignment (SFA) form: the local has a single set_local, is -// not a parameter, and has no get_locals before the set_local in postorder. +// Single First Assignment (SFA) form: the local has a single local.set, is +// not a parameter, and has no local.gets before the local.set in postorder. // This is a much weaker property than SSA, obviously, but together with // our implicit dominance properties in the structured AST is quite useful. // diff --git a/src/passes/ConstHoisting.cpp b/src/passes/ConstHoisting.cpp index 77ac5d251..11188a9ba 100644 --- a/src/passes/ConstHoisting.cpp +++ b/src/passes/ConstHoisting.cpp @@ -15,7 +15,7 @@ */ // -// Hoists repeated constants to a local. A get_local takes 2 bytes +// Hoists repeated constants to a local. A local.get takes 2 bytes // in most cases, and if a const is larger than that, it may be // better to store it to a local, then get it from that local. 
// @@ -108,7 +108,7 @@ private: // or // num > (size+2)/(size-2) auto before = num * size; - auto after = size + 2 /* set_local */ + (2 /* get_local */ * num); + auto after = size + 2 /* local.set */ + (2 /* local.get */ * num); return after < before; } diff --git a/src/passes/DataFlowOpts.cpp b/src/passes/DataFlowOpts.cpp index e32fcb700..702b3e7f4 100644 --- a/src/passes/DataFlowOpts.cpp +++ b/src/passes/DataFlowOpts.cpp @@ -88,7 +88,7 @@ struct DataFlowOpts : public WalkerPass<PostWalker<DataFlowOpts>> { // then copy the result if it's smaller. if (node->isPhi() && DataFlow::allInputsIdentical(node)) { // Note we don't need to check for effects when replacing, as in - // flattened IR expression children are get_locals or consts. + // flattened IR expression children are local.gets or consts. auto* value = node->getValue(1); if (value->isConst()) { replaceAllUsesWith(node, value); @@ -112,7 +112,7 @@ struct DataFlowOpts : public WalkerPass<PostWalker<DataFlowOpts>> { //dump(node, std::cout); auto* expr = node->expr; // First, note that some of the expression's children may be - // get_locals that we inferred during SSA analysis as constant. + // local.gets that we inferred during SSA analysis as constant. // We can apply those now. 
for (Index i = 0; i < node->values.size(); i++) { if (node->values[i]->isConst()) { diff --git a/src/passes/DeadCodeElimination.cpp b/src/passes/DeadCodeElimination.cpp index 2e62197b5..6e70fc55d 100644 --- a/src/passes/DeadCodeElimination.cpp +++ b/src/passes/DeadCodeElimination.cpp @@ -257,6 +257,11 @@ struct DeadCodeElimination : public WalkerPass<PostWalker<DeadCodeElimination>> case Expression::Id::AtomicRMWId: DELEGATE(AtomicRMW); case Expression::Id::AtomicWaitId: DELEGATE(AtomicWait); case Expression::Id::AtomicWakeId: DELEGATE(AtomicWake); + case Expression::Id::SIMDExtractId: DELEGATE(SIMDExtract); + case Expression::Id::SIMDReplaceId: DELEGATE(SIMDReplace); + case Expression::Id::SIMDShuffleId: DELEGATE(SIMDShuffle); + case Expression::Id::SIMDBitselectId: DELEGATE(SIMDBitselect); + case Expression::Id::SIMDShiftId: DELEGATE(SIMDShift); case Expression::Id::InvalidId: WASM_UNREACHABLE(); case Expression::Id::NumExpressionIds: WASM_UNREACHABLE(); } diff --git a/src/passes/Flatten.cpp b/src/passes/Flatten.cpp index aa5c1a491..61fc60b2b 100644 --- a/src/passes/Flatten.cpp +++ b/src/passes/Flatten.cpp @@ -27,26 +27,26 @@ // ) // => // (if (..condition..) -// (set_local $temp +// (local.set $temp // (..if true..) // ) -// (set_local $temp +// (local.set $temp // (..if false..) // ) // ) // (i32.add -// (get_local $temp) +// (local.get $temp) // (i32.const 1) // ) // // Formally, this pass flattens in the precise sense of // making the AST have these properties: // -// 1. The operands of an instruction must be a get_local or a const. +// 1. The operands of an instruction must be a local.get or a const. // anything else is written to a local earlier. // 2. Disallow block, loop, and if return values, i.e., do not use // control flow to pass around values. -// 3. Disallow tee_local, setting a local is always done in a set_local +// 3. Disallow local.tee, setting a local is always done in a local.set // on a non-nested-expression location. 
// @@ -62,7 +62,7 @@ namespace wasm { // We use the following algorithm: we maintain a list of "preludes", code // that runs right before an expression. When we visit an expression we // must handle it and its preludes. If the expression has side effects, -// we reduce it to a get_local and add a prelude for that. We then handle +// we reduce it to a local.get and add a prelude for that. We then handle // the preludes, by moving them to the parent or handling them directly. // we can move them to the parent if the parent is not a control flow // structure. Otherwise, if the parent is a control flow structure, it @@ -190,7 +190,7 @@ struct Flatten : public WalkerPass<ExpressionStackWalker<Flatten, UnifiedExpress // special handling if (auto* set = curr->dynCast<SetLocal>()) { if (set->isTee()) { - // we disallow tee_local + // we disallow local.tee if (set->value->type == unreachable) { replaceCurrent(set->value); // trivial, no set happens } else { diff --git a/src/passes/I64ToI32Lowering.cpp b/src/passes/I64ToI32Lowering.cpp index 200cebedb..4575dd2f8 100644 --- a/src/passes/I64ToI32Lowering.cpp +++ b/src/passes/I64ToI32Lowering.cpp @@ -235,9 +235,7 @@ struct I64ToI32Lowering : public WalkerPass<PostWalker<I64ToI32Lowering>> { setOutParam(curr, std::move(highBits)); } - // If and Select have identical code - template<typename T> - void visitBranching(T* curr) { + void visitIf(If* curr) { if (!hasOutParam(curr->ifTrue)) return; assert(curr->ifFalse != nullptr && "Nullable ifFalse found"); TempVar highBits = fetchOutParam(curr->ifTrue); @@ -255,10 +253,6 @@ struct I64ToI32Lowering : public WalkerPass<PostWalker<I64ToI32Lowering>> { setOutParam(curr, std::move(highBits)); } - void visitIf(If* curr) { - visitBranching<If>(curr); - } - void visitLoop(Loop* curr) { assert(labelHighBitVars.find(curr->name) == labelHighBitVars.end()); if (curr->type != i64) return; @@ -1526,7 +1520,36 @@ struct I64ToI32Lowering : public WalkerPass<PostWalker<I64ToI32Lowering>> { } void 
visitSelect(Select* curr) { - visitBranching<Select>(curr); + if (!hasOutParam(curr->ifTrue)) { + assert(!hasOutParam(curr->ifFalse)); + return; + } + assert(hasOutParam(curr->ifFalse)); + TempVar highBits = getTemp(); + TempVar lowBits = getTemp(); + TempVar cond = getTemp(); + Block* result = builder->blockify( + builder->makeSetLocal(cond, curr->condition), + builder->makeSetLocal( + lowBits, + builder->makeSelect( + builder->makeGetLocal(cond, i32), + curr->ifTrue, + curr->ifFalse + ) + ), + builder->makeSetLocal( + highBits, + builder->makeSelect( + builder->makeGetLocal(cond, i32), + builder->makeGetLocal(fetchOutParam(curr->ifTrue), i32), + builder->makeGetLocal(fetchOutParam(curr->ifFalse), i32) + ) + ), + builder->makeGetLocal(lowBits, i32) + ); + setOutParam(result, std::move(highBits)); + replaceCurrent(result); } void visitDrop(Drop* curr) { diff --git a/src/passes/Inlining.cpp b/src/passes/Inlining.cpp index ebc33bf97..f801662e0 100644 --- a/src/passes/Inlining.cpp +++ b/src/passes/Inlining.cpp @@ -55,11 +55,11 @@ static const int FLEXIBLE_SIZE_LIMIT = 20; // smaller than the call instruction itself. 2 is a safe number because // there is no risk of things like // (func $reverse (param $x i32) (param $y i32) -// (call $something (get_local $y) (get_local $x)) +// (call $something (local.get $y) (local.get $x)) // ) // in which case the reversing of the params means we'll possibly need // a block and a temp local. But that takes at least 3 nodes, and 2 < 3. -// More generally, with 2 items we may have a get_local, but no way to +// More generally, with 2 items we may have a local.get, but no way to // require it to be saved instead of directly consumed. 
static const int INLINING_OPTIMIZING_WILL_DECREASE_SIZE_LIMIT = 2; @@ -349,11 +349,11 @@ struct Inlining : public Pass { } }; -Pass *createInliningPass() { +Pass* createInliningPass() { return new Inlining(); } -Pass *createInliningOptimizingPass() { +Pass* createInliningOptimizingPass() { auto* ret = new Inlining(); ret->optimize = true; return ret; diff --git a/src/passes/InstrumentLocals.cpp b/src/passes/InstrumentLocals.cpp index f582004d5..a1835eb64 100644 --- a/src/passes/InstrumentLocals.cpp +++ b/src/passes/InstrumentLocals.cpp @@ -20,22 +20,22 @@ // gets: // // Before: -// (get_local $x) +// (local.get $x) // // After: // (call $get_TYPE // (i32.const n) // call id // (i32.const n) // local id -// (get_local $x) +// (local.get $x) // ) // // sets: // // Before: -// (set_local $x (i32.const 1)) +// (local.set $x (i32.const 1)) // // After: -// (set_local $x +// (local.set $x // (call $set_TYPE // (i32.const n) // call id // (i32.const n) // local id diff --git a/src/passes/LegalizeJSInterface.cpp b/src/passes/LegalizeJSInterface.cpp index b8d16894b..9d223390e 100644 --- a/src/passes/LegalizeJSInterface.cpp +++ b/src/passes/LegalizeJSInterface.cpp @@ -22,6 +22,13 @@ // stub methods added in this pass, that thunk i64s into i32, i32 and // vice versa as necessary. // +// We can also legalize in a "minimal" way, that is, only JS-specific +// components, that only JS will care about, such as dynCall methods +// (wasm will never call them, as it can share the table directly). E.g. +// is dynamic linking, where we can avoid legalizing wasm=>wasm calls +// across modules, we still want to legalize dynCalls so JS can call into the +// table even to a signature that is not legal. +// // This pass also legalizes according to asm.js FFI rules, which // disallow f32s. TODO: an option to not do that, if it matters? 
// @@ -40,68 +47,74 @@ namespace wasm { struct LegalizeJSInterface : public Pass { + bool full; + + LegalizeJSInterface(bool full) : full(full) {} + void run(PassRunner* runner, Module* module) override { // for each illegal export, we must export a legalized stub instead for (auto& ex : module->exports) { if (ex->kind == ExternalKind::Function) { // if it's an import, ignore it auto* func = module->getFunction(ex->value); - if (isIllegal(func)) { + if (isIllegal(func) && shouldBeLegalized(ex.get(), func)) { auto legalName = makeLegalStub(func, module); ex->value = legalName; } } } - // Avoid iterator invalidation later. - std::vector<Function*> originalFunctions; - for (auto& func : module->functions) { - originalFunctions.push_back(func.get()); - } - // for each illegal import, we must call a legalized stub instead - for (auto* im : originalFunctions) { - if (im->imported() && isIllegal(module->getFunctionType(im->type))) { - auto funcName = makeLegalStubForCalledImport(im, module); - illegalImportsToLegal[im->name] = funcName; - // we need to use the legalized version in the table, as the import from JS - // is legal for JS. Our stub makes it look like a native wasm function. - for (auto& segment : module->table.segments) { - for (auto& name : segment.data) { - if (name == im->name) { - name = funcName; + if (full) { + // Avoid iterator invalidation later. + std::vector<Function*> originalFunctions; + for (auto& func : module->functions) { + originalFunctions.push_back(func.get()); + } + // for each illegal import, we must call a legalized stub instead + for (auto* im : originalFunctions) { + if (im->imported() && isIllegal(module->getFunctionType(im->type))) { + auto funcName = makeLegalStubForCalledImport(im, module); + illegalImportsToLegal[im->name] = funcName; + // we need to use the legalized version in the table, as the import from JS + // is legal for JS. Our stub makes it look like a native wasm function. 
+ for (auto& segment : module->table.segments) { + for (auto& name : segment.data) { + if (name == im->name) { + name = funcName; + } } } } } - } - if (illegalImportsToLegal.size() > 0) { - for (auto& pair : illegalImportsToLegal) { - module->removeFunction(pair.first); - } + if (illegalImportsToLegal.size() > 0) { + for (auto& pair : illegalImportsToLegal) { + module->removeFunction(pair.first); + } - // fix up imports: call_import of an illegal must be turned to a call of a legal + // fix up imports: call_import of an illegal must be turned to a call of a legal - struct FixImports : public WalkerPass<PostWalker<FixImports>> { - bool isFunctionParallel() override { return true; } + struct FixImports : public WalkerPass<PostWalker<FixImports>> { + bool isFunctionParallel() override { return true; } - Pass* create() override { return new FixImports(illegalImportsToLegal); } + Pass* create() override { return new FixImports(illegalImportsToLegal); } - std::map<Name, Name>* illegalImportsToLegal; + std::map<Name, Name>* illegalImportsToLegal; - FixImports(std::map<Name, Name>* illegalImportsToLegal) : illegalImportsToLegal(illegalImportsToLegal) {} + FixImports(std::map<Name, Name>* illegalImportsToLegal) : illegalImportsToLegal(illegalImportsToLegal) {} - void visitCall(Call* curr) { - auto iter = illegalImportsToLegal->find(curr->target); - if (iter == illegalImportsToLegal->end()) return; + void visitCall(Call* curr) { + auto iter = illegalImportsToLegal->find(curr->target); + if (iter == illegalImportsToLegal->end()) return; - if (iter->second == getFunction()->name) return; // inside the stub function itself, is the one safe place to do the call - replaceCurrent(Builder(*getModule()).makeCall(iter->second, curr->operands, curr->type)); - } - }; + if (iter->second == getFunction()->name) return; // inside the stub function itself, is the one safe place to do the call + replaceCurrent(Builder(*getModule()).makeCall(iter->second, curr->operands, curr->type)); + } + 
}; - PassRunner passRunner(module); - passRunner.setIsNested(true); - passRunner.add<FixImports>(&illegalImportsToLegal); - passRunner.run(); + PassRunner passRunner(module); + passRunner.setIsNested(true); + passRunner.add<FixImports>(&illegalImportsToLegal); + passRunner.run(); + } } } @@ -118,6 +131,11 @@ private: return false; } + bool shouldBeLegalized(Export* ex, Function* func) { + if (full) return true; + // We are doing minimal legalization - just what JS needs. + return ex->name.startsWith("dynCall_"); + } // JS calls the export, so it must call a legal stub that calls the actual wasm function Name makeLegalStub(Function* func, Module* module) { @@ -256,7 +274,11 @@ private: }; Pass *createLegalizeJSInterfacePass() { - return new LegalizeJSInterface(); + return new LegalizeJSInterface(true); +} + +Pass *createLegalizeJSInterfaceMinimallyPass() { + return new LegalizeJSInterface(false); } } // namespace wasm diff --git a/src/passes/LocalCSE.cpp b/src/passes/LocalCSE.cpp index 12f0d9d93..32cc97b34 100644 --- a/src/passes/LocalCSE.cpp +++ b/src/passes/LocalCSE.cpp @@ -20,7 +20,7 @@ // This requires --flatten to be run before in order to be effective, // and preserves flatness. The reason flatness is required is that // this pass assumes everything is stored in a local, and all it does -// is alter set_locals to do get_locals of an existing value when +// is alter local.sets to do local.gets of an existing value when // possible, replacing a recomputing of that value. 
That design means that // if there are block and if return values, nested expressions not stored // to a local, etc., then it can't operate on them (and will just not @@ -42,6 +42,7 @@ #include <wasm-traversal.h> #include <pass.h> #include <ir/effects.h> +#include <ir/cost.h> #include <ir/equivalent_sets.h> #include <ir/hashed.h> @@ -55,7 +56,7 @@ struct LocalCSE : public WalkerPass<LinearExecutionWalker<LocalCSE>> { // information for an expression we can reuse struct UsableInfo { Expression* value; // the value we can reuse - Index index; // the local we are assigned to, get_local that to reuse us + Index index; // the local we are assigned to, local.get that to reuse us EffectAnalyzer effects; UsableInfo(Expression* value, Index index, PassOptions& passOptions) : value(value), index(index), effects(passOptions, value) {} @@ -208,8 +209,19 @@ struct LocalCSE : public WalkerPass<LinearExecutionWalker<LocalCSE>> { if (EffectAnalyzer(getPassOptions(), value).hasSideEffects()) { return false; // we can't combine things with side effects } - // check what we care about TODO: use optimize/shrink levels? - return Measurer::measure(value) > 1; + auto& options = getPassRunner()->options; + // If the size is at least 3, then if we have two of them we have 6, + // and so adding one set+two gets and removing one of the items itself + // is not detrimental, and may be beneficial. + if (options.shrinkLevel > 0 && Measurer::measure(value) >= 3) { + return true; + } + // If we focus on speed, any reduction in cost is beneficial, as the + // cost of a get is essentially free. 
+ if (options.shrinkLevel == 0 && CostAnalyzer(value).cost > 0) { + return true; + } + return false; } }; diff --git a/src/passes/MergeBlocks.cpp b/src/passes/MergeBlocks.cpp index 57f8b7e41..38e9fc6a2 100644 --- a/src/passes/MergeBlocks.cpp +++ b/src/passes/MergeBlocks.cpp @@ -373,7 +373,7 @@ struct MergeBlocks : public WalkerPass<PostWalker<MergeBlocks>> { Pass* create() override { return new MergeBlocks; } - void visitBlock(Block *curr) { + void visitBlock(Block* curr) { optimizeBlock(curr, getModule(), getPassOptions()); } diff --git a/src/passes/MergeLocals.cpp b/src/passes/MergeLocals.cpp index 8dcaa0cb9..4092e1ea8 100644 --- a/src/passes/MergeLocals.cpp +++ b/src/passes/MergeLocals.cpp @@ -22,11 +22,11 @@ // example, in // // (if (result i32) -// (tee_local $x -// (get_local $y) +// (local.tee $x +// (local.get $y) // ) // (i32.const 100) -// (get_local $x) +// (local.get $x) // ) // // If that assignment of $y is never used again, everything is fine. But if @@ -60,13 +60,13 @@ struct MergeLocals : public WalkerPass<PostWalker<MergeLocals, UnifiedExpression void doWalkFunction(Function* func) { // first, instrument the graph by modifying each copy - // (set_local $x - // (get_local $y) + // (local.set $x + // (local.get $y) // ) // to - // (set_local $x - // (tee_local $y - // (get_local $y) + // (local.set $x + // (local.tee $y + // (local.get $y) // ) // ) // That is, we add a trivial assign of $y. This ensures we @@ -128,8 +128,8 @@ struct MergeLocals : public WalkerPass<PostWalker<MergeLocals, UnifiedExpression optimizedToCopy[copy] = trivial; } else { // alternatively, we can try to remove the conflict in the opposite way: given - // (set_local $x - // (get_local $y) + // (local.set $x + // (local.get $y) // ) // we can look for uses of $x that could instead be uses of $y. 
this extends // $y's live range, but if it removes the conflict between $x and $y, it may be diff --git a/src/passes/Metrics.cpp b/src/passes/Metrics.cpp index 5176f8762..b794ea32d 100644 --- a/src/passes/Metrics.cpp +++ b/src/passes/Metrics.cpp @@ -194,11 +194,11 @@ struct Metrics : public WalkerPass<PostWalker<Metrics, UnifiedExpressionVisitor< } }; -Pass *createMetricsPass() { +Pass* createMetricsPass() { return new Metrics(false); } -Pass *createFunctionMetricsPass() { +Pass* createFunctionMetricsPass() { return new Metrics(true); } diff --git a/src/passes/NoExitRuntime.cpp b/src/passes/NoExitRuntime.cpp new file mode 100644 index 000000000..05dd639c9 --- /dev/null +++ b/src/passes/NoExitRuntime.cpp @@ -0,0 +1,60 @@ +/* + * Copyright 2016 WebAssembly Community Group participants + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// +// Assumes the program will never exit the runtime (as in the emscripten +// NO_EXIT_RUNTIME option). That means that atexit()s do not need to be +// run. +// + +#include <pass.h> +#include <wasm.h> +#include <wasm-builder.h> +#include <asmjs/shared-constants.h> + +using namespace std; + +namespace wasm { + +struct NoExitRuntime : public WalkerPass<PostWalker<NoExitRuntime>> { + bool isFunctionParallel() override { return true; } + + Pass* create() override { return new NoExitRuntime; } + + // Remove all possible manifestations of atexit, across asm2wasm and llvm wasm backend. 
+ std::array<Name, 4> ATEXIT_NAMES = {{ "___cxa_atexit", + "__cxa_atexit", + "_atexit", + "atexit" }}; + + void visitCall(Call* curr) { + auto* import = getModule()->getFunctionOrNull(curr->target); + if (!import || !import->imported() || import->module != ENV) return; + for (auto name : ATEXIT_NAMES) { + if (name == import->base) { + replaceCurrent( + Builder(*getModule()).replaceWithIdenticalType(curr) + ); + } + } + } +}; + +Pass* createNoExitRuntimePass() { + return new NoExitRuntime(); +} + +} // namespace wasm diff --git a/src/passes/OptimizeInstructions.cpp b/src/passes/OptimizeInstructions.cpp index 1dd05dd0a..7d4735686 100644 --- a/src/passes/OptimizeInstructions.cpp +++ b/src/passes/OptimizeInstructions.cpp @@ -48,7 +48,7 @@ Name I32_EXPR = "i32.expr", // returns the maximum amount of bits used in an integer expression // not extremely precise (doesn't look into add operands, etc.) // LocalInfoProvider is an optional class that can provide answers about -// get_local. +// local.get. 
template<typename LocalInfoProvider> Index getMaxBits(Expression* curr, LocalInfoProvider* localInfoProvider) { if (auto* const_ = curr->dynCast<Const>()) { @@ -461,7 +461,7 @@ struct OptimizeInstructions : public WalkerPass<PostWalker<OptimizeInstructions, auto total = Bits::getEffectiveShifts(leftRight) + Bits::getEffectiveShifts(right); if (total == Bits::getEffectiveShifts(total, right->type)) { // no overflow, we can do this - leftRight->value = LiteralUtils::makeLiteralFromInt32(total, right->type); + leftRight->value = Literal::makeFromInt32(total, right->type); return left; } // TODO: handle overflows } @@ -1096,7 +1096,7 @@ private: auto* right = binary->right->cast<Const>(); if (isIntegerType(type)) { // operations on zero - if (right->value == LiteralUtils::makeLiteralFromInt32(0, type)) { + if (right->value == Literal::makeFromInt32(0, type)) { if (binary->op == Abstract::getBinary(type, Abstract::Shl) || binary->op == Abstract::getBinary(type, Abstract::ShrU) || binary->op == Abstract::getBinary(type, Abstract::ShrS) || @@ -1148,17 +1148,20 @@ private: } } } - // note that this is correct even on floats with a NaN on the left, - // as a NaN would skip the computation and just return the NaN, - // and that is precisely what we do here. but, the same with -1 - // (change to a negation) would be incorrect for that reason. - if (right->value == LiteralUtils::makeLiteralFromInt32(1, type)) { - if (binary->op == Abstract::getBinary(type, Abstract::Mul) || - binary->op == Abstract::getBinary(type, Abstract::DivS) || - binary->op == Abstract::getBinary(type, Abstract::DivU)) { - return binary->left; + if (isIntegerType(type) || isFloatType(type)) { + // note that this is correct even on floats with a NaN on the left, + // as a NaN would skip the computation and just return the NaN, + // and that is precisely what we do here. but, the same with -1 + // (change to a negation) would be incorrect for that reason. 
+ if (right->value == Literal::makeFromInt32(1, type)) { + if (binary->op == Abstract::getBinary(type, Abstract::Mul) || + binary->op == Abstract::getBinary(type, Abstract::DivS) || + binary->op == Abstract::getBinary(type, Abstract::DivU)) { + return binary->left; + } } } + // TODO: v128 not implemented yet return nullptr; } @@ -1171,7 +1174,7 @@ private: auto* left = binary->left->cast<Const>(); if (isIntegerType(type)) { // operations on zero - if (left->value == LiteralUtils::makeLiteralFromInt32(0, type)) { + if (left->value == Literal::makeFromInt32(0, type)) { if ((binary->op == Abstract::getBinary(type, Abstract::Shl) || binary->op == Abstract::getBinary(type, Abstract::ShrU) || binary->op == Abstract::getBinary(type, Abstract::ShrS)) && @@ -1191,9 +1194,9 @@ private: // x + 5 == 7 // => // x == 2 - if (binary->op == Abstract::getBinary(type, Abstract::Eq) || - binary->op == Abstract::getBinary(type, Abstract::Ne)) { - if (isIntegerType(binary->left->type)) { + if (isIntegerType(binary->left->type)) { + if (binary->op == Abstract::getBinary(type, Abstract::Eq) || + binary->op == Abstract::getBinary(type, Abstract::Ne)) { if (auto* left = binary->left->dynCast<Binary>()) { if (left->op == Abstract::getBinary(type, Abstract::Add) || left->op == Abstract::getBinary(type, Abstract::Sub)) { diff --git a/src/passes/Precompute.cpp b/src/passes/Precompute.cpp index db34b9ffb..c23babda4 100644 --- a/src/passes/Precompute.cpp +++ b/src/passes/Precompute.cpp @@ -46,7 +46,7 @@ class PrecomputingExpressionRunner : public ExpressionRunner<PrecomputingExpress GetValues& getValues; // Whether we are trying to precompute down to an expression (which we can do on - // say 5 + 6) or to a value (which we can't do on a tee_local that flows a 7 + // say 5 + 6) or to a value (which we can't do on a local.tee that flows a 7 // through it). When we want to replace the expression, we can only do so // when it has no side effects. 
When we don't care about replacing the expression, // we just want to know if it will contain a known constant. @@ -159,8 +159,12 @@ struct Precompute : public WalkerPass<PostWalker<Precompute, UnifiedExpressionVi } void visitExpression(Expression* curr) { - // TODO: if get_local, only replace with a constant if we don't care about size...? + // TODO: if local.get, only replace with a constant if we don't care about size...? if (curr->is<Const>() || curr->is<Nop>()) return; + // Until engines implement v128.const and we have SIMD-aware optimizations + // that can break large v128.const instructions into smaller consts and + // splats, do not try to precompute v128 expressions. + if (curr->type == v128) return; // try to evaluate this into a const Flow flow = precomputeExpression(curr); if (flow.breaking()) { @@ -241,7 +245,7 @@ private: // itself. This differs from precomputeExpression in that we care about // the value the expression will have, which we cannot necessary replace // the expression with. For example, - // (tee_local (i32.const 1)) + // (local.tee (i32.const 1)) // will have value 1 which we can optimize here, but in precomputeExpression // we could not do anything. 
Literal precomputeValue(Expression* curr) { @@ -297,7 +301,7 @@ private: Literal curr; if (set == nullptr) { if (getFunction()->isVar(get->index)) { - curr = LiteralUtils::makeLiteralZero(getFunction()->getLocalType(get->index)); + curr = Literal::makeZero(getFunction()->getLocalType(get->index)); } else { // it's a param, so it's hopeless value = Literal(); diff --git a/src/passes/Print.cpp b/src/passes/Print.cpp index 51c7e9f97..e544447fe 100644 --- a/src/passes/Print.cpp +++ b/src/passes/Print.cpp @@ -113,22 +113,22 @@ struct PrintExpressionContents : public Visitor<PrintExpressionContents> { printMedium(o, "call_indirect (type ") << curr->fullType << ')'; } void visitGetLocal(GetLocal* curr) { - printMedium(o, "get_local ") << printableLocal(curr->index, currFunction); + printMedium(o, "local.get ") << printableLocal(curr->index, currFunction); } void visitSetLocal(SetLocal* curr) { if (curr->isTee()) { - printMedium(o, "tee_local "); + printMedium(o, "local.tee "); } else { - printMedium(o, "set_local "); + printMedium(o, "local.set "); } o << printableLocal(curr->index, currFunction); } void visitGetGlobal(GetGlobal* curr) { - printMedium(o, "get_global "); + printMedium(o, "global.get "); printName(curr->name, o); } void visitSetGlobal(SetGlobal* curr) { - printMedium(o, "set_global "); + printMedium(o, "global.set "); printName(curr->name, o); } void visitLoad(Load* curr) { @@ -192,7 +192,6 @@ struct PrintExpressionContents : public Visitor<PrintExpressionContents> { } else { WASM_UNREACHABLE(); } - o << "_u"; } o << '.'; } @@ -207,6 +206,9 @@ struct PrintExpressionContents : public Visitor<PrintExpressionContents> { case Xor: o << "xor"; break; case Xchg: o << "xchg"; break; } + if (curr->bytes != getTypeSize(curr->type)) { + o << "_u"; + } restoreNormalColor(o); if (curr->offset) { o << " offset=" << curr->offset; @@ -215,7 +217,10 @@ struct PrintExpressionContents : public Visitor<PrintExpressionContents> { void visitAtomicCmpxchg(AtomicCmpxchg* curr) { 
prepareColor(o); printRMWSize(o, curr->type, curr->bytes); - o << "cmpxchg"; + o << "cmpxchg"; + if (curr->bytes != getTypeSize(curr->type)) { + o << "_u"; + } restoreNormalColor(o); if (curr->offset) { o << " offset=" << curr->offset; @@ -234,6 +239,60 @@ struct PrintExpressionContents : public Visitor<PrintExpressionContents> { o << " offset=" << curr->offset; } } + void visitSIMDExtract(SIMDExtract* curr) { + prepareColor(o); + switch (curr->op) { + case ExtractLaneSVecI8x16: o << "i8x16.extract_lane_s"; break; + case ExtractLaneUVecI8x16: o << "i8x16.extract_lane_u"; break; + case ExtractLaneSVecI16x8: o << "i16x8.extract_lane_s"; break; + case ExtractLaneUVecI16x8: o << "i16x8.extract_lane_u"; break; + case ExtractLaneVecI32x4: o << "i32x4.extract_lane"; break; + case ExtractLaneVecI64x2: o << "i64x2.extract_lane"; break; + case ExtractLaneVecF32x4: o << "f32x4.extract_lane"; break; + case ExtractLaneVecF64x2: o << "f64x2.extract_lane"; break; + } + o << " " << int(curr->index); + } + void visitSIMDReplace(SIMDReplace* curr) { + prepareColor(o); + switch (curr->op) { + case ReplaceLaneVecI8x16: o << "i8x16.replace_lane"; break; + case ReplaceLaneVecI16x8: o << "i16x8.replace_lane"; break; + case ReplaceLaneVecI32x4: o << "i32x4.replace_lane"; break; + case ReplaceLaneVecI64x2: o << "i64x2.replace_lane"; break; + case ReplaceLaneVecF32x4: o << "f32x4.replace_lane"; break; + case ReplaceLaneVecF64x2: o << "f64x2.replace_lane"; break; + } + o << " " << int(curr->index); + } + void visitSIMDShuffle(SIMDShuffle* curr) { + prepareColor(o); + o << "v8x16.shuffle"; + for (uint8_t mask_index : curr->mask) { + o << " " << std::to_string(mask_index); + } + } + void visitSIMDBitselect(SIMDBitselect* curr) { + prepareColor(o); + o << "v128.bitselect"; + } + void visitSIMDShift(SIMDShift* curr) { + prepareColor(o); + switch (curr->op) { + case ShlVecI8x16: o << "i8x16.shl"; break; + case ShrSVecI8x16: o << "i8x16.shr_s"; break; + case ShrUVecI8x16: o << "i8x16.shr_u"; 
break; + case ShlVecI16x8: o << "i16x8.shl"; break; + case ShrSVecI16x8: o << "i16x8.shr_s"; break; + case ShrUVecI16x8: o << "i16x8.shr_u"; break; + case ShlVecI32x4: o << "i32x4.shl"; break; + case ShrSVecI32x4: o << "i32x4.shr_s"; break; + case ShrUVecI32x4: o << "i32x4.shr_u"; break; + case ShlVecI64x2: o << "i64x2.shl"; break; + case ShrSVecI64x2: o << "i64x2.shr_s"; break; + case ShrUVecI64x2: o << "i64x2.shr_u"; break; + } + } void visitConst(Const* curr) { o << curr->value; } @@ -262,44 +321,77 @@ struct PrintExpressionContents : public Visitor<PrintExpressionContents> { case TruncFloat64: o << "f64.trunc"; break; case NearestFloat64: o << "f64.nearest"; break; case SqrtFloat64: o << "f64.sqrt"; break; - case ExtendSInt32: o << "i64.extend_s/i32"; break; - case ExtendUInt32: o << "i64.extend_u/i32"; break; - case WrapInt64: o << "i32.wrap/i64"; break; - case TruncSFloat32ToInt32: o << "i32.trunc_s/f32"; break; - case TruncSFloat32ToInt64: o << "i64.trunc_s/f32"; break; - case TruncUFloat32ToInt32: o << "i32.trunc_u/f32"; break; - case TruncUFloat32ToInt64: o << "i64.trunc_u/f32"; break; - case TruncSFloat64ToInt32: o << "i32.trunc_s/f64"; break; - case TruncSFloat64ToInt64: o << "i64.trunc_s/f64"; break; - case TruncUFloat64ToInt32: o << "i32.trunc_u/f64"; break; - case TruncUFloat64ToInt64: o << "i64.trunc_u/f64"; break; - case ReinterpretFloat32: o << "i32.reinterpret/f32"; break; - case ReinterpretFloat64: o << "i64.reinterpret/f64"; break; - case ConvertUInt32ToFloat32: o << "f32.convert_u/i32"; break; - case ConvertUInt32ToFloat64: o << "f64.convert_u/i32"; break; - case ConvertSInt32ToFloat32: o << "f32.convert_s/i32"; break; - case ConvertSInt32ToFloat64: o << "f64.convert_s/i32"; break; - case ConvertUInt64ToFloat32: o << "f32.convert_u/i64"; break; - case ConvertUInt64ToFloat64: o << "f64.convert_u/i64"; break; - case ConvertSInt64ToFloat32: o << "f32.convert_s/i64"; break; - case ConvertSInt64ToFloat64: o << "f64.convert_s/i64"; break; - case 
PromoteFloat32: o << "f64.promote/f32"; break; - case DemoteFloat64: o << "f32.demote/f64"; break; - case ReinterpretInt32: o << "f32.reinterpret/i32"; break; - case ReinterpretInt64: o << "f64.reinterpret/i64"; break; - case ExtendS8Int32: o << "i32.extend8_s"; break; - case ExtendS16Int32: o << "i32.extend16_s"; break; - case ExtendS8Int64: o << "i64.extend8_s"; break; - case ExtendS16Int64: o << "i64.extend16_s"; break; - case ExtendS32Int64: o << "i64.extend32_s"; break; - case TruncSatSFloat32ToInt32: o << "i32.trunc_s:sat/f32"; break; - case TruncSatUFloat32ToInt32: o << "i32.trunc_u:sat/f32"; break; - case TruncSatSFloat64ToInt32: o << "i32.trunc_s:sat/f64"; break; - case TruncSatUFloat64ToInt32: o << "i32.trunc_u:sat/f64"; break; - case TruncSatSFloat32ToInt64: o << "i64.trunc_s:sat/f32"; break; - case TruncSatUFloat32ToInt64: o << "i64.trunc_u:sat/f32"; break; - case TruncSatSFloat64ToInt64: o << "i64.trunc_s:sat/f64"; break; - case TruncSatUFloat64ToInt64: o << "i64.trunc_u:sat/f64"; break; + case ExtendSInt32: o << "i64.extend_i32_s"; break; + case ExtendUInt32: o << "i64.extend_i32_u"; break; + case WrapInt64: o << "i32.wrap_i64"; break; + case TruncSFloat32ToInt32: o << "i32.trunc_f32_s"; break; + case TruncSFloat32ToInt64: o << "i64.trunc_f32_s"; break; + case TruncUFloat32ToInt32: o << "i32.trunc_f32_u"; break; + case TruncUFloat32ToInt64: o << "i64.trunc_f32_u"; break; + case TruncSFloat64ToInt32: o << "i32.trunc_f64_s"; break; + case TruncSFloat64ToInt64: o << "i64.trunc_f64_s"; break; + case TruncUFloat64ToInt32: o << "i32.trunc_f64_u"; break; + case TruncUFloat64ToInt64: o << "i64.trunc_f64_u"; break; + case ReinterpretFloat32: o << "i32.reinterpret_f32"; break; + case ReinterpretFloat64: o << "i64.reinterpret_f64"; break; + case ConvertUInt32ToFloat32: o << "f32.convert_i32_u"; break; + case ConvertUInt32ToFloat64: o << "f64.convert_i32_u"; break; + case ConvertSInt32ToFloat32: o << "f32.convert_i32_s"; break; + case ConvertSInt32ToFloat64: o << 
"f64.convert_i32_s"; break; + case ConvertUInt64ToFloat32: o << "f32.convert_i64_u"; break; + case ConvertUInt64ToFloat64: o << "f64.convert_i64_u"; break; + case ConvertSInt64ToFloat32: o << "f32.convert_i64_s"; break; + case ConvertSInt64ToFloat64: o << "f64.convert_i64_s"; break; + case PromoteFloat32: o << "f64.promote_f32"; break; + case DemoteFloat64: o << "f32.demote_f64"; break; + case ReinterpretInt32: o << "f32.reinterpret_i32"; break; + case ReinterpretInt64: o << "f64.reinterpret_i64"; break; + case ExtendS8Int32: o << "i32.extend8_s"; break; + case ExtendS16Int32: o << "i32.extend16_s"; break; + case ExtendS8Int64: o << "i64.extend8_s"; break; + case ExtendS16Int64: o << "i64.extend16_s"; break; + case ExtendS32Int64: o << "i64.extend32_s"; break; + case TruncSatSFloat32ToInt32: o << "i32.trunc_sat_f32_s"; break; + case TruncSatUFloat32ToInt32: o << "i32.trunc_sat_f32_u"; break; + case TruncSatSFloat64ToInt32: o << "i32.trunc_sat_f64_s"; break; + case TruncSatUFloat64ToInt32: o << "i32.trunc_sat_f64_u"; break; + case TruncSatSFloat32ToInt64: o << "i64.trunc_sat_f32_s"; break; + case TruncSatUFloat32ToInt64: o << "i64.trunc_sat_f32_u"; break; + case TruncSatSFloat64ToInt64: o << "i64.trunc_sat_f64_s"; break; + case TruncSatUFloat64ToInt64: o << "i64.trunc_sat_f64_u"; break; + case SplatVecI8x16: o << "i8x16.splat"; break; + case SplatVecI16x8: o << "i16x8.splat"; break; + case SplatVecI32x4: o << "i32x4.splat"; break; + case SplatVecI64x2: o << "i64x2.splat"; break; + case SplatVecF32x4: o << "f32x4.splat"; break; + case SplatVecF64x2: o << "f64x2.splat"; break; + case NotVec128: o << "v128.not"; break; + case NegVecI8x16: o << "i8x16.neg"; break; + case AnyTrueVecI8x16: o << "i8x16.any_true"; break; + case AllTrueVecI8x16: o << "i8x16.all_true"; break; + case NegVecI16x8: o << "i16x8.neg"; break; + case AnyTrueVecI16x8: o << "i16x8.any_true"; break; + case AllTrueVecI16x8: o << "i16x8.all_true"; break; + case NegVecI32x4: o << "i32x4.neg"; break; + 
case AnyTrueVecI32x4: o << "i32x4.any_true"; break; + case AllTrueVecI32x4: o << "i32x4.all_true"; break; + case NegVecI64x2: o << "i64x2.neg"; break; + case AnyTrueVecI64x2: o << "i64x2.any_true"; break; + case AllTrueVecI64x2: o << "i64x2.all_true"; break; + case AbsVecF32x4: o << "f32x4.abs"; break; + case NegVecF32x4: o << "f32x4.neg"; break; + case SqrtVecF32x4: o << "f32x4.sqrt"; break; + case AbsVecF64x2: o << "f64x2.abs"; break; + case NegVecF64x2: o << "f64x2.neg"; break; + case SqrtVecF64x2: o << "f64x2.sqrt"; break; + case TruncSatSVecF32x4ToVecI32x4: o << "i32x4.trunc_sat_f32x4_s"; break; + case TruncSatUVecF32x4ToVecI32x4: o << "i32x4.trunc_sat_f32x4_u"; break; + case TruncSatSVecF64x2ToVecI64x2: o << "i64x2.trunc_sat_f64x2_s"; break; + case TruncSatUVecF64x2ToVecI64x2: o << "i64x2.trunc_sat_f64x2_u"; break; + case ConvertSVecI32x4ToVecF32x4: o << "f32x4.convert_i32x4_s"; break; + case ConvertUVecI32x4ToVecF32x4: o << "f32x4.convert_i32x4_u"; break; + case ConvertSVecI64x2ToVecF64x2: o << "f64x2.convert_i64x2_s"; break; + case ConvertUVecI64x2ToVecF64x2: o << "f64x2.convert_i64x2_u"; break; case InvalidUnary: WASM_UNREACHABLE(); } } @@ -386,6 +478,86 @@ struct PrintExpressionContents : public Visitor<PrintExpressionContents> { case GtFloat64: o << "f64.gt"; break; case GeFloat64: o << "f64.ge"; break; + case EqVecI8x16: o << "i8x16.eq"; break; + case NeVecI8x16: o << "i8x16.ne"; break; + case LtSVecI8x16: o << "i8x16.lt_s"; break; + case LtUVecI8x16: o << "i8x16.lt_u"; break; + case GtSVecI8x16: o << "i8x16.gt_s"; break; + case GtUVecI8x16: o << "i8x16.gt_u"; break; + case LeSVecI8x16: o << "i8x16.le_s"; break; + case LeUVecI8x16: o << "i8x16.le_u"; break; + case GeSVecI8x16: o << "i8x16.ge_s"; break; + case GeUVecI8x16: o << "i8x16.ge_u"; break; + case EqVecI16x8: o << "i16x8.eq"; break; + case NeVecI16x8: o << "i16x8.ne"; break; + case LtSVecI16x8: o << "i16x8.lt_s"; break; + case LtUVecI16x8: o << "i16x8.lt_u"; break; + case GtSVecI16x8: o << 
"i16x8.gt_s"; break; + case GtUVecI16x8: o << "i16x8.gt_u"; break; + case LeSVecI16x8: o << "i16x8.le_s"; break; + case LeUVecI16x8: o << "i16x8.le_u"; break; + case GeSVecI16x8: o << "i16x8.ge_s"; break; + case GeUVecI16x8: o << "i16x8.ge_u"; break; + case EqVecI32x4: o << "i32x4.eq"; break; + case NeVecI32x4: o << "i32x4.ne"; break; + case LtSVecI32x4: o << "i32x4.lt_s"; break; + case LtUVecI32x4: o << "i32x4.lt_u"; break; + case GtSVecI32x4: o << "i32x4.gt_s"; break; + case GtUVecI32x4: o << "i32x4.gt_u"; break; + case LeSVecI32x4: o << "i32x4.le_s"; break; + case LeUVecI32x4: o << "i32x4.le_u"; break; + case GeSVecI32x4: o << "i32x4.ge_s"; break; + case GeUVecI32x4: o << "i32x4.ge_u"; break; + case EqVecF32x4: o << "f32x4.eq"; break; + case NeVecF32x4: o << "f32x4.ne"; break; + case LtVecF32x4: o << "f32x4.lt"; break; + case GtVecF32x4: o << "f32x4.gt"; break; + case LeVecF32x4: o << "f32x4.le"; break; + case GeVecF32x4: o << "f32x4.ge"; break; + case EqVecF64x2: o << "f64x2.eq"; break; + case NeVecF64x2: o << "f64x2.ne"; break; + case LtVecF64x2: o << "f64x2.lt"; break; + case GtVecF64x2: o << "f64x2.gt"; break; + case LeVecF64x2: o << "f64x2.le"; break; + case GeVecF64x2: o << "f64x2.ge"; break; + + case AndVec128: o << "v128.and"; break; + case OrVec128: o << "v128.or"; break; + case XorVec128: o << "v128.xor"; break; + + case AddVecI8x16: o << "i8x16.add"; break; + case AddSatSVecI8x16: o << "i8x16.add_saturate_s"; break; + case AddSatUVecI8x16: o << "i8x16.add_saturate_u"; break; + case SubVecI8x16: o << "i8x16.sub"; break; + case SubSatSVecI8x16: o << "i8x16.sub_saturate_s"; break; + case SubSatUVecI8x16: o << "i8x16.sub_saturate_u"; break; + case MulVecI8x16: o << "i8x16.mul"; break; + case AddVecI16x8: o << "i16x8.add"; break; + case AddSatSVecI16x8: o << "i16x8.add_saturate_s"; break; + case AddSatUVecI16x8: o << "i16x8.add_saturate_u"; break; + case SubVecI16x8: o << "i16x8.sub"; break; + case SubSatSVecI16x8: o << "i16x8.sub_saturate_s"; break; + 
case SubSatUVecI16x8: o << "i16x8.sub_saturate_u"; break; + case MulVecI16x8: o << "i16x8.mul"; break; + case AddVecI32x4: o << "i32x4.add"; break; + case SubVecI32x4: o << "i32x4.sub"; break; + case MulVecI32x4: o << "i32x4.mul"; break; + case AddVecI64x2: o << "i64x2.add"; break; + case SubVecI64x2: o << "i64x2.sub"; break; + + case AddVecF32x4: o << "f32x4.add"; break; + case SubVecF32x4: o << "f32x4.sub"; break; + case MulVecF32x4: o << "f32x4.mul"; break; + case DivVecF32x4: o << "f32x4.div"; break; + case MinVecF32x4: o << "f32x4.min"; break; + case MaxVecF32x4: o << "f32x4.max"; break; + case AddVecF64x2: o << "f64x2.add"; break; + case SubVecF64x2: o << "f64x2.sub"; break; + case MulVecF64x2: o << "f64x2.mul"; break; + case DivVecF64x2: o << "f64x2.div"; break; + case MinVecF64x2: o << "f64x2.min"; break; + case MaxVecF64x2: o << "f64x2.max"; break; + case InvalidBinary: WASM_UNREACHABLE(); } restoreNormalColor(o); @@ -724,6 +896,46 @@ struct PrintSExpression : public Visitor<PrintSExpression> { printFullLine(curr->wakeCount); decIndent(); } + void visitSIMDExtract(SIMDExtract* curr) { + o << '('; + PrintExpressionContents(currFunction, o).visit(curr); + incIndent(); + printFullLine(curr->vec); + decIndent(); + } + void visitSIMDReplace(SIMDReplace* curr) { + o << '('; + PrintExpressionContents(currFunction, o).visit(curr); + incIndent(); + printFullLine(curr->vec); + printFullLine(curr->value); + decIndent(); + } + void visitSIMDShuffle(SIMDShuffle* curr) { + o << '('; + PrintExpressionContents(currFunction, o).visit(curr); + incIndent(); + printFullLine(curr->left); + printFullLine(curr->right); + decIndent(); + } + void visitSIMDBitselect(SIMDBitselect* curr) { + o << '('; + PrintExpressionContents(currFunction, o).visit(curr); + incIndent(); + printFullLine(curr->left); + printFullLine(curr->right); + printFullLine(curr->cond); + decIndent(); + } + void visitSIMDShift(SIMDShift* curr) { + o << '('; + PrintExpressionContents(currFunction, o).visit(curr); 
+ incIndent(); + printFullLine(curr->vec); + printFullLine(curr->shift); + decIndent(); + } void visitConst(Const* curr) { o << '('; PrintExpressionContents(currFunction, o).visit(curr); @@ -970,7 +1182,7 @@ struct PrintSExpression : public Visitor<PrintSExpression> { printName(curr->name, o) << ' '; o << curr->initial; if (curr->hasMax()) o << ' ' << curr->max; - o << " anyfunc)"; + o << " funcref)"; } void visitTable(Table* curr) { if (!curr->exists) return; diff --git a/src/passes/RedundantSetElimination.cpp b/src/passes/RedundantSetElimination.cpp index 8c00a0880..6f39fce9f 100644 --- a/src/passes/RedundantSetElimination.cpp +++ b/src/passes/RedundantSetElimination.cpp @@ -15,7 +15,7 @@ */ // -// Eliminate redundant set_locals: if a local already has a particular +// Eliminate redundant local.sets: if a local already has a particular // value, we don't need to set it again. A common case here is loops // that start at zero, since the default value is initialized to // zero anyhow. @@ -28,7 +28,7 @@ // values no longer necessary. // // So far this tracks constant values, and for everything else it considers -// them unique (so each set_local of a non-constant is a unique value, each +// them unique (so each local.set of a non-constant is a unique value, each // merge is a unique value, etc.; there is no sophisticated value numbering // here). 
// @@ -172,7 +172,7 @@ struct RedundantSetElimination : public WalkerPass<CFGWalker<RedundantSetElimina #endif start[i] = getUniqueValue(); } else { - start[i] = getLiteralValue(LiteralUtils::makeLiteralZero(func->getLocalType(i))); + start[i] = getLiteralValue(Literal::makeZero(func->getLocalType(i))); } } } else { @@ -375,4 +375,3 @@ Pass *createRedundantSetEliminationPass() { } } // namespace wasm - diff --git a/src/passes/RemoveUnusedBrs.cpp b/src/passes/RemoveUnusedBrs.cpp index cb3ff69ee..614503581 100644 --- a/src/passes/RemoveUnusedBrs.cpp +++ b/src/passes/RemoveUnusedBrs.cpp @@ -106,10 +106,11 @@ struct RemoveUnusedBrs : public WalkerPass<PostWalker<RemoveUnusedBrs>> { } else if (auto* block = curr->dynCast<Block>()) { // any breaks flowing to here are unnecessary, as we get here anyhow auto name = block->name; + auto& list = block->list; if (name.is()) { - size_t size = flows.size(); - size_t skip = 0; - for (size_t i = 0; i < size; i++) { + Index size = flows.size(); + Index skip = 0; + for (Index i = 0; i < size; i++) { auto* flow = (*flows[i])->dynCast<Break>(); if (flow && flow->name == name) { if (!flow->value) { @@ -129,11 +130,22 @@ struct RemoveUnusedBrs : public WalkerPass<PostWalker<RemoveUnusedBrs>> { flows.resize(size - skip); } // drop a nop at the end of a block, which prevents a value flowing - while (block->list.size() > 0 && block->list.back()->is<Nop>()) { - block->list.resize(block->list.size() - 1); + while (list.size() > 0 && list.back()->is<Nop>()) { + list.resize(list.size() - 1); self->anotherCycle = true; } } + // A value flowing is only valid if it is a value that the block actually + // flows out. If it is never reached, it does not flow out, and may be + // invalid to represent as such. + auto size = list.size(); + for (Index i = 0; i < size; i++) { + if (i != size - 1 && list[i]->type == unreachable) { + // No value flows out of this block. 
+ self->stopValueFlow(); + break; + } + } } else if (curr->is<Nop>()) { // ignore (could be result of a previous cycle) self->stopValueFlow(); @@ -280,7 +292,7 @@ struct RemoveUnusedBrs : public WalkerPass<PostWalker<RemoveUnusedBrs>> { } } // TODO: if-else can be turned into a br_if as well, if one of the sides is a dead end - // we handle the case of a returned value to a set_local later down, see + // we handle the case of a returned value to a local.set later down, see // visitSetLocal. } @@ -515,7 +527,7 @@ struct RemoveUnusedBrs : public WalkerPass<PostWalker<RemoveUnusedBrs>> { super::doWalkFunction(func); assert(ifStack.empty()); // flows may contain returns, which are flowing out and so can be optimized - for (size_t i = 0; i < flows.size(); i++) { + for (Index i = 0; i < flows.size(); i++) { auto* flow = (*flows[i])->dynCast<Return>(); if (!flow) continue; if (!flow->value) { @@ -835,7 +847,7 @@ struct RemoveUnusedBrs : public WalkerPass<PostWalker<RemoveUnusedBrs>> { } // If one arm is a br, we prefer a br_if and the set later: - // (set_local $x + // (local.set $x // (if (result i32) // (..condition..) // (br $somewhere) @@ -846,7 +858,7 @@ struct RemoveUnusedBrs : public WalkerPass<PostWalker<RemoveUnusedBrs>> { // (br_if $somewhere // (..condition..) // ) - // (set_local $x + // (local.set $x // (..result) // ) // TODO: handle a condition in the br? need to watch for side effects @@ -888,38 +900,38 @@ struct RemoveUnusedBrs : public WalkerPass<PostWalker<RemoveUnusedBrs>> { // we can remove. If this is not a tee, then we remove the get // as well as the if-else opcode in the binary format, which is // great: - // (set_local $x + // (local.set $x // (if (result i32) // (..condition..) // (..result) - // (get_local $x) + // (local.get $x) // ) // ) // => // (if // (..condition..) 
- // (set_local $x + // (local.set $x // (..result) // ) // ) // If this is a tee, then we can do the same operation but // inside a block, and keep the get: - // (tee_local $x + // (local.tee $x // (if (result i32) // (..condition..) // (..result) - // (get_local $x) + // (local.get $x) // ) // ) // => // (block (result i32) // (if // (..condition..) - // (set_local $x + // (local.set $x // (..result) // ) // ) - // (get_local $x) + // (local.get $x) // ) // We save the if-else opcode, and add the block's opcodes. // This may be detrimental, however, often the block can be diff --git a/src/passes/SSAify.cpp b/src/passes/SSAify.cpp index 35432d9eb..9e6ff2de2 100644 --- a/src/passes/SSAify.cpp +++ b/src/passes/SSAify.cpp @@ -34,6 +34,7 @@ #include "pass.h" #include "wasm-builder.h" #include "support/permutations.h" +#include "ir/find_all.h" #include "ir/literal-utils.h" #include "ir/local-graph.h" @@ -59,26 +60,24 @@ struct SSAify : public Pass { func = func_; LocalGraph graph(func); // create new local indexes, one for each set - createNewIndexes(graph); + createNewIndexes(); // we now know the sets for each get, and can compute get indexes and handle phis computeGetsAndPhis(graph); // add prepends to function addPrepends(); } - void createNewIndexes(LocalGraph& graph) { - for (auto& pair : graph.locations) { - auto* curr = pair.first; - if (auto* set = curr->dynCast<SetLocal>()) { - set->index = addLocal(func->getLocalType(set->index)); - } + void createNewIndexes() { + FindAll<SetLocal> sets(func->body); + for (auto* set : sets.list) { + set->index = addLocal(func->getLocalType(set->index)); } } void computeGetsAndPhis(LocalGraph& graph) { - for (auto& iter : graph.getSetses) { - auto* get = iter.first; - auto& sets = iter.second; + FindAll<GetLocal> gets(func->body); + for (auto* get : gets.list) { + auto& sets = graph.getSetses[get]; if (sets.size() == 0) { continue; // unreachable, ignore } diff --git a/src/passes/SafeHeap.cpp b/src/passes/SafeHeap.cpp index 
ce1adff15..f170041e1 100644 --- a/src/passes/SafeHeap.cpp +++ b/src/passes/SafeHeap.cpp @@ -109,7 +109,7 @@ struct SafeHeap : public Pass { instrumenter.add<AccessInstrumenter>(); instrumenter.run(); // add helper checking funcs and imports - addGlobals(module); + addGlobals(module, runner->options.features); } Name dynamicTopPtr, segfault, alignfault; @@ -156,18 +156,22 @@ struct SafeHeap : public Pass { return align == bytes && shared && isIntegerType(type); } - void addGlobals(Module* module) { + void addGlobals(Module* module, FeatureSet features) { // load funcs Load load; - for (auto type : { i32, i64, f32, f64 }) { + for (auto type : { i32, i64, f32, f64, v128 }) { + if (type == v128 && !features.hasSIMD()) continue; load.type = type; - for (Index bytes : { 1, 2, 4, 8 }) { + for (Index bytes : { 1, 2, 4, 8, 16 }) { load.bytes = bytes; - if (bytes > getTypeSize(type)) continue; + if (bytes > getTypeSize(type) || + (type == f32 && bytes != 4) || + (type == f64 && bytes != 8) || + (type == v128 && bytes != 16)) continue; for (auto signed_ : { true, false }) { load.signed_ = signed_; if (isFloatType(type) && signed_) continue; - for (Index align : { 1, 2, 4, 8 }) { + for (Index align : { 1, 2, 4, 8, 16 }) { load.align = align; if (align > bytes) continue; for (auto isAtomic : { true, false }) { @@ -184,13 +188,17 @@ struct SafeHeap : public Pass { } // store funcs Store store; - for (auto valueType : { i32, i64, f32, f64 }) { + for (auto valueType : { i32, i64, f32, f64, v128 }) { + if (valueType == v128 && !features.hasSIMD()) continue; store.valueType = valueType; store.type = none; - for (Index bytes : { 1, 2, 4, 8 }) { + for (Index bytes : { 1, 2, 4, 8, 16 }) { store.bytes = bytes; - if (bytes > getTypeSize(valueType)) continue; - for (Index align : { 1, 2, 4, 8 }) { + if (bytes > getTypeSize(valueType) || + (valueType == f32 && bytes != 4) || + (valueType == f64 && bytes != 8) || + (valueType == v128 && bytes != 16)) continue; + for (Index align : { 1, 2, 
4, 8, 16 }) { store.align = align; if (align > bytes) continue; for (auto isAtomic : { true, false }) { diff --git a/src/passes/SimplifyLocals.cpp b/src/passes/SimplifyLocals.cpp index 75dadfbac..91f0f8d4f 100644 --- a/src/passes/SimplifyLocals.cpp +++ b/src/passes/SimplifyLocals.cpp @@ -17,15 +17,15 @@ // // Locals-related optimizations // -// This "sinks" set_locals, pushing them to the next get_local where possible, +// This "sinks" local.sets, pushing them to the next local.get where possible, // and removing the set if there are no gets remaining (the latter is // particularly useful in ssa mode, but not only). // -// We also note where set_locals coalesce: if all breaks of a block set +// We also note where local.sets coalesce: if all breaks of a block set // a specific local, we can use a block return value for it, in effect -// removing multiple set_locals and replacing them with one that the +// removing multiple local.sets and replacing them with one that the // block returns to. Further optimization rounds then have the opportunity -// to remove that set_local as well. TODO: support partial traces; right +// to remove that local.set as well. TODO: support partial traces; right // now, whenever control flow splits, we invalidate everything. // // After this pass, some locals may be completely unused. reorder-locals @@ -37,7 +37,7 @@ // * Tee: allow teeing, i.e., sinking a local with more than one use, // and so after sinking we have a tee for the first use. // * Structure: create block and if return values, by merging the -// internal set_locals into one on the outside, +// internal local.sets into one on the outside, // that can itself then be sunk further. 
// // There is also an option to disallow nesting entirely, which disallows @@ -67,7 +67,7 @@ struct SimplifyLocals : public WalkerPass<LinearExecutionWalker<SimplifyLocals<a Pass* create() override { return new SimplifyLocals<allowTee, allowStructure, allowNesting>(); } - // information for a set_local we can sink + // information for a local.set we can sink struct SinkableInfo { Expression** item; EffectAnalyzer effects; @@ -109,7 +109,7 @@ struct SimplifyLocals : public WalkerPass<LinearExecutionWalker<SimplifyLocals<a // whether this is the first cycle, in which we always disallow teeing bool firstCycle; - // local => # of get_locals for it + // local => # of local.gets for it GetLocalCounter getCounter; static void doNoteNonLinear(SimplifyLocals<allowTee, allowStructure, allowNesting>* self, Expression** currp) { @@ -373,7 +373,7 @@ struct SimplifyLocals : public WalkerPass<LinearExecutionWalker<SimplifyLocals<a blockBreaks.erase(block->name); if (breaks.size() == 0) return; // block has no branches TODO we might optimize trivial stuff here too assert(!(*breaks[0].brp)->template cast<Break>()->value); // block does not already have a return value (if one break has one, they all do) - // look for a set_local that is present in them all + // look for a local.set that is present in them all bool found = false; Index sharedIndex = -1; for (auto& sinkable : sinkables) { @@ -398,19 +398,19 @@ struct SimplifyLocals : public WalkerPass<LinearExecutionWalker<SimplifyLocals<a // (br_if // (block // ..use $x.. - // (set_local $x ..) + // (local.set $x ..) // ) // ) // => // (br_if - // (tee_local $x ..) ;; this now affects the use! + // (local.tee $x ..) ;; this now affects the use! // (block // ..use $x.. // ) // ) // so we must check for that. 
for (size_t j = 0; j < breaks.size(); j++) { - // move break set_local's value to the break + // move break local.set's value to the break auto* breakSetLocalPointer = breaks[j].sinkables.at(sharedIndex).item; auto* brp = breaks[j].brp; auto* br = (*brp)->template cast<Break>(); @@ -446,14 +446,14 @@ struct SimplifyLocals : public WalkerPass<LinearExecutionWalker<SimplifyLocals<a blocksToEnlarge.push_back(block); return; } - // move block set_local's value to the end, in return position, and nop the set + // move block local.set's value to the end, in return position, and nop the set auto* blockSetLocalPointer = sinkables.at(sharedIndex).item; auto* value = (*blockSetLocalPointer)->template cast<SetLocal>()->value; block->list[block->list.size() - 1] = value; block->type = value->type; ExpressionManipulator::nop(*blockSetLocalPointer); for (size_t j = 0; j < breaks.size(); j++) { - // move break set_local's value to the break + // move break local.set's value to the break auto* breakSetLocalPointer = breaks[j].sinkables.at(sharedIndex).item; auto* brp = breaks[j].brp; auto* br = (*brp)->template cast<Break>(); @@ -472,14 +472,14 @@ struct SimplifyLocals : public WalkerPass<LinearExecutionWalker<SimplifyLocals<a ExpressionManipulator::nop(set); } } - // finally, create a set_local on the block itself + // finally, create a local.set on the block itself auto* newSetLocal = Builder(*this->getModule()).makeSetLocal(sharedIndex, block); this->replaceCurrent(newSetLocal); sinkables.clear(); anotherCycle = true; } - // optimize set_locals from both sides of an if into a return value + // optimize local.sets from both sides of an if into a return value void optimizeIfElseReturn(If* iff, Expression** currp, Sinkables& ifTrue) { assert(iff->ifFalse); // if this if already has a result, or is unreachable code, we have @@ -491,10 +491,10 @@ struct SimplifyLocals : public WalkerPass<LinearExecutionWalker<SimplifyLocals<a // (if // (..) 
// (br $x) - // (set_local $y (..)) + // (local.set $y (..)) // ) // => - // (set_local $y + // (local.set $y // (if (result i32) // (..) // (br $x) @@ -562,27 +562,27 @@ struct SimplifyLocals : public WalkerPass<LinearExecutionWalker<SimplifyLocals<a } iff->finalize(); // update type assert(iff->type != none); - // finally, create a set_local on the iff itself + // finally, create a local.set on the iff itself auto* newSetLocal = Builder(*this->getModule()).makeSetLocal(goodIndex, iff); *currp = newSetLocal; anotherCycle = true; } - // Optimize set_locals from a one-sided iff, adding a get on the other: + // Optimize local.sets from a one-sided iff, adding a get on the other: // (if // (..condition..) // (block - // (set_local $x (..value..)) + // (local.set $x (..value..)) // ) // ) // => - // (set_local $x + // (local.set $x // (if (result ..) // (..condition..) // (block (result ..) // (..value..) // ) - // (get_local $x) + // (local.get $x) // ) // ) // This is a speculative optimization: we add a get here, as well as a branch @@ -617,7 +617,7 @@ struct SimplifyLocals : public WalkerPass<LinearExecutionWalker<SimplifyLocals<a // Update the get count. getCounter.num[set->index]++; assert(iff->type != none); - // Finally, reuse the set_local on the iff itself. + // Finally, reuse the local.set on the iff itself. 
set->value = iff; set->finalize(); *currp = set; @@ -648,7 +648,7 @@ struct SimplifyLocals : public WalkerPass<LinearExecutionWalker<SimplifyLocals<a } void doWalkFunction(Function* func) { - // scan get_locals + // scan local.gets getCounter.analyze(func); // multiple passes may be required per function, consider this: // x = load @@ -741,11 +741,11 @@ struct SimplifyLocals : public WalkerPass<LinearExecutionWalker<SimplifyLocals<a // we do that at the very end, and only after structure, as removing // the copy here: // (if - // (get_local $var$0) - // (set_local $var$0 - // (get_local $var$0) + // (local.get $var$0) + // (local.set $var$0 + // (local.get $var$0) // ) - // (set_local $var$0 + // (local.set $var$0 // (i32.const 208) // ) // ) diff --git a/src/passes/Souperify.cpp b/src/passes/Souperify.cpp index 5875c8f42..62ec133fe 100644 --- a/src/passes/Souperify.cpp +++ b/src/passes/Souperify.cpp @@ -131,7 +131,7 @@ struct UseFinder { }; // Generates a trace: all the information to generate a Souper LHS -// for a specific set_local whose value we want to infer. +// for a specific local.set whose value we want to infer. struct Trace { Graph& graph; Node* toInfer; diff --git a/src/passes/StackIR.cpp b/src/passes/StackIR.cpp index 3772500c4..a8d66ae42 100644 --- a/src/passes/StackIR.cpp +++ b/src/passes/StackIR.cpp @@ -118,12 +118,12 @@ private: } } - // If ordered properly, we can avoid a set_local/get_local pair, + // If ordered properly, we can avoid a local.set/local.get pair, // and use the value directly from the stack, for example // [..produce a value on the stack..] - // set_local $x + // local.set $x // [..much code..] 
- // get_local $x + // local.get $x // call $foo ;; use the value, foo(value) // As long as the code in between does not modify $x, and has // no control flow branching out, we can remove both the set diff --git a/src/passes/Untee.cpp b/src/passes/Untee.cpp index b61875243..00f2ffe5d 100644 --- a/src/passes/Untee.cpp +++ b/src/passes/Untee.cpp @@ -15,7 +15,7 @@ */ // -// Removes tee_locals, replacing them with gets and sets. +// Removes local.tees, replacing them with gets and sets. // // This makes the code "flatter", with less nested side // effects. That can make some passes, like CodePushing, diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp index c42a3d144..cae69860a 100644 --- a/src/passes/pass.cpp +++ b/src/passes/pass.cpp @@ -85,6 +85,7 @@ void PassRegistry::registerPasses() { registerPass("inlining", "inline functions (you probably want inlining-optimizing)", createInliningPass); registerPass("inlining-optimizing", "inline functions and optimizes where we inlined", createInliningOptimizingPass); registerPass("legalize-js-interface", "legalizes i64 types on the import/export boundary", createLegalizeJSInterfacePass); + registerPass("legalize-js-interface-minimally", "legalizes i64 types on the import/export boundary in a minimal manner, only on things only JS will call", createLegalizeJSInterfaceMinimallyPass); registerPass("local-cse", "common subexpression elimination inside basic blocks", createLocalCSEPass); registerPass("log-execution", "instrument the build with logging of where execution goes", createLogExecutionPass); registerPass("i64-to-i32-lowering", "lower all uses of i64s to use i32s instead", createI64ToI32LoweringPass); @@ -98,6 +99,7 @@ void PassRegistry::registerPasses() { registerPass("minify-imports", "minifies import names (only those, and not export names), and emits a mapping to the minified ones", createMinifyImportsPass); registerPass("minify-imports-and-exports", "minifies both import and export names, and emits a mapping to 
the minified ones", createMinifyImportsAndExportsPass); registerPass("nm", "name list", createNameListPass); + registerPass("no-exit-runtime", "removes calls to atexit(), which is valid if the C runtime will never be exited", createNoExitRuntimePass); registerPass("optimize-instructions", "optimizes instruction combinations", createOptimizeInstructionsPass); registerPass("optimize-stack-ir", "optimize Stack IR", createOptimizeStackIRPass); registerPass("pick-load-signs", "pick load signs based on their uses", createPickLoadSignsPass); @@ -120,7 +122,7 @@ void PassRegistry::registerPasses() { registerPass("reorder-functions", "sorts functions by access frequency", createReorderFunctionsPass); registerPass("reorder-locals", "sorts locals by access frequency", createReorderLocalsPass); registerPass("rereloop", "re-optimize control flow using the relooper algorithm", createReReloopPass); - registerPass("rse", "remove redundant set_locals", createRedundantSetEliminationPass); + registerPass("rse", "remove redundant local.sets", createRedundantSetEliminationPass); registerPass("safe-heap", "instrument loads and stores to check for invalid behavior", createSafeHeapPass); registerPass("simplify-locals", "miscellaneous locals-related optimizations", createSimplifyLocalsPass); registerPass("simplify-locals-nonesting", "miscellaneous locals-related optimizations (no nesting at all; preserves flatness)", createSimplifyLocalsNoNestingPass); @@ -134,7 +136,7 @@ void PassRegistry::registerPasses() { registerPass("strip", "strip debug info (including the names section)", createStripPass); registerPass("trap-mode-clamp", "replace trapping operations with clamping semantics", createTrapModeClamp); registerPass("trap-mode-js", "replace trapping operations with js semantics", createTrapModeJS); - registerPass("untee", "removes tee_locals, replacing them with sets and gets", createUnteePass); + registerPass("untee", "removes local.tees, replacing them with sets and gets", 
createUnteePass); registerPass("vacuum", "removes obviously unneeded code", createVacuumPass); // registerPass("lower-i64", "lowers i64 into pairs of i32s", createLowerInt64Pass); } diff --git a/src/passes/passes.h b/src/passes/passes.h index 8f5ca7e0a..b04303429 100644 --- a/src/passes/passes.h +++ b/src/passes/passes.h @@ -42,6 +42,7 @@ Pass* createI64ToI32LoweringPass(); Pass* createInliningPass(); Pass* createInliningOptimizingPass(); Pass* createLegalizeJSInterfacePass(); +Pass* createLegalizeJSInterfaceMinimallyPass(); Pass* createLocalCSEPass(); Pass* createLogExecutionPass(); Pass* createInstrumentLocalsPass(); @@ -55,6 +56,7 @@ Pass* createMinifyImportsPass(); Pass* createMinifyImportsAndExportsPass(); Pass* createMetricsPass(); Pass* createNameListPass(); +Pass* createNoExitRuntimePass(); Pass* createOptimizeInstructionsPass(); Pass* createOptimizeStackIRPass(); Pass* createPickLoadSignsPass(); diff --git a/src/passes/wasm-intrinsics.wast b/src/passes/wasm-intrinsics.wast index 8cd14d51d..26687508d 100644 --- a/src/passes/wasm-intrinsics.wast +++ b/src/passes/wasm-intrinsics.wast @@ -40,24 +40,24 @@ (loop $label$2 (drop (br_if $label$1 - (get_local $var$1) + (local.get $var$1) (i32.eqz - (get_local $var$0) + (local.get $var$0) ) ) ) - (set_local $var$0 + (local.set $var$0 (i32.and - (get_local $var$0) + (local.get $var$0) (i32.sub - (get_local $var$0) + (local.get $var$0) (i32.const 1) ) ) ) - (set_local $var$1 + (local.set $var$1 (i32.add - (get_local $var$1) + (local.get $var$1) (i32.const 1) ) ) @@ -73,24 +73,24 @@ (loop $label$2 (drop (br_if $label$1 - (get_local $var$1) + (local.get $var$1) (i64.eqz - (get_local $var$0) + (local.get $var$0) ) ) ) - (set_local $var$0 + (local.set $var$0 (i64.and - (get_local $var$0) + (local.get $var$0) (i64.sub - (get_local $var$0) + (local.get $var$0) (i64.const 1) ) ) ) - (set_local $var$1 + (local.set $var$1 (i64.add - (get_local $var$1) + (local.get $var$1) (i64.const 1) ) ) @@ -101,30 +101,30 @@ ;; lowering of 
the i64.div_s instruction, return $var0 / $var$1 (func $__wasm_i64_sdiv (; 0 ;) (type $0) (param $var$0 i64) (param $var$1 i64) (result i64) (call $_ZN17compiler_builtins3int4sdiv3Div3div17he78fc483e41d7ec7E - (get_local $var$0) - (get_local $var$1) + (local.get $var$0) + (local.get $var$1) ) ) ;; lowering of the i64.div_u instruction, return $var0 / $var$1 (func $__wasm_i64_udiv (; 1 ;) (type $0) (param $var$0 i64) (param $var$1 i64) (result i64) (call $_ZN17compiler_builtins3int4udiv10divmod_u6417h6026910b5ed08e40E - (get_local $var$0) - (get_local $var$1) + (local.get $var$0) + (local.get $var$1) ) ) ;; lowering of the i64.rem_s instruction, return $var0 % $var$1 (func $__wasm_i64_srem (; 2 ;) (type $0) (param $var$0 i64) (param $var$1 i64) (result i64) (call $_ZN17compiler_builtins3int4sdiv3Mod4mod_17h2cbb7bbf36e41d68E - (get_local $var$0) - (get_local $var$1) + (local.get $var$0) + (local.get $var$1) ) ) ;; lowering of the i64.rem_u instruction, return $var0 % $var$1 (func $__wasm_i64_urem (; 3 ;) (type $0) (param $var$0 i64) (param $var$1 i64) (result i64) (drop (call $_ZN17compiler_builtins3int4udiv10divmod_u6417h6026910b5ed08e40E - (get_local $var$0) - (get_local $var$1) + (local.get $var$0) + (local.get $var$1) ) ) (i64.load @@ -134,8 +134,8 @@ ;; lowering of the i64.mul instruction, return $var0 * $var$1 (func $__wasm_i64_mul (; 4 ;) (type $0) (param $var$0 i64) (param $var$1 i64) (result i64) (call $_ZN17compiler_builtins3int3mul3Mul3mul17h070e9a1c69faec5bE - (get_local $var$0) - (get_local $var$1) + (local.get $var$0) + (local.get $var$1) ) ) ;; lowering of the f32.trunc instruction, rounds to the nearest integer, @@ -143,13 +143,13 @@ (func $__wasm_trunc_f32 (; 5 ;) (type $1) (param $var$0 f32) (result f32) (select (f32.ceil - (get_local $var$0) + (local.get $var$0) ) (f32.floor - (get_local $var$0) + (local.get $var$0) ) (f32.lt - (get_local $var$0) + (local.get $var$0) (f32.const 0) ) ) @@ -159,13 +159,13 @@ (func $__wasm_trunc_f64 (; 6 ;) (type $2) 
(param $var$0 f64) (result f64) (select (f64.ceil - (get_local $var$0) + (local.get $var$0) ) (f64.floor - (get_local $var$0) + (local.get $var$0) ) (f64.lt - (get_local $var$0) + (local.get $var$0) (f64.const 0) ) ) @@ -173,17 +173,17 @@ ;; lowering of the i32.ctz instruction, counting the number of zeros in $var$0 (func $__wasm_ctz_i32 (; 7 ;) (type $3) (param $var$0 i32) (result i32) (if - (get_local $var$0) + (local.get $var$0) (return (i32.sub (i32.const 31) (i32.clz (i32.xor (i32.add - (get_local $var$0) + (local.get $var$0) (i32.const -1) ) - (get_local $var$0) + (local.get $var$0) ) ) ) @@ -196,7 +196,7 @@ (if (i32.eqz (i64.eqz - (get_local $var$0) + (local.get $var$0) ) ) (return @@ -205,10 +205,10 @@ (i64.clz (i64.xor (i64.add - (get_local $var$0) + (local.get $var$0) (i64.const -1) ) - (get_local $var$0) + (local.get $var$0) ) ) ) @@ -225,34 +225,34 @@ (i32.and (i32.shr_u (i32.const -1) - (tee_local $var$2 + (local.tee $var$2 (i32.and - (get_local $var$1) + (local.get $var$1) (i32.const 31) ) ) ) - (get_local $var$0) + (local.get $var$0) ) - (get_local $var$2) + (local.get $var$2) ) (i32.shr_u (i32.and (i32.shl (i32.const -1) - (tee_local $var$1 + (local.tee $var$1 (i32.and (i32.sub (i32.const 0) - (get_local $var$1) + (local.get $var$1) ) (i32.const 31) ) ) ) - (get_local $var$0) + (local.get $var$0) ) - (get_local $var$1) + (local.get $var$1) ) ) ) @@ -265,34 +265,34 @@ (i32.and (i32.shl (i32.const -1) - (tee_local $var$2 + (local.tee $var$2 (i32.and - (get_local $var$1) + (local.get $var$1) (i32.const 31) ) ) ) - (get_local $var$0) + (local.get $var$0) ) - (get_local $var$2) + (local.get $var$2) ) (i32.shl (i32.and (i32.shr_u (i32.const -1) - (tee_local $var$1 + (local.tee $var$1 (i32.and (i32.sub (i32.const 0) - (get_local $var$1) + (local.get $var$1) ) (i32.const 31) ) ) ) - (get_local $var$0) + (local.get $var$0) ) - (get_local $var$1) + (local.get $var$1) ) ) ) @@ -305,34 +305,34 @@ (i64.and (i64.shr_u (i64.const -1) - (tee_local $var$2 + 
(local.tee $var$2 (i64.and - (get_local $var$1) + (local.get $var$1) (i64.const 63) ) ) ) - (get_local $var$0) + (local.get $var$0) ) - (get_local $var$2) + (local.get $var$2) ) (i64.shr_u (i64.and (i64.shl (i64.const -1) - (tee_local $var$1 + (local.tee $var$1 (i64.and (i64.sub (i64.const 0) - (get_local $var$1) + (local.get $var$1) ) (i64.const 63) ) ) ) - (get_local $var$0) + (local.get $var$0) ) - (get_local $var$1) + (local.get $var$1) ) ) ) @@ -345,34 +345,34 @@ (i64.and (i64.shl (i64.const -1) - (tee_local $var$2 + (local.tee $var$2 (i64.and - (get_local $var$1) + (local.get $var$1) (i64.const 63) ) ) ) - (get_local $var$0) + (local.get $var$0) ) - (get_local $var$2) + (local.get $var$2) ) (i64.shl (i64.and (i64.shr_u (i64.const -1) - (tee_local $var$1 + (local.tee $var$1 (i64.and (i64.sub (i64.const 0) - (get_local $var$1) + (local.get $var$1) ) (i64.const 63) ) ) ) - (get_local $var$0) + (local.get $var$0) ) - (get_local $var$1) + (local.get $var$1) ) ) ) @@ -384,12 +384,12 @@ (if (i32.eqz (f32.lt - (tee_local $var$2 + (local.tee $var$2 (f32.sub - (get_local $var$0) - (tee_local $var$1 + (local.get $var$0) + (local.tee $var$1 (f32.floor - (get_local $var$0) + (local.get $var$0) ) ) ) @@ -398,34 +398,34 @@ ) ) (block - (set_local $var$0 + (local.set $var$0 (f32.ceil - (get_local $var$0) + (local.get $var$0) ) ) (if (f32.gt - (get_local $var$2) + (local.get $var$2) (f32.const 0.5) ) (return - (get_local $var$0) + (local.get $var$0) ) ) - (set_local $var$1 + (local.set $var$1 (select - (get_local $var$1) - (get_local $var$0) + (local.get $var$1) + (local.get $var$0) (f32.eq (f32.sub - (tee_local $var$2 + (local.tee $var$2 (f32.mul - (get_local $var$1) + (local.get $var$1) (f32.const 0.5) ) ) (f32.floor - (get_local $var$2) + (local.get $var$2) ) ) (f32.const 0) @@ -434,7 +434,7 @@ ) ) ) - (get_local $var$1) + (local.get $var$1) ) ;; lowering of the f64.nearest instruction, rounding the input to the nearest ;; integer while breaking ties by rounding to even @@ 
-444,12 +444,12 @@ (if (i32.eqz (f64.lt - (tee_local $var$2 + (local.tee $var$2 (f64.sub - (get_local $var$0) - (tee_local $var$1 + (local.get $var$0) + (local.tee $var$1 (f64.floor - (get_local $var$0) + (local.get $var$0) ) ) ) @@ -458,34 +458,34 @@ ) ) (block - (set_local $var$0 + (local.set $var$0 (f64.ceil - (get_local $var$0) + (local.get $var$0) ) ) (if (f64.gt - (get_local $var$2) + (local.get $var$2) (f64.const 0.5) ) (return - (get_local $var$0) + (local.get $var$0) ) ) - (set_local $var$1 + (local.set $var$1 (select - (get_local $var$1) - (get_local $var$0) + (local.get $var$1) + (local.get $var$0) (f64.eq (f64.sub - (tee_local $var$2 + (local.tee $var$2 (f64.mul - (get_local $var$1) + (local.get $var$1) (f64.const 0.5) ) ) (f64.floor - (get_local $var$2) + (local.get $var$2) ) ) (f64.const 0) @@ -494,7 +494,7 @@ ) ) ) - (get_local $var$1) + (local.get $var$1) ) (func $_ZN17compiler_builtins3int4udiv10divmod_u6417h6026910b5ed08e40E (; 14 ;) (type $0) (param $var$0 i64) (param $var$1 i64) (result i64) (local $var$2 i32) @@ -516,10 +516,10 @@ (block $label$10 (block $label$11 (if - (tee_local $var$2 - (i32.wrap/i64 + (local.tee $var$2 + (i32.wrap_i64 (i64.shr_u - (get_local $var$0) + (local.get $var$0) (i64.const 32) ) ) @@ -527,19 +527,19 @@ (block (br_if $label$11 (i32.eqz - (tee_local $var$3 - (i32.wrap/i64 - (get_local $var$1) + (local.tee $var$3 + (i32.wrap_i64 + (local.get $var$1) ) ) ) ) (br_if $label$9 (i32.eqz - (tee_local $var$4 - (i32.wrap/i64 + (local.tee $var$4 + (i32.wrap_i64 (i64.shr_u - (get_local $var$1) + (local.get $var$1) (i64.const 32) ) ) @@ -548,13 +548,13 @@ ) (br_if $label$8 (i32.le_u - (tee_local $var$2 + (local.tee $var$2 (i32.sub (i32.clz - (get_local $var$4) + (local.get $var$4) ) (i32.clz - (get_local $var$2) + (local.get $var$2) ) ) ) @@ -566,97 +566,97 @@ ) (br_if $label$2 (i64.ge_u - (get_local $var$1) + (local.get $var$1) (i64.const 4294967296) ) ) (i64.store (i32.const 1024) - (i64.extend_u/i32 + (i64.extend_i32_u 
(i32.sub - (tee_local $var$2 - (i32.wrap/i64 - (get_local $var$0) + (local.tee $var$2 + (i32.wrap_i64 + (local.get $var$0) ) ) (i32.mul - (tee_local $var$2 + (local.tee $var$2 (i32.div_u - (get_local $var$2) - (tee_local $var$3 - (i32.wrap/i64 - (get_local $var$1) + (local.get $var$2) + (local.tee $var$3 + (i32.wrap_i64 + (local.get $var$1) ) ) ) ) - (get_local $var$3) + (local.get $var$3) ) ) ) ) (return - (i64.extend_u/i32 - (get_local $var$2) + (i64.extend_i32_u + (local.get $var$2) ) ) ) - (set_local $var$3 - (i32.wrap/i64 + (local.set $var$3 + (i32.wrap_i64 (i64.shr_u - (get_local $var$1) + (local.get $var$1) (i64.const 32) ) ) ) (br_if $label$7 (i32.eqz - (i32.wrap/i64 - (get_local $var$0) + (i32.wrap_i64 + (local.get $var$0) ) ) ) (br_if $label$6 (i32.eqz - (get_local $var$3) + (local.get $var$3) ) ) (br_if $label$6 (i32.and - (tee_local $var$4 + (local.tee $var$4 (i32.add - (get_local $var$3) + (local.get $var$3) (i32.const -1) ) ) - (get_local $var$3) + (local.get $var$3) ) ) (i64.store (i32.const 1024) (i64.or (i64.shl - (i64.extend_u/i32 + (i64.extend_i32_u (i32.and - (get_local $var$4) - (get_local $var$2) + (local.get $var$4) + (local.get $var$2) ) ) (i64.const 32) ) (i64.and - (get_local $var$0) + (local.get $var$0) (i64.const 4294967295) ) ) ) (return - (i64.extend_u/i32 + (i64.extend_i32_u (i32.shr_u - (get_local $var$2) + (local.get $var$2) (i32.and (i32.ctz - (get_local $var$3) + (local.get $var$3) ) (i32.const 31) ) @@ -669,29 +669,29 @@ (br_if $label$5 (i32.eqz (i32.and - (tee_local $var$4 + (local.tee $var$4 (i32.add - (get_local $var$3) + (local.get $var$3) (i32.const -1) ) ) - (get_local $var$3) + (local.get $var$3) ) ) ) - (set_local $var$3 + (local.set $var$3 (i32.sub (i32.const 0) - (tee_local $var$2 + (local.tee $var$2 (i32.sub (i32.add (i32.clz - (get_local $var$3) + (local.get $var$3) ) (i32.const 33) ) (i32.clz - (get_local $var$2) + (local.get $var$2) ) ) ) @@ -699,15 +699,15 @@ ) (br $label$3) ) - (set_local $var$3 + (local.set 
$var$3 (i32.sub (i32.const 63) - (get_local $var$2) + (local.get $var$2) ) ) - (set_local $var$2 + (local.set $var$2 (i32.add - (get_local $var$2) + (local.get $var$2) (i32.const 1) ) ) @@ -716,17 +716,17 @@ (i64.store (i32.const 1024) (i64.shl - (i64.extend_u/i32 + (i64.extend_i32_u (i32.sub - (get_local $var$2) + (local.get $var$2) (i32.mul - (tee_local $var$4 + (local.tee $var$4 (i32.div_u - (get_local $var$2) - (get_local $var$3) + (local.get $var$2) + (local.get $var$3) ) ) - (get_local $var$3) + (local.get $var$3) ) ) ) @@ -734,20 +734,20 @@ ) ) (return - (i64.extend_u/i32 - (get_local $var$4) + (i64.extend_i32_u + (local.get $var$4) ) ) ) (br_if $label$4 (i32.lt_u - (tee_local $var$2 + (local.tee $var$2 (i32.sub (i32.clz - (get_local $var$3) + (local.get $var$3) ) (i32.clz - (get_local $var$2) + (local.get $var$2) ) ) ) @@ -758,62 +758,62 @@ ) (i64.store (i32.const 1024) - (i64.extend_u/i32 + (i64.extend_i32_u (i32.and - (get_local $var$4) - (i32.wrap/i64 - (get_local $var$0) + (local.get $var$4) + (i32.wrap_i64 + (local.get $var$0) ) ) ) ) (br_if $label$1 (i32.eq - (get_local $var$3) + (local.get $var$3) (i32.const 1) ) ) (return (i64.shr_u - (get_local $var$0) - (i64.extend_u/i32 + (local.get $var$0) + (i64.extend_i32_u (i32.ctz - (get_local $var$3) + (local.get $var$3) ) ) ) ) ) - (set_local $var$3 + (local.set $var$3 (i32.sub (i32.const 63) - (get_local $var$2) + (local.get $var$2) ) ) - (set_local $var$2 + (local.set $var$2 (i32.add - (get_local $var$2) + (local.get $var$2) (i32.const 1) ) ) ) - (set_local $var$5 + (local.set $var$5 (i64.shr_u - (get_local $var$0) - (i64.extend_u/i32 + (local.get $var$0) + (i64.extend_i32_u (i32.and - (get_local $var$2) + (local.get $var$2) (i32.const 63) ) ) ) ) - (set_local $var$0 + (local.set $var$0 (i64.shl - (get_local $var$0) - (i64.extend_u/i32 + (local.get $var$0) + (i64.extend_i32_u (i32.and - (get_local $var$3) + (local.get $var$3) (i32.const 63) ) ) @@ -821,64 +821,64 @@ ) (block $label$13 (if - (get_local 
$var$2) + (local.get $var$2) (block - (set_local $var$8 + (local.set $var$8 (i64.add - (get_local $var$1) + (local.get $var$1) (i64.const -1) ) ) (loop $label$15 - (set_local $var$5 + (local.set $var$5 (i64.sub - (tee_local $var$5 + (local.tee $var$5 (i64.or (i64.shl - (get_local $var$5) + (local.get $var$5) (i64.const 1) ) (i64.shr_u - (get_local $var$0) + (local.get $var$0) (i64.const 63) ) ) ) (i64.and - (tee_local $var$6 + (local.tee $var$6 (i64.shr_s (i64.sub - (get_local $var$8) - (get_local $var$5) + (local.get $var$8) + (local.get $var$5) ) (i64.const 63) ) ) - (get_local $var$1) + (local.get $var$1) ) ) ) - (set_local $var$0 + (local.set $var$0 (i64.or (i64.shl - (get_local $var$0) + (local.get $var$0) (i64.const 1) ) - (get_local $var$7) + (local.get $var$7) ) ) - (set_local $var$7 - (tee_local $var$6 + (local.set $var$7 + (local.tee $var$6 (i64.and - (get_local $var$6) + (local.get $var$6) (i64.const 1) ) ) ) (br_if $label$15 - (tee_local $var$2 + (local.tee $var$2 (i32.add - (get_local $var$2) + (local.get $var$2) (i32.const -1) ) ) @@ -890,27 +890,27 @@ ) (i64.store (i32.const 1024) - (get_local $var$5) + (local.get $var$5) ) (return (i64.or (i64.shl - (get_local $var$0) + (local.get $var$0) (i64.const 1) ) - (get_local $var$6) + (local.get $var$6) ) ) ) (i64.store (i32.const 1024) - (get_local $var$0) + (local.get $var$0) ) - (set_local $var$0 + (local.set $var$0 (i64.const 0) ) ) - (get_local $var$0) + (local.get $var$0) ) (func $_ZN17compiler_builtins3int3mul3Mul3mul17h070e9a1c69faec5bE (; 15 ;) (type $0) (param $var$0 i64) (param $var$1 i64) (result i64) (local $var$2 i32) @@ -920,27 +920,27 @@ (local $var$6 i32) (i64.or (i64.shl - (i64.extend_u/i32 + (i64.extend_i32_u (i32.add (i32.add (i32.add (i32.add (i32.mul - (tee_local $var$4 + (local.tee $var$4 (i32.shr_u - (tee_local $var$2 - (i32.wrap/i64 - (get_local $var$1) + (local.tee $var$2 + (i32.wrap_i64 + (local.get $var$1) ) ) (i32.const 16) ) ) - (tee_local $var$5 + (local.tee $var$5 (i32.shr_u 
- (tee_local $var$3 - (i32.wrap/i64 - (get_local $var$0) + (local.tee $var$3 + (i32.wrap_i64 + (local.get $var$0) ) ) (i32.const 16) @@ -948,40 +948,40 @@ ) ) (i32.mul - (get_local $var$2) - (i32.wrap/i64 + (local.get $var$2) + (i32.wrap_i64 (i64.shr_u - (get_local $var$0) + (local.get $var$0) (i64.const 32) ) ) ) ) (i32.mul - (i32.wrap/i64 + (i32.wrap_i64 (i64.shr_u - (get_local $var$1) + (local.get $var$1) (i64.const 32) ) ) - (get_local $var$3) + (local.get $var$3) ) ) (i32.shr_u - (tee_local $var$2 + (local.tee $var$2 (i32.add (i32.shr_u - (tee_local $var$6 + (local.tee $var$6 (i32.mul - (tee_local $var$2 + (local.tee $var$2 (i32.and - (get_local $var$2) + (local.get $var$2) (i32.const 65535) ) ) - (tee_local $var$3 + (local.tee $var$3 (i32.and - (get_local $var$3) + (local.get $var$3) (i32.const 65535) ) ) @@ -990,8 +990,8 @@ (i32.const 16) ) (i32.mul - (get_local $var$2) - (get_local $var$5) + (local.get $var$2) + (local.get $var$5) ) ) ) @@ -999,15 +999,15 @@ ) ) (i32.shr_u - (tee_local $var$2 + (local.tee $var$2 (i32.add (i32.and - (get_local $var$2) + (local.get $var$2) (i32.const 65535) ) (i32.mul - (get_local $var$4) - (get_local $var$3) + (local.get $var$4) + (local.get $var$3) ) ) ) @@ -1017,14 +1017,14 @@ ) (i64.const 32) ) - (i64.extend_u/i32 + (i64.extend_i32_u (i32.or (i32.shl - (get_local $var$2) + (local.get $var$2) (i32.const 16) ) (i32.and - (get_local $var$6) + (local.get $var$6) (i32.const 65535) ) ) @@ -1038,40 +1038,40 @@ (i64.div_u (i64.sub (i64.xor - (tee_local $var$2 + (local.tee $var$2 (i64.shr_s - (get_local $var$0) + (local.get $var$0) (i64.const 63) ) ) - (get_local $var$0) + (local.get $var$0) ) - (get_local $var$2) + (local.get $var$2) ) (i64.sub (i64.xor - (tee_local $var$2 + (local.tee $var$2 (i64.shr_s - (get_local $var$1) + (local.get $var$1) (i64.const 63) ) ) - (get_local $var$1) + (local.get $var$1) ) - (get_local $var$2) + (local.get $var$2) ) ) - (tee_local $var$0 + (local.tee $var$0 (i64.shr_s (i64.xor - (get_local 
$var$1) - (get_local $var$0) + (local.get $var$1) + (local.get $var$0) ) (i64.const 63) ) ) ) - (get_local $var$0) + (local.get $var$0) ) ) (func $_ZN17compiler_builtins3int4sdiv3Mod4mod_17h2cbb7bbf36e41d68E (; 17 ;) (type $0) (param $var$0 i64) (param $var$1 i64) (result i64) @@ -1081,32 +1081,32 @@ (i64.rem_u (i64.sub (i64.xor - (tee_local $var$2 + (local.tee $var$2 (i64.shr_s - (get_local $var$0) + (local.get $var$0) (i64.const 63) ) ) - (get_local $var$0) + (local.get $var$0) ) - (get_local $var$2) + (local.get $var$2) ) (i64.sub (i64.xor - (tee_local $var$0 + (local.tee $var$0 (i64.shr_s - (get_local $var$1) + (local.get $var$1) (i64.const 63) ) ) - (get_local $var$1) + (local.get $var$1) ) - (get_local $var$0) + (local.get $var$0) ) ) - (get_local $var$2) + (local.get $var$2) ) - (get_local $var$2) + (local.get $var$2) ) ) ;; custom section "linking", size 3 diff --git a/src/shared-constants.h b/src/shared-constants.h index 55d90b057..ae7d915ef 100644 --- a/src/shared-constants.h +++ b/src/shared-constants.h @@ -54,7 +54,7 @@ extern Name GROW_WASM_MEMORY, NEG_NAN, CASE, BR, - ANYFUNC, + FUNCREF, FAKE_RETURN, MUT, SPECTEST, diff --git a/src/shell-interface.h b/src/shell-interface.h index 23f1c7de5..fc6a5897c 100644 --- a/src/shell-interface.h +++ b/src/shell-interface.h @@ -183,11 +183,17 @@ struct ShellExternalInterface : ModuleInstance::ExternalInterface { uint32_t load32u(Address addr) override { return memory.get<uint32_t>(addr); } int64_t load64s(Address addr) override { return memory.get<int64_t>(addr); } uint64_t load64u(Address addr) override { return memory.get<uint64_t>(addr); } + std::array<uint8_t, 16> load128(Address addr) override { + return memory.get<std::array<uint8_t, 16>>(addr); + } void store8(Address addr, int8_t value) override { memory.set<int8_t>(addr, value); } void store16(Address addr, int16_t value) override { memory.set<int16_t>(addr, value); } void store32(Address addr, int32_t value) override { memory.set<int32_t>(addr, value); } 
void store64(Address addr, int64_t value) override { memory.set<int64_t>(addr, value); } + void store128(Address addr, const std::array<uint8_t, 16>& value) override { + memory.set<std::array<uint8_t, 16>>(addr, value); + } void growMemory(Address /*oldSize*/, Address newSize) override { memory.resize(newSize); diff --git a/src/support/alloc.h b/src/support/alloc.h new file mode 100644 index 000000000..86c49d2f5 --- /dev/null +++ b/src/support/alloc.h @@ -0,0 +1,55 @@ +/* + * Copyright 2019 WebAssembly Community Group participants + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// +// Allocation helpers +// + +#ifndef wasm_support_alloc_h +#define wasm_support_alloc_h + +#include <stdlib.h> + +#if defined(WIN32) || defined(_WIN32) +#include <malloc.h> +#endif + +namespace wasm { + +// An allocation of a specific size and a minimum alignment. Must be freed +// with aligned_free. Returns nullptr on failure. 
+inline void* aligned_malloc(size_t align, size_t size) { +#if defined(WIN32) || defined(_WIN32) + _set_errno(0); + void* ret = _aligned_malloc(size, align); + if (errno == ENOMEM) ret = nullptr; + return ret; +#else + return aligned_alloc(align, size); +#endif +} + +inline void aligned_free(void* ptr) { +#if defined(WIN32) || defined(_WIN32) + _aligned_free(ptr); +#else + free(ptr); +#endif +} + +} // namespace wasm + +#endif // wasm_support_alloc_h diff --git a/src/support/safe_integer.h b/src/support/safe_integer.h index ea5e16425..5bd807a18 100644 --- a/src/support/safe_integer.h +++ b/src/support/safe_integer.h @@ -20,6 +20,7 @@ #include <cstdint> namespace wasm { + bool isInteger(double x); bool isUInteger32(double x); bool isSInteger32(double x); @@ -39,6 +40,7 @@ bool isInRangeI32TruncS(int64_t i); bool isInRangeI32TruncU(int64_t i); bool isInRangeI64TruncS(int64_t i); bool isInRangeI64TruncU(int64_t i); + } // namespace wasm #endif // wasm_safe_integer_h diff --git a/src/tools/asm2wasm.cpp b/src/tools/asm2wasm.cpp index a001c08a6..4907033e5 100644 --- a/src/tools/asm2wasm.cpp +++ b/src/tools/asm2wasm.cpp @@ -96,7 +96,7 @@ int main(int argc, const char *argv[]) { [&wasmOnly](Options *o, const std::string& ) { wasmOnly = true; }) - .add("--no-legalize-javascript-ffi", "-nj", "Do not legalize (i64->i32, f32->f64) the imports and exports for interfacing with JS", Options::Arguments::Zero, + .add("--no-legalize-javascript-ffi", "-nj", "Do not fully legalize (i64->i32, f32->f64) the imports and exports for interfacing with JS", Options::Arguments::Zero, [&legalizeJavaScriptFFI](Options *o, const std::string& ) { legalizeJavaScriptFFI = false; }) diff --git a/src/tools/execution-results.h b/src/tools/execution-results.h index 935aff4cc..89e0440bb 100644 --- a/src/tools/execution-results.h +++ b/src/tools/execution-results.h @@ -143,4 +143,3 @@ struct ExecutionResults { }; } // namespace wasm - diff --git a/src/tools/feature-options.h b/src/tools/feature-options.h 
index 3b35656fc..282aa4d4b 100644 --- a/src/tools/feature-options.h +++ b/src/tools/feature-options.h @@ -29,12 +29,12 @@ struct FeatureOptions : public Options { FeatureOptions(const std::string& command, const std::string& description) : Options(command, description) { (*this) - .add("--mvp-features", "-mvp", "Disable all non-MVP features (default)", + .add("--mvp-features", "-mvp", "Disable all non-MVP features", Options::Arguments::Zero, [this](Options *o, const std::string& arguments) { passOptions.features = FeatureSet::MVP; }) - .add("--all-features", "-all", "Enable all features", + .add("--all-features", "-all", "Enable all features (default)", Options::Arguments::Zero, [this](Options *o, const std::string& arguments) { passOptions.features = FeatureSet::All; @@ -70,7 +70,20 @@ struct FeatureOptions : public Options { Options::Arguments::Zero, [this](Options *o, const std::string& arguments) { passOptions.features.setTruncSat(false); - }); + }) + .add("--enable-simd", "", + "Enable SIMD operations and instructions", + Options::Arguments::Zero, + [this](Options *o, const std::string& arguments) { + passOptions.features.setSIMD(); + }) + .add("--disable-simd", "", + "Disable SIMD operations and instructions", + Options::Arguments::Zero, + [this](Options *o, const std::string& arguments) { + passOptions.features.setSIMD(false); + }) + ; } FeatureSet getFeatures() const { diff --git a/src/tools/fuzzing.h b/src/tools/fuzzing.h index dcb47529f..56c633f14 100644 --- a/src/tools/fuzzing.h +++ b/src/tools/fuzzing.h @@ -177,6 +177,7 @@ private: // Optionally remove NaNs, which are a source of nondeterminism (which makes // cross-VM comparisons harder) + // TODO: de-NaN SIMD values static const bool DE_NAN = true; // Features allowed to be emitted @@ -313,6 +314,7 @@ private: func->base = name; func->params.push_back(type); func->result = none; + func->type = ensureFunctionType(getSig(func), &wasm)->name; wasm.addFunction(func); } } @@ -688,8 +690,8 @@ private: 
case i32: case i64: case f32: - case f64: ret = _makeConcrete(type); break; - case v128: assert(false && "v128 not implemented yet"); + case f64: + case v128: ret = _makeConcrete(type); break; case none: ret = _makenone(); break; case unreachable: ret = _makeunreachable(); break; } @@ -707,24 +709,28 @@ private: if (choice < 70) return makeIf(type); if (choice < 80) return makeLoop(type); if (choice < 90) return makeBreak(type); - switch (upTo(15)) { - case 0: return makeBlock(type); - case 1: return makeIf(type); - case 2: return makeLoop(type); - case 3: return makeBreak(type); - case 4: return makeCall(type); - case 5: return makeCallIndirect(type); - case 6: return makeGetLocal(type); - case 7: return makeSetLocal(type); - case 8: return makeLoad(type); - case 9: return makeConst(type); - case 10: return makeUnary(type); - case 11: return makeBinary(type); - case 12: return makeSelect(type); - case 13: return makeGetGlobal(type); - case 14: return makeAtomic(type); - } - WASM_UNREACHABLE(); + using Self = TranslateToFuzzReader; + auto options = FeatureOptions<Expression* (Self::*)(Type)>() + .add(FeatureSet::MVP, + &Self::makeBlock, + &Self::makeIf, + &Self::makeLoop, + &Self::makeBreak, + &Self::makeCall, + &Self::makeCallIndirect, + &Self::makeGetLocal, + &Self::makeSetLocal, + &Self::makeLoad, + &Self::makeConst, + &Self::makeUnary, + &Self::makeBinary, + &Self::makeSelect, + &Self::makeGetGlobal) + .add(FeatureSet::SIMD, &Self::makeSIMD); + if (type == i32 || type == i64) { + options.add(FeatureSet::Atomics, &Self::makeAtomic); + } + return (this->*pick(options))(type); } Expression* _makenone() { @@ -881,18 +887,18 @@ private: } } + Expression* buildIf(const struct ThreeArgs& args) { + return builder.makeIf(args.a, args.b, args.c); + } + Expression* makeIf(Type type) { auto* condition = makeCondition(); hangStack.push_back(nullptr); - auto* ret = makeIf({ condition, makeMaybeBlock(type), makeMaybeBlock(type) }); + auto* ret = buildIf({ condition, 
makeMaybeBlock(type), makeMaybeBlock(type) }); hangStack.pop_back(); return ret; } - Expression* makeIf(const struct ThreeArgs& args) { - return builder.makeIf(args.a, args.b, args.c); - } - Expression* makeBreak(Type type) { if (breakableStack.empty()) return makeTrivial(type); Expression* condition = nullptr; @@ -1079,7 +1085,7 @@ private: return ret; } - Load* makeNonAtomicLoad(Type type) { + Expression* makeNonAtomicLoad(Type type) { auto offset = logify(get()); auto ptr = makePointer(); switch (type) { @@ -1108,7 +1114,12 @@ private: case f64: { return builder.makeLoad(8, false, offset, pick(1, 2, 4, 8), ptr, type); } - case v128: assert(false && "v128 not implemented yet"); + case v128: { + if (!features.hasSIMD()) { + return makeTrivial(type); + } + return builder.makeLoad(16, false, offset, pick(1, 2, 4, 8, 16), ptr, type); + } case none: case unreachable: WASM_UNREACHABLE(); } @@ -1120,24 +1131,27 @@ private: if (type != i32 && type != i64) return ret; if (!features.hasAtomics() || oneIn(2)) return ret; // make it atomic + auto* load = ret->cast<Load>(); wasm.memory.shared = true; - ret->isAtomic = true; - ret->signed_ = false; - ret->align = ret->bytes; - return ret; + load->isAtomic = true; + load->signed_ = false; + load->align = load->bytes; + return load; } - Store* makeNonAtomicStore(Type type) { + Expression* makeNonAtomicStore(Type type) { if (type == unreachable) { // make a normal store, then make it unreachable auto* ret = makeNonAtomicStore(getConcreteType()); + auto* store = ret->dynCast<Store>(); + if (!store) return ret; switch (upTo(3)) { - case 0: ret->ptr = make(unreachable); break; - case 1: ret->value = make(unreachable); break; - case 2: ret->ptr = make(unreachable); ret->value = make(unreachable); break; + case 0: store->ptr = make(unreachable); break; + case 1: store->value = make(unreachable); break; + case 2: store->ptr = make(unreachable); store->value = make(unreachable); break; } - ret->finalize(); - return ret; + 
store->finalize(); + return store; } // the type is none or unreachable. we also need to pick the value // type. @@ -1171,35 +1185,66 @@ private: case f64: { return builder.makeStore(8, offset, pick(1, 2, 4, 8), ptr, value, type); } - case v128: assert(false && "v128 not implemented yet"); + case v128: { + if (!features.hasSIMD()) { + return makeTrivial(type); + } + return builder.makeStore(16, offset, pick(1, 2, 4, 8, 16), ptr, value, type); + } case none: case unreachable: WASM_UNREACHABLE(); } WASM_UNREACHABLE(); } - Store* makeStore(Type type) { + Expression* makeStore(Type type) { auto* ret = makeNonAtomicStore(type); - if (ret->value->type != i32 && ret->value->type != i64) return ret; - if (!features.hasAtomics() || oneIn(2)) return ret; + auto* store = ret->dynCast<Store>(); + if (!store) return ret; + if (store->value->type != i32 && store->value->type != i64) return store; + if (!features.hasAtomics() || oneIn(2)) return store; // make it atomic wasm.memory.shared = true; - ret->isAtomic = true; - ret->align = ret->bytes; - return ret; - } + store->isAtomic = true; + store->align = store->bytes; + return store; + } + + Literal makeLiteral(Type type) { + if (type == v128) { + // generate each lane individually for random lane interpretation + switch (upTo(6)) { + case 0: return Literal( + std::array<Literal, 16>{{ + makeLiteral(i32), makeLiteral(i32), makeLiteral(i32), makeLiteral(i32), + makeLiteral(i32), makeLiteral(i32), makeLiteral(i32), makeLiteral(i32), + makeLiteral(i32), makeLiteral(i32), makeLiteral(i32), makeLiteral(i32), + makeLiteral(i32), makeLiteral(i32), makeLiteral(i32), makeLiteral(i32) + }} + ); + case 1: return Literal( + std::array<Literal, 8>{{ + makeLiteral(i32), makeLiteral(i32), makeLiteral(i32), makeLiteral(i32), + makeLiteral(i32), makeLiteral(i32), makeLiteral(i32), makeLiteral(i32) + }} + ); + case 2: return Literal(std::array<Literal, 4>{{makeLiteral(i32), makeLiteral(i32), makeLiteral(i32), makeLiteral(i32)}}); + case 3: 
return Literal(std::array<Literal, 2>{{makeLiteral(i64), makeLiteral(i64)}}); + case 4: return Literal(std::array<Literal, 4>{{makeLiteral(f32), makeLiteral(f32), makeLiteral(f32), makeLiteral(f32)}}); + case 5: return Literal(std::array<Literal, 2>{{makeLiteral(f64), makeLiteral(f64)}}); + default: WASM_UNREACHABLE(); + } + } - Expression* makeConst(Type type) { - Literal value; switch (upTo(4)) { case 0: { // totally random, entire range switch (type) { - case i32: value = Literal(get32()); break; - case i64: value = Literal(get64()); break; - case f32: value = Literal(getFloat()); break; - case f64: value = Literal(getDouble()); break; - case v128: assert(false && "v128 not implemented yet"); + case i32: return Literal(get32()); + case i64: return Literal(get64()); + case f32: return Literal(getFloat()); + case f64: return Literal(getDouble()); + case v128: case none: case unreachable: WASM_UNREACHABLE(); } @@ -1218,11 +1263,11 @@ private: default: WASM_UNREACHABLE(); } switch (type) { - case i32: value = Literal(int32_t(small)); break; - case i64: value = Literal(int64_t(small)); break; - case f32: value = Literal(float(small)); break; - case f64: value = Literal(double(small)); break; - case v128: assert(false && "v128 not implemented yet"); + case i32: return Literal(int32_t(small)); + case i64: return Literal(int64_t(small)); + case f32: return Literal(float(small)); + case f64: return Literal(double(small)); + case v128: case none: case unreachable: WASM_UNREACHABLE(); } @@ -1230,6 +1275,7 @@ private: } case 2: { // special values + Literal value; switch (type) { case i32: value = Literal(pick<int32_t>(0, std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max(), @@ -1260,45 +1306,49 @@ private: std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::max(), std::numeric_limits<uint32_t>::max(), std::numeric_limits<uint64_t>::max())); break; - case v128: assert(false && "v128 not implemented yet"); + case v128: case none: - case 
unreachable: { - WASM_UNREACHABLE(); - } + case unreachable: WASM_UNREACHABLE(); } // tweak around special values if (oneIn(3)) { // +- 1 - value = value.add(LiteralUtils::makeLiteralFromInt32(upTo(3) - 1, type)); + value = value.add(Literal::makeFromInt32(upTo(3) - 1, type)); } if (oneIn(2)) { // flip sign - value = value.mul(LiteralUtils::makeLiteralFromInt32(-1, type)); + value = value.mul(Literal::makeFromInt32(-1, type)); } - break; + return value; } case 3: { // powers of 2 + Literal value; switch (type) { case i32: value = Literal(int32_t(1) << upTo(32)); break; case i64: value = Literal(int64_t(1) << upTo(64)); break; case f32: value = Literal(float(int64_t(1) << upTo(64))); break; case f64: value = Literal(double(int64_t(1) << upTo(64))); break; - case v128: assert(false && "v128 not implemented yet"); + case v128: case none: case unreachable: WASM_UNREACHABLE(); } // maybe negative if (oneIn(2)) { - value = value.mul(LiteralUtils::makeLiteralFromInt32(-1, type)); + value = value.mul(Literal::makeFromInt32(-1, type)); } + return value; } } + WASM_UNREACHABLE(); + } + + Expression* makeConst(Type type) { auto* ret = wasm.allocator.alloc<Const>(); - ret->value = value; - ret->type = value.type; + ret->value = makeLiteral(type); + ret->type = type; return ret; } - Expression* makeUnary(const UnaryArgs& args) { + Expression* buildUnary(const UnaryArgs& args) { return builder.makeUnary(args.a, args.b); } @@ -1312,32 +1362,40 @@ private: } switch (type) { case i32: { - switch (upTo(4)) { - case 0: { + switch (getConcreteType()) { + case i32: { auto op = pick( FeatureOptions<UnaryOp>() .add(FeatureSet::MVP, EqZInt32, ClzInt32, CtzInt32, PopcntInt32) .add(FeatureSet::Atomics, ExtendS8Int32, ExtendS16Int32) ); - return makeUnary({ op, make(i32) }); + return buildUnary({ op, make(i32) }); } - case 1: return makeUnary({ pick(EqZInt64, WrapInt64), make(i64) }); - case 2: { + case i64: return buildUnary({ pick(EqZInt64, WrapInt64), make(i64) }); + case f32: { auto op = 
pick( FeatureOptions<UnaryOp>() .add(FeatureSet::MVP, TruncSFloat32ToInt32, TruncUFloat32ToInt32, ReinterpretFloat32) .add(FeatureSet::TruncSat, TruncSatSFloat32ToInt32, TruncSatUFloat32ToInt32) ); - return makeUnary({ op, make(f32) }); + return buildUnary({ op, make(f32) }); } - case 3: { + case f64: { auto op = pick( FeatureOptions<UnaryOp>() .add(FeatureSet::MVP, TruncSFloat64ToInt32, TruncUFloat64ToInt32) .add(FeatureSet::TruncSat, TruncSatSFloat64ToInt32, TruncSatUFloat64ToInt32) ); - return makeUnary({ op, make(f64) }); + return buildUnary({ op, make(f64) }); + } + case v128: { + assert(features.hasSIMD()); + return buildUnary({ pick(AnyTrueVecI8x16, AllTrueVecI8x16, AnyTrueVecI16x8, AllTrueVecI16x8, + AnyTrueVecI32x4, AllTrueVecI32x4, AnyTrueVecI64x2, AllTrueVecI64x2), + make(v128) }); } + case none: + case unreachable: WASM_UNREACHABLE(); } WASM_UNREACHABLE(); } @@ -1349,16 +1407,16 @@ private: .add(FeatureSet::MVP, ClzInt64, CtzInt64, PopcntInt64) .add(FeatureSet::Atomics, ExtendS8Int64, ExtendS16Int64, ExtendS32Int64) ); - return makeUnary({ op, make(i64) }); + return buildUnary({ op, make(i64) }); } - case 1: return makeUnary({ pick(ExtendSInt32, ExtendUInt32), make(i32) }); + case 1: return buildUnary({ pick(ExtendSInt32, ExtendUInt32), make(i32) }); case 2: { auto op = pick( FeatureOptions<UnaryOp>() .add(FeatureSet::MVP, TruncSFloat32ToInt64, TruncUFloat32ToInt64) .add(FeatureSet::TruncSat, TruncSatSFloat32ToInt64, TruncSatUFloat32ToInt64) ); - return makeUnary({ op, make(f32) }); + return buildUnary({ op, make(f32) }); } case 3: { auto op = pick( @@ -1366,46 +1424,59 @@ private: .add(FeatureSet::MVP, TruncSFloat64ToInt64, TruncUFloat64ToInt64, ReinterpretFloat64) .add(FeatureSet::TruncSat, TruncSatSFloat64ToInt64, TruncSatUFloat64ToInt64) ); - return makeUnary({ op, make(f64) }); + return buildUnary({ op, make(f64) }); } } WASM_UNREACHABLE(); } case f32: { switch (upTo(4)) { - case 0: return makeDeNanOp(makeUnary({ pick(NegFloat32, AbsFloat32, 
CeilFloat32, FloorFloat32, TruncFloat32, NearestFloat32, SqrtFloat32), make(f32) })); - case 1: return makeDeNanOp(makeUnary({ pick(ConvertUInt32ToFloat32, ConvertSInt32ToFloat32, ReinterpretInt32), make(i32) })); - case 2: return makeDeNanOp(makeUnary({ pick(ConvertUInt64ToFloat32, ConvertSInt64ToFloat32), make(i64) })); - case 3: return makeDeNanOp(makeUnary({ DemoteFloat64, make(f64) })); + case 0: return makeDeNanOp(buildUnary({ pick(NegFloat32, AbsFloat32, CeilFloat32, FloorFloat32, TruncFloat32, NearestFloat32, SqrtFloat32), make(f32) })); + case 1: return makeDeNanOp(buildUnary({ pick(ConvertUInt32ToFloat32, ConvertSInt32ToFloat32, ReinterpretInt32), make(i32) })); + case 2: return makeDeNanOp(buildUnary({ pick(ConvertUInt64ToFloat32, ConvertSInt64ToFloat32), make(i64) })); + case 3: return makeDeNanOp(buildUnary({ DemoteFloat64, make(f64) })); } WASM_UNREACHABLE(); } case f64: { switch (upTo(4)) { - case 0: return makeDeNanOp(makeUnary({ pick(NegFloat64, AbsFloat64, CeilFloat64, FloorFloat64, TruncFloat64, NearestFloat64, SqrtFloat64), make(f64) })); - case 1: return makeDeNanOp(makeUnary({ pick(ConvertUInt32ToFloat64, ConvertSInt32ToFloat64), make(i32) })); - case 2: return makeDeNanOp(makeUnary({ pick(ConvertUInt64ToFloat64, ConvertSInt64ToFloat64, ReinterpretInt64), make(i64) })); - case 3: return makeDeNanOp(makeUnary({ PromoteFloat32, make(f32) })); + case 0: return makeDeNanOp(buildUnary({ pick(NegFloat64, AbsFloat64, CeilFloat64, FloorFloat64, TruncFloat64, NearestFloat64, SqrtFloat64), make(f64) })); + case 1: return makeDeNanOp(buildUnary({ pick(ConvertUInt32ToFloat64, ConvertSInt32ToFloat64), make(i32) })); + case 2: return makeDeNanOp(buildUnary({ pick(ConvertUInt64ToFloat64, ConvertSInt64ToFloat64, ReinterpretInt64), make(i64) })); + case 3: return makeDeNanOp(buildUnary({ PromoteFloat32, make(f32) })); } WASM_UNREACHABLE(); } - case v128: assert(false && "v128 not implemented yet"); - case none: - case unreachable: { + case v128: { + 
assert(features.hasSIMD()); + switch (upTo(5)) { + case 0: return buildUnary({ pick(SplatVecI8x16, SplatVecI16x8, SplatVecI32x4), make(i32) }); + case 1: return buildUnary({ SplatVecI64x2, make(i64) }); + case 2: return buildUnary({ SplatVecF32x4, make(f32) }); + case 3: return buildUnary({ SplatVecF64x2, make(f64) }); + case 4: return buildUnary({ + pick(NotVec128, NegVecI8x16, NegVecI16x8, NegVecI32x4, NegVecI64x2, + AbsVecF32x4, NegVecF32x4, SqrtVecF32x4, AbsVecF64x2, NegVecF64x2, SqrtVecF64x2, + TruncSatSVecF32x4ToVecI32x4, TruncSatUVecF32x4ToVecI32x4, TruncSatSVecF64x2ToVecI64x2, TruncSatUVecF64x2ToVecI64x2, + ConvertSVecI32x4ToVecF32x4, ConvertUVecI32x4ToVecF32x4, ConvertSVecI64x2ToVecF64x2, ConvertUVecI64x2ToVecF64x2), + make(v128) }); + } WASM_UNREACHABLE(); } + case none: + case unreachable: WASM_UNREACHABLE(); } WASM_UNREACHABLE(); } - Expression* makeBinary(const BinaryArgs& args) { + Expression* buildBinary(const BinaryArgs& args) { return builder.makeBinary(args.a, args.b, args.c); } Expression* makeBinary(Type type) { if (type == unreachable) { if (auto* binary = makeBinary(getConcreteType())->dynCast<Binary>()) { - return makeDeNanOp(makeBinary({ binary->op, make(unreachable), make(unreachable) })); + return makeDeNanOp(buildBinary({ binary->op, make(unreachable), make(unreachable) })); } // give up return makeTrivial(type); @@ -1413,35 +1484,47 @@ private: switch (type) { case i32: { switch (upTo(4)) { - case 0: return makeBinary({ pick(AddInt32, SubInt32, MulInt32, DivSInt32, DivUInt32, RemSInt32, RemUInt32, AndInt32, OrInt32, XorInt32, ShlInt32, ShrUInt32, ShrSInt32, RotLInt32, RotRInt32, EqInt32, NeInt32, LtSInt32, LtUInt32, LeSInt32, LeUInt32, GtSInt32, GtUInt32, GeSInt32, GeUInt32), make(i32), make(i32) }); - case 1: return makeBinary({ pick(EqInt64, NeInt64, LtSInt64, LtUInt64, LeSInt64, LeUInt64, GtSInt64, GtUInt64, GeSInt64, GeUInt64), make(i64), make(i64) }); - case 2: return makeBinary({ pick(EqFloat32, NeFloat32, LtFloat32, LeFloat32, 
GtFloat32, GeFloat32), make(f32), make(f32) }); - case 3: return makeBinary({ pick(EqFloat64, NeFloat64, LtFloat64, LeFloat64, GtFloat64, GeFloat64), make(f64), make(f64) }); + case 0: return buildBinary({ pick(AddInt32, SubInt32, MulInt32, DivSInt32, DivUInt32, RemSInt32, RemUInt32, AndInt32, OrInt32, XorInt32, ShlInt32, ShrUInt32, ShrSInt32, RotLInt32, RotRInt32, EqInt32, NeInt32, LtSInt32, LtUInt32, LeSInt32, LeUInt32, GtSInt32, GtUInt32, GeSInt32, GeUInt32), make(i32), make(i32) }); + case 1: return buildBinary({ pick(EqInt64, NeInt64, LtSInt64, LtUInt64, LeSInt64, LeUInt64, GtSInt64, GtUInt64, GeSInt64, GeUInt64), make(i64), make(i64) }); + case 2: return buildBinary({ pick(EqFloat32, NeFloat32, LtFloat32, LeFloat32, GtFloat32, GeFloat32), make(f32), make(f32) }); + case 3: return buildBinary({ pick(EqFloat64, NeFloat64, LtFloat64, LeFloat64, GtFloat64, GeFloat64), make(f64), make(f64) }); } WASM_UNREACHABLE(); } case i64: { - return makeBinary({ pick(AddInt64, SubInt64, MulInt64, DivSInt64, DivUInt64, RemSInt64, RemUInt64, AndInt64, OrInt64, XorInt64, ShlInt64, ShrUInt64, ShrSInt64, RotLInt64, RotRInt64), make(i64), make(i64) }); + return buildBinary({ pick(AddInt64, SubInt64, MulInt64, DivSInt64, DivUInt64, RemSInt64, RemUInt64, AndInt64, OrInt64, XorInt64, ShlInt64, ShrUInt64, ShrSInt64, RotLInt64, RotRInt64), make(i64), make(i64) }); } case f32: { - return makeDeNanOp(makeBinary({ pick(AddFloat32, SubFloat32, MulFloat32, DivFloat32, CopySignFloat32, MinFloat32, MaxFloat32), make(f32), make(f32) })); + return makeDeNanOp(buildBinary({ pick(AddFloat32, SubFloat32, MulFloat32, DivFloat32, CopySignFloat32, MinFloat32, MaxFloat32), make(f32), make(f32) })); } case f64: { - return makeDeNanOp(makeBinary({ pick(AddFloat64, SubFloat64, MulFloat64, DivFloat64, CopySignFloat64, MinFloat64, MaxFloat64), make(f64), make(f64) })); + return makeDeNanOp(buildBinary({ pick(AddFloat64, SubFloat64, MulFloat64, DivFloat64, CopySignFloat64, MinFloat64, MaxFloat64), make(f64), 
make(f64) })); + } + case v128: { + assert(features.hasSIMD()); + return buildBinary({ + pick(EqVecI8x16, NeVecI8x16, LtSVecI8x16, LtUVecI8x16, GtSVecI8x16, GtUVecI8x16, LeSVecI8x16, LeUVecI8x16, GeSVecI8x16, GeUVecI8x16, + EqVecI16x8, NeVecI16x8, LtSVecI16x8, LtUVecI16x8, GtSVecI16x8, GtUVecI16x8, LeSVecI16x8, LeUVecI16x8, GeSVecI16x8, GeUVecI16x8, + EqVecI32x4, NeVecI32x4, LtSVecI32x4, LtUVecI32x4, GtSVecI32x4, GtUVecI32x4, LeSVecI32x4, LeUVecI32x4, GeSVecI32x4, GeUVecI32x4, + EqVecF32x4, NeVecF32x4, LtVecF32x4, GtVecF32x4, LeVecF32x4, GeVecF32x4, EqVecF64x2, NeVecF64x2, LtVecF64x2, GtVecF64x2, LeVecF64x2, GeVecF64x2, + AndVec128, OrVec128, XorVec128, AddVecI8x16, AddSatSVecI8x16, AddSatUVecI8x16, SubVecI8x16, SubSatSVecI8x16, SubSatUVecI8x16, MulVecI8x16, + AddVecI16x8, AddSatSVecI16x8, AddSatUVecI16x8, SubVecI16x8, SubSatSVecI16x8, SubSatUVecI16x8, MulVecI16x8, AddVecI32x4, SubVecI32x4, MulVecI32x4, + AddVecI64x2, SubVecI64x2, AddVecF32x4, SubVecF32x4, MulVecF32x4, DivVecF32x4, MinVecF32x4, MaxVecF32x4, + AddVecF64x2, SubVecF64x2, MulVecF64x2, DivVecF64x2, MinVecF64x2, MaxVecF64x2), + make(v128), make(v128) }); } - case v128: assert(false && "v128 not implemented yet"); case none: case unreachable: WASM_UNREACHABLE(); } WASM_UNREACHABLE(); } - Expression* makeSelect(const ThreeArgs& args) { + Expression* buildSelect(const ThreeArgs& args) { return builder.makeSelect(args.a, args.b, args.c); } Expression* makeSelect(Type type) { - return makeDeNanOp(makeSelect({ make(i32), make(type), make(type) })); + return makeDeNanOp(buildSelect({ make(i32), make(type), make(type) })); } Expression* makeSwitch(Type type) { @@ -1493,7 +1576,7 @@ private: } Expression* makeAtomic(Type type) { - if (!features.hasAtomics() || (type != i32 && type != i64)) return makeTrivial(type); + assert(features.hasAtomics()); wasm.memory.shared = true; if (type == i32 && oneIn(2)) { if (ATOMIC_WAITS && oneIn(2)) { @@ -1544,6 +1627,92 @@ private: } } + Expression* makeSIMD(Type type) { + 
assert(features.hasSIMD()); + if (type != v128) { + return makeSIMDExtract(type); + } + switch (upTo(6)) { + case 0: return makeUnary(v128); + case 1: return makeBinary(v128); + case 2: return makeSIMDReplace(); + case 3: return makeSIMDShuffle(); + case 4: return makeSIMDBitselect(); + case 5: return makeSIMDShift(); + } + WASM_UNREACHABLE(); + } + + Expression* makeSIMDExtract(Type type) { + auto op = static_cast<SIMDExtractOp>(0); + switch (type) { + case i32: op = pick(ExtractLaneSVecI8x16, ExtractLaneUVecI8x16, ExtractLaneSVecI16x8, ExtractLaneUVecI16x8, ExtractLaneVecI32x4); break; + case i64: op = ExtractLaneVecI64x2; break; + case f32: op = ExtractLaneVecF32x4; break; + case f64: op = ExtractLaneVecF64x2; break; + case v128: + case none: + case unreachable: WASM_UNREACHABLE(); + } + Expression* vec = make(v128); + uint8_t index = 0; + switch (op) { + case ExtractLaneSVecI8x16: + case ExtractLaneUVecI8x16: index = upTo(16); break; + case ExtractLaneSVecI16x8: + case ExtractLaneUVecI16x8: index = upTo(8); break; + case ExtractLaneVecI32x4: + case ExtractLaneVecF32x4: index = upTo(4); break; + case ExtractLaneVecI64x2: + case ExtractLaneVecF64x2: index = upTo(2); break; + } + return builder.makeSIMDExtract(op, vec, index); + } + + Expression* makeSIMDReplace() { + SIMDReplaceOp op = pick(ReplaceLaneVecI8x16, ReplaceLaneVecI16x8, ReplaceLaneVecI32x4, + ReplaceLaneVecI64x2, ReplaceLaneVecF32x4, ReplaceLaneVecF64x2); + Expression* vec = make(v128); + uint8_t index; + Type lane_t; + switch (op) { + case ReplaceLaneVecI8x16: index = upTo(16); lane_t = i32; break; + case ReplaceLaneVecI16x8: index = upTo(8); lane_t = i32; break; + case ReplaceLaneVecI32x4: index = upTo(4); lane_t = i32; break; + case ReplaceLaneVecI64x2: index = upTo(2); lane_t = i64; break; + case ReplaceLaneVecF32x4: index = upTo(4); lane_t = f32; break; + case ReplaceLaneVecF64x2: index = upTo(2); lane_t = f64; break; + default: WASM_UNREACHABLE(); + } + Expression* value = make(lane_t); + return 
builder.makeSIMDReplace(op, vec, index, value); + } + + Expression* makeSIMDShuffle() { + Expression* left = make(v128); + Expression* right = make(v128); + std::array<uint8_t, 16> mask; + for (size_t i = 0; i < 16; ++i) { + mask[i] = upTo(32); + } + return builder.makeSIMDShuffle(left, right, mask); + } + + Expression* makeSIMDBitselect() { + Expression* left = make(v128); + Expression* right = make(v128); + Expression* cond = make(v128); + return builder.makeSIMDBitselect(left, right, cond); + } + + Expression* makeSIMDShift() { + SIMDShiftOp op = pick(ShlVecI8x16, ShrSVecI8x16, ShrUVecI8x16, ShlVecI16x8, ShrSVecI16x8, ShrUVecI16x8, + ShlVecI32x4, ShrSVecI32x4, ShrUVecI32x4, ShlVecI64x2, ShrSVecI64x2, ShrUVecI64x2); + Expression* vec = make(v128); + Expression* shift = make(i32); + return builder.makeSIMDShift(op, vec, shift); + } + // special makers Expression* makeLogging() { @@ -1554,36 +1723,21 @@ private: // special getters Type getType() { - switch (upTo(6)) { - case 0: return i32; - case 1: return i64; - case 2: return f32; - case 3: return f64; - case 4: return none; - case 5: return unreachable; - } - WASM_UNREACHABLE(); + return pick(FeatureOptions<Type>() + .add(FeatureSet::MVP, i32, i64, f32, f64, none, unreachable) + .add(FeatureSet::SIMD, v128)); } Type getReachableType() { - switch (upTo(5)) { - case 0: return i32; - case 1: return i64; - case 2: return f32; - case 3: return f64; - case 4: return none; - } - WASM_UNREACHABLE(); + return pick(FeatureOptions<Type>() + .add(FeatureSet::MVP, i32, i64, f32, f64, none) + .add(FeatureSet::SIMD, v128)); } Type getConcreteType() { - switch (upTo(4)) { - case 0: return i32; - case 1: return i64; - case 2: return f32; - case 3: return f64; - } - WASM_UNREACHABLE(); + return pick(FeatureOptions<Type>() + .add(FeatureSet::MVP, i32, i64, f32, f64) + .add(FeatureSet::SIMD, v128)); } // statistical distributions diff --git a/src/tools/js-wrapper.h b/src/tools/js-wrapper.h index cb5c0bd5b..7cf2ffc53 100644 --- 
a/src/tools/js-wrapper.h +++ b/src/tools/js-wrapper.h @@ -22,10 +22,15 @@ namespace wasm { static std::string generateJSWrapper(Module& wasm) { + PassRunner runner(&wasm); + runner.add("legalize-js-interface"); + runner.run(); + std::string ret; ret += "if (typeof console === 'undefined') {\n" " console = { log: print };\n" "}\n" + "var tempRet0;\n" "var binary;\n" "if (typeof process === 'object' && typeof require === 'function' /* node.js detection */) {\n" " var args = process.argv.slice(2);\n" @@ -44,7 +49,18 @@ static std::string generateJSWrapper(Module& wasm) { " binary = read(args[0], 'binary');\n" " }\n" "}\n" - "var instance = new WebAssembly.Instance(new WebAssembly.Module(binary), {});\n"; + "var instance = new WebAssembly.Instance(new WebAssembly.Module(binary), {\n" + " 'fuzzing-support': {\n" + " 'log-i32': function(x) { console.log('i32: ' + x) },\n" + " 'log-i64': function(x, y) { console.log('i64: ' + x + ', ' + y) },\n" + " 'log-f32': function(x) { console.log('f32: ' + x) },\n" + " 'log-f64': function(x) { console.log('f64: ' + x) }\n" + " },\n" + " 'env': {\n" + " 'setTempRet0': function(x) { tempRet0 = x },\n" + " 'getTempRet0': function() { return tempRet0 },\n" + " },\n" + "});\n"; for (auto& exp : wasm.exports) { auto* func = wasm.getFunctionOrNull(exp->value); if (!func) continue; // something exported other than a function diff --git a/src/tools/spec-wrapper.h b/src/tools/spec-wrapper.h index d6aa0d87e..a42230fc1 100644 --- a/src/tools/spec-wrapper.h +++ b/src/tools/spec-wrapper.h @@ -34,7 +34,7 @@ static std::string generateSpecWrapper(Module& wasm) { case i64: ret += "(i64.const 0)"; break; case f32: ret += "(f32.const 0)"; break; case f64: ret += "(f64.const 0)"; break; - case v128: assert(false && "v128 not implemented yet"); + case v128: ret += "(v128.const i32 0 0 0 0)"; break; case none: case unreachable: WASM_UNREACHABLE(); } diff --git a/src/tools/wasm-ctor-eval.cpp b/src/tools/wasm-ctor-eval.cpp index b0e2e2ce7..c0730877a 100644 
--- a/src/tools/wasm-ctor-eval.cpp +++ b/src/tools/wasm-ctor-eval.cpp @@ -195,12 +195,12 @@ struct CtorEvalExternalInterface : EvallingModuleInstance::ExternalInterface { // fill in fake values for everything else, which is dangerous to use ModuleUtils::iterDefinedGlobals(wasm_, [&](Global* defined) { if (globals.find(defined->name) == globals.end()) { - globals[defined->name] = LiteralUtils::makeLiteralZero(defined->type); + globals[defined->name] = Literal::makeZero(defined->type); } }); ModuleUtils::iterImportedGlobals(wasm_, [&](Global* import) { if (globals.find(import->name) == globals.end()) { - globals[import->name] = LiteralUtils::makeLiteralZero(import->type); + globals[import->name] = Literal::makeZero(import->type); } }); } @@ -226,7 +226,7 @@ struct CtorEvalExternalInterface : EvallingModuleInstance::ExternalInterface { } else if (segment.offset->is<GetGlobal>()) { start = 0; } else { - WASM_UNREACHABLE(); // wasm spec only allows const and get_global there + WASM_UNREACHABLE(); // wasm spec only allows const and global.get there } auto end = start + segment.data.size(); if (start <= index && index < end) { diff --git a/src/tools/wasm-emscripten-finalize.cpp b/src/tools/wasm-emscripten-finalize.cpp index 6c5682703..3d74138cc 100644 --- a/src/tools/wasm-emscripten-finalize.cpp +++ b/src/tools/wasm-emscripten-finalize.cpp @@ -30,6 +30,7 @@ #include "wasm-io.h" #include "wasm-printing.h" #include "wasm-validator.h" +#include "abi/js.h" using namespace cashew; using namespace wasm; @@ -83,7 +84,7 @@ int main(int argc, const char *argv[]) { .add("--input-source-map", "-ism", "Consume source map from the specified file", Options::Arguments::One, [&inputSourceMapFilename](Options *o, const std::string& argument) { inputSourceMapFilename = argument; }) - .add("--no-legalize-javascript-ffi", "-nj", "Do not legalize (i64->i32, " + .add("--no-legalize-javascript-ffi", "-nj", "Do not fully legalize (i64->i32, " "f32->f64) the imports and exports for interfacing 
with JS", Options::Arguments::Zero, [&legalizeJavaScriptFFI](Options *o, const std::string& ) { @@ -158,13 +159,14 @@ int main(int argc, const char *argv[]) { EmscriptenGlueGenerator generator(wasm); generator.fixInvokeFunctionNames(); - if (legalizeJavaScriptFFI) { - PassRunner passRunner(&wasm); - passRunner.setDebug(options.debug); - passRunner.setDebugInfo(debugInfo); - passRunner.add("legalize-js-interface"); - passRunner.run(); - } + PassRunner passRunner(&wasm); + passRunner.setDebug(options.debug); + passRunner.setDebugInfo(debugInfo); + passRunner.add(ABI::getLegalizationPass( + legalizeJavaScriptFFI ? ABI::LegalizationLevel::Full + : ABI::LegalizationLevel::Minimal + )); + passRunner.run(); std::vector<Name> initializerFunctions; diff --git a/src/tools/wasm-merge.cpp b/src/tools/wasm-merge.cpp index 52f682e16..e9ac6d649 100644 --- a/src/tools/wasm-merge.cpp +++ b/src/tools/wasm-merge.cpp @@ -500,7 +500,7 @@ struct InputMergeable : public ExpressionStackWalker<InputMergeable, Visitor<Inp } private: - // add an offset to a get_global. we look above, and if there is already an add, + // add an offset to a global.get. 
we look above, and if there is already an add, // we can add into it, avoiding creating a new node void addBump(Index bump) { if (expressionStack.size() >= 2) { diff --git a/src/tools/wasm-reduce.cpp b/src/tools/wasm-reduce.cpp index 8c5df7a1b..02174bb8a 100644 --- a/src/tools/wasm-reduce.cpp +++ b/src/tools/wasm-reduce.cpp @@ -839,7 +839,7 @@ struct Reducer : public WalkerPass<PostWalker<Reducer, UnifiedExpressionVisitor< // try to replace with a trivial value Const* c = builder->makeConst(Literal(int32_t(0))); if (tryToReplaceCurrent(c)) return true; - c->value = LiteralUtils::makeLiteralFromInt32(1, curr->type); + c->value = Literal::makeFromInt32(1, curr->type); c->type = curr->type; return tryToReplaceCurrent(c); } diff --git a/src/wasm-binary.h b/src/wasm-binary.h index 45052bcb2..998952bde 100644 --- a/src/wasm-binary.h +++ b/src/wasm-binary.h @@ -330,6 +330,7 @@ enum EncodedType { i64 = -0x2, // 0x7e f32 = -0x3, // 0x7d f64 = -0x4, // 0x7c + v128 = -0x5, // 0x7b // elem_type AnyFunc = -0x10, // 0x70 // func_type form @@ -549,6 +550,7 @@ enum ASTNodes { I64ExtendS32 = 0xc4, TruncSatPrefix = 0xfc, + SIMDPrefix = 0xfd, AtomicPrefix = 0xfe }; @@ -639,6 +641,149 @@ enum TruncSatOpcodes { I64UTruncSatF64 = 0x07, }; +enum SIMDOpcodes { + V128Load = 0x00, + V128Store = 0x01, + V128Const = 0x02, + V8x16Shuffle = 0x03, + I8x16Splat = 0x04, + I8x16ExtractLaneS = 0x05, + I8x16ExtractLaneU = 0x06, + I8x16ReplaceLane = 0x07, + I16x8Splat = 0x08, + I16x8ExtractLaneS = 0x09, + I16x8ExtractLaneU = 0x0a, + I16x8ReplaceLane = 0x0b, + I32x4Splat = 0x0c, + I32x4ExtractLane = 0x0d, + I32x4ReplaceLane = 0x0e, + I64x2Splat = 0x0f, + I64x2ExtractLane = 0x10, + I64x2ReplaceLane = 0x11, + F32x4Splat = 0x12, + F32x4ExtractLane = 0x13, + F32x4ReplaceLane = 0x14, + F64x2Splat = 0x15, + F64x2ExtractLane = 0x16, + F64x2ReplaceLane = 0x17, + I8x16Eq = 0x18, + I8x16Ne = 0x19, + I8x16LtS = 0x1a, + I8x16LtU = 0x1b, + I8x16GtS = 0x1c, + I8x16GtU = 0x1d, + I8x16LeS = 0x1e, + I8x16LeU = 0x1f, + 
I8x16GeS = 0x20, + I8x16GeU = 0x21, + I16x8Eq = 0x22, + I16x8Ne = 0x23, + I16x8LtS = 0x24, + I16x8LtU = 0x25, + I16x8GtS = 0x26, + I16x8GtU = 0x27, + I16x8LeS = 0x28, + I16x8LeU = 0x29, + I16x8GeS = 0x2a, + I16x8GeU = 0x2b, + I32x4Eq = 0x2c, + I32x4Ne = 0x2d, + I32x4LtS = 0x2e, + I32x4LtU = 0x2f, + I32x4GtS = 0x30, + I32x4GtU = 0x31, + I32x4LeS = 0x32, + I32x4LeU = 0x33, + I32x4GeS = 0x34, + I32x4GeU = 0x35, + F32x4Eq = 0x40, + F32x4Ne = 0x41, + F32x4Lt = 0x42, + F32x4Gt = 0x43, + F32x4Le = 0x44, + F32x4Ge = 0x45, + F64x2Eq = 0x46, + F64x2Ne = 0x47, + F64x2Lt = 0x48, + F64x2Gt = 0x49, + F64x2Le = 0x4a, + F64x2Ge = 0x4b, + V128Not = 0x4c, + V128And = 0x4d, + V128Or = 0x4e, + V128Xor = 0x4f, + V128Bitselect = 0x50, + I8x16Neg = 0x51, + I8x16AnyTrue = 0x52, + I8x16AllTrue = 0x53, + I8x16Shl = 0x54, + I8x16ShrS = 0x55, + I8x16ShrU = 0x56, + I8x16Add = 0x57, + I8x16AddSatS = 0x58, + I8x16AddSatU = 0x59, + I8x16Sub = 0x5a, + I8x16SubSatS = 0x5b, + I8x16SubSatU = 0x5c, + I8x16Mul = 0x5d, + I16x8Neg = 0x62, + I16x8AnyTrue = 0x63, + I16x8AllTrue = 0x64, + I16x8Shl = 0x65, + I16x8ShrS = 0x66, + I16x8ShrU = 0x67, + I16x8Add = 0x68, + I16x8AddSatS = 0x69, + I16x8AddSatU = 0x6a, + I16x8Sub = 0x6b, + I16x8SubSatS = 0x6c, + I16x8SubSatU = 0x6d, + I16x8Mul = 0x6e, + I32x4Neg = 0x73, + I32x4AnyTrue = 0x74, + I32x4AllTrue = 0x75, + I32x4Shl = 0x76, + I32x4ShrS = 0x77, + I32x4ShrU = 0x78, + I32x4Add = 0x79, + I32x4Sub = 0x7c, + I32x4Mul = 0x7f, + I64x2Neg = 0x84, + I64x2AnyTrue = 0x85, + I64x2AllTrue = 0x86, + I64x2Shl = 0x87, + I64x2ShrS = 0x88, + I64x2ShrU = 0x89, + I64x2Add = 0x8a, + I64x2Sub = 0x8d, + F32x4Abs = 0x95, + F32x4Neg = 0x96, + F32x4Sqrt = 0x97, + F32x4Add = 0x9a, + F32x4Sub = 0x9b, + F32x4Mul = 0x9c, + F32x4Div = 0x9d, + F32x4Min = 0x9e, + F32x4Max = 0x9f, + F64x2Abs = 0xa0, + F64x2Neg = 0xa1, + F64x2Sqrt = 0xa2, + F64x2Add = 0xa5, + F64x2Sub = 0xa6, + F64x2Mul = 0xa7, + F64x2Div = 0xa8, + F64x2Min = 0xa9, + F64x2Max = 0xaa, + I32x4TruncSatSF32x4 = 0xab, + 
I32x4TruncSatUF32x4 = 0xac, + I64x2TruncSatSF64x2 = 0xad, + I64x2TruncSatUF64x2 = 0xae, + F32x4ConvertSI32x4 = 0xaf, + F32x4ConvertUI32x4 = 0xb0, + F64x2ConvertSI64x2 = 0xb1, + F64x2ConvertUI64x2 = 0xb2 +}; + enum MemoryAccess { Offset = 0x10, // bit 4 Alignment = 0x80, // bit 7 @@ -662,7 +807,7 @@ inline S32LEB binaryType(Type type) { case i64: ret = BinaryConsts::EncodedType::i64; break; case f32: ret = BinaryConsts::EncodedType::f32; break; case f64: ret = BinaryConsts::EncodedType::f64; break; - case v128: assert(false && "v128 not implemented yet"); + case v128: ret = BinaryConsts::EncodedType::v128; break; case unreachable: WASM_UNREACHABLE(); } return S32LEB(ret); @@ -814,9 +959,11 @@ public: uint16_t getInt16(); uint32_t getInt32(); uint64_t getInt64(); + uint8_t getLaneIndex(size_t lanes); // it is unsafe to return a float directly, due to ABI issues with the signalling bit Literal getFloat32Literal(); Literal getFloat64Literal(); + Literal getVec128Literal(); uint32_t getU32LEB(); uint64_t getU64LEB(); int32_t getS32LEB(); @@ -948,6 +1095,7 @@ public: void readMemoryAccess(Address& alignment, Address& offset); bool maybeVisitLoad(Expression*& out, uint8_t code, bool isAtomic); bool maybeVisitStore(Expression*& out, uint8_t code, bool isAtomic); + bool maybeVisitNontrappingTrunc(Expression*& out, uint32_t code); bool maybeVisitAtomicRMW(Expression*& out, uint8_t code); bool maybeVisitAtomicCmpxchg(Expression*& out, uint8_t code); bool maybeVisitAtomicWait(Expression*& out, uint8_t code); @@ -956,6 +1104,16 @@ public: bool maybeVisitUnary(Expression*& out, uint8_t code); bool maybeVisitBinary(Expression*& out, uint8_t code); bool maybeVisitTruncSat(Expression*& out, uint32_t code); + bool maybeVisitSIMDBinary(Expression*& out, uint32_t code); + bool maybeVisitSIMDUnary(Expression*& out, uint32_t code); + bool maybeVisitSIMDConst(Expression*& out, uint32_t code); + bool maybeVisitSIMDLoad(Expression*& out, uint32_t code); + bool 
maybeVisitSIMDStore(Expression*& out, uint32_t code); + bool maybeVisitSIMDExtract(Expression*& out, uint32_t code); + bool maybeVisitSIMDReplace(Expression*& out, uint32_t code); + bool maybeVisitSIMDShuffle(Expression*& out, uint32_t code); + bool maybeVisitSIMDBitselect(Expression*& out, uint32_t code); + bool maybeVisitSIMDShift(Expression*& out, uint32_t code); void visitSelect(Select* curr); void visitReturn(Return* curr); bool maybeVisitHost(Expression*& out, uint8_t code); diff --git a/src/wasm-builder.h b/src/wasm-builder.h index f36ec7a88..f182b1df2 100644 --- a/src/wasm-builder.h +++ b/src/wasm-builder.h @@ -293,6 +293,47 @@ public: ret->finalize(); return ret; } + SIMDExtract* makeSIMDExtract(SIMDExtractOp op, Expression* vec, uint8_t index) { + auto* ret = allocator.alloc<SIMDExtract>(); + ret->op = op; + ret->vec = vec; + ret->index = index; + ret->finalize(); + return ret; + } + SIMDReplace* makeSIMDReplace(SIMDReplaceOp op, Expression* vec, uint8_t index, Expression* value) { + auto* ret = allocator.alloc<SIMDReplace>(); + ret->op = op; + ret->vec = vec; + ret->index = index; + ret->value = value; + ret->finalize(); + return ret; + } + SIMDShuffle* makeSIMDShuffle(Expression* left, Expression* right, const std::array<uint8_t, 16>& mask) { + auto* ret = allocator.alloc<SIMDShuffle>(); + ret->left = left; + ret->right = right; + ret->mask = mask; + ret->finalize(); + return ret; + } + SIMDBitselect* makeSIMDBitselect(Expression* left, Expression* right, Expression* cond) { + auto* ret = allocator.alloc<SIMDBitselect>(); + ret->left = left; + ret->right = right; + ret->cond = cond; + ret->finalize(); + return ret; + } + SIMDShift* makeSIMDShift(SIMDShiftOp op, Expression* vec, Expression* shift) { + auto* ret = allocator.alloc<SIMDShift>(); + ret->op = op; + ret->vec = vec; + ret->shift = shift; + ret->finalize(); + return ret; + } Const* makeConst(Literal value) { assert(isConcreteType(value.type)); auto* ret = allocator.alloc<Const>(); @@ -474,7 
+515,12 @@ public: case i64: value = Literal(int64_t(0)); break; case f32: value = Literal(float(0)); break; case f64: value = Literal(double(0)); break; - case v128: assert(false && "v128 not implemented yet"); + case v128: { + std::array<uint8_t, 16> bytes; + bytes.fill(0); + value = Literal(bytes.data()); + break; + } case none: return ExpressionManipulator::nop(curr); case unreachable: return ExpressionManipulator::convert<T, Unreachable>(curr); } diff --git a/src/wasm-emscripten.h b/src/wasm-emscripten.h index a5de5a128..2b626a7c9 100644 --- a/src/wasm-emscripten.h +++ b/src/wasm-emscripten.h @@ -42,7 +42,7 @@ public: // signature in the indirect function table. void generateDynCallThunks(); - // Convert stack pointer access from get_global/set_global to calling save + // Convert stack pointer access from global.get/global.set to calling save // and restore functions. void replaceStackPointerGlobal(); diff --git a/src/wasm-interpreter.h b/src/wasm-interpreter.h index 8554daded..c5a08cc9b 100644 --- a/src/wasm-interpreter.h +++ b/src/wasm-interpreter.h @@ -303,7 +303,39 @@ public: case PromoteFloat32: return value.extendToF64(); case ReinterpretFloat64: return value.castToI64(); case DemoteFloat64: return value.demote(); - + case SplatVecI8x16: return value.splatI8x16(); + case SplatVecI16x8: return value.splatI16x8(); + case SplatVecI32x4: return value.splatI32x4(); + case SplatVecI64x2: return value.splatI64x2(); + case SplatVecF32x4: return value.splatF32x4(); + case SplatVecF64x2: return value.splatF64x2(); + case NotVec128: return value.notV128(); + case NegVecI8x16: return value.negI8x16(); + case AnyTrueVecI8x16: return value.anyTrueI8x16(); + case AllTrueVecI8x16: return value.allTrueI8x16(); + case NegVecI16x8: return value.negI16x8(); + case AnyTrueVecI16x8: return value.anyTrueI16x8(); + case AllTrueVecI16x8: return value.allTrueI16x8(); + case NegVecI32x4: return value.negI32x4(); + case AnyTrueVecI32x4: return value.anyTrueI32x4(); + case 
AllTrueVecI32x4: return value.allTrueI32x4(); + case NegVecI64x2: return value.negI64x2(); + case AnyTrueVecI64x2: return value.anyTrueI64x2(); + case AllTrueVecI64x2: return value.allTrueI64x2(); + case AbsVecF32x4: return value.absF32x4(); + case NegVecF32x4: return value.negF32x4(); + case SqrtVecF32x4: return value.sqrtF32x4(); + case AbsVecF64x2: return value.absF64x2(); + case NegVecF64x2: return value.negF64x2(); + case SqrtVecF64x2: return value.sqrtF64x2(); + case TruncSatSVecF32x4ToVecI32x4: return value.truncSatToSI32x4(); + case TruncSatUVecF32x4ToVecI32x4: return value.truncSatToUI32x4(); + case TruncSatSVecF64x2ToVecI64x2: return value.truncSatToSI64x2(); + case TruncSatUVecF64x2ToVecI64x2: return value.truncSatToUI64x2(); + case ConvertSVecI32x4ToVecF32x4: return value.convertSToF32x4(); + case ConvertUVecI32x4ToVecF32x4: return value.convertUToF32x4(); + case ConvertSVecI64x2ToVecF64x2: return value.convertSToF64x2(); + case ConvertUVecI64x2ToVecF64x2: return value.convertUToF64x2(); case InvalidUnary: WASM_UNREACHABLE(); } WASM_UNREACHABLE(); @@ -427,10 +459,172 @@ public: case MaxFloat32: case MaxFloat64: return left.max(right); + case EqVecI8x16: return left.eqI8x16(right); + case NeVecI8x16: return left.neI8x16(right); + case LtSVecI8x16: return left.ltSI8x16(right); + case LtUVecI8x16: return left.ltUI8x16(right); + case GtSVecI8x16: return left.gtSI8x16(right); + case GtUVecI8x16: return left.gtUI8x16(right); + case LeSVecI8x16: return left.leSI8x16(right); + case LeUVecI8x16: return left.leUI8x16(right); + case GeSVecI8x16: return left.geSI8x16(right); + case GeUVecI8x16: return left.geUI8x16(right); + case EqVecI16x8: return left.eqI16x8(right); + case NeVecI16x8: return left.neI16x8(right); + case LtSVecI16x8: return left.ltSI16x8(right); + case LtUVecI16x8: return left.ltUI16x8(right); + case GtSVecI16x8: return left.gtSI16x8(right); + case GtUVecI16x8: return left.gtUI16x8(right); + case LeSVecI16x8: return left.leSI16x8(right); + case 
LeUVecI16x8: return left.leUI16x8(right); + case GeSVecI16x8: return left.geSI16x8(right); + case GeUVecI16x8: return left.geUI16x8(right); + case EqVecI32x4: return left.eqI32x4(right); + case NeVecI32x4: return left.neI32x4(right); + case LtSVecI32x4: return left.ltSI32x4(right); + case LtUVecI32x4: return left.ltUI32x4(right); + case GtSVecI32x4: return left.gtSI32x4(right); + case GtUVecI32x4: return left.gtUI32x4(right); + case LeSVecI32x4: return left.leSI32x4(right); + case LeUVecI32x4: return left.leUI32x4(right); + case GeSVecI32x4: return left.geSI32x4(right); + case GeUVecI32x4: return left.geUI32x4(right); + case EqVecF32x4: return left.eqF32x4(right); + case NeVecF32x4: return left.neF32x4(right); + case LtVecF32x4: return left.ltF32x4(right); + case GtVecF32x4: return left.gtF32x4(right); + case LeVecF32x4: return left.leF32x4(right); + case GeVecF32x4: return left.geF32x4(right); + case EqVecF64x2: return left.eqF64x2(right); + case NeVecF64x2: return left.neF64x2(right); + case LtVecF64x2: return left.ltF64x2(right); + case GtVecF64x2: return left.gtF64x2(right); + case LeVecF64x2: return left.leF64x2(right); + case GeVecF64x2: return left.geF64x2(right); + + case AndVec128: return left.andV128(right); + case OrVec128: return left.orV128(right); + case XorVec128: return left.xorV128(right); + + case AddVecI8x16: return left.addI8x16(right); + case AddSatSVecI8x16: return left.addSaturateSI8x16(right); + case AddSatUVecI8x16: return left.addSaturateUI8x16(right); + case SubVecI8x16: return left.subI8x16(right); + case SubSatSVecI8x16: return left.subSaturateSI8x16(right); + case SubSatUVecI8x16: return left.subSaturateUI8x16(right); + case MulVecI8x16: return left.mulI8x16(right); + case AddVecI16x8: return left.addI16x8(right); + case AddSatSVecI16x8: return left.addSaturateSI16x8(right); + case AddSatUVecI16x8: return left.addSaturateUI16x8(right); + case SubVecI16x8: return left.subI16x8(right); + case SubSatSVecI16x8: return 
left.subSaturateSI16x8(right); + case SubSatUVecI16x8: return left.subSaturateUI16x8(right); + case MulVecI16x8: return left.mulI16x8(right); + case AddVecI32x4: return left.addI32x4(right); + case SubVecI32x4: return left.subI32x4(right); + case MulVecI32x4: return left.mulI32x4(right); + case AddVecI64x2: return left.addI64x2(right); + case SubVecI64x2: return left.subI64x2(right); + + case AddVecF32x4: return left.addF32x4(right); + case SubVecF32x4: return left.subF32x4(right); + case MulVecF32x4: return left.mulF32x4(right); + case DivVecF32x4: return left.divF32x4(right); + case MinVecF32x4: return left.minF32x4(right); + case MaxVecF32x4: return left.maxF32x4(right); + case AddVecF64x2: return left.addF64x2(right); + case SubVecF64x2: return left.subF64x2(right); + case MulVecF64x2: return left.mulF64x2(right); + case DivVecF64x2: return left.divF64x2(right); + case MinVecF64x2: return left.minF64x2(right); + case MaxVecF64x2: return left.maxF64x2(right); + case InvalidBinary: WASM_UNREACHABLE(); } WASM_UNREACHABLE(); } + Flow visitSIMDExtract(SIMDExtract *curr) { + NOTE_ENTER("SIMDExtract"); + Flow flow = this->visit(curr->vec); + if (flow.breaking()) return flow; + Literal vec = flow.value; + switch (curr->op) { + case ExtractLaneSVecI8x16: return vec.extractLaneSI8x16(curr->index); + case ExtractLaneUVecI8x16: return vec.extractLaneUI8x16(curr->index); + case ExtractLaneSVecI16x8: return vec.extractLaneSI16x8(curr->index); + case ExtractLaneUVecI16x8: return vec.extractLaneUI16x8(curr->index); + case ExtractLaneVecI32x4: return vec.extractLaneI32x4(curr->index); + case ExtractLaneVecI64x2: return vec.extractLaneI64x2(curr->index); + case ExtractLaneVecF32x4: return vec.extractLaneF32x4(curr->index); + case ExtractLaneVecF64x2: return vec.extractLaneF64x2(curr->index); + } + WASM_UNREACHABLE(); + } + Flow visitSIMDReplace(SIMDReplace *curr) { + NOTE_ENTER("SIMDReplace"); + Flow flow = this->visit(curr->vec); + if (flow.breaking()) return flow; + Literal 
vec = flow.value; + flow = this->visit(curr->value); + if (flow.breaking()) return flow; + Literal value = flow.value; + switch (curr->op) { + case ReplaceLaneVecI8x16: return vec.replaceLaneI8x16(value, curr->index); + case ReplaceLaneVecI16x8: return vec.replaceLaneI16x8(value, curr->index); + case ReplaceLaneVecI32x4: return vec.replaceLaneI32x4(value, curr->index); + case ReplaceLaneVecI64x2: return vec.replaceLaneI64x2(value, curr->index); + case ReplaceLaneVecF32x4: return vec.replaceLaneF32x4(value, curr->index); + case ReplaceLaneVecF64x2: return vec.replaceLaneF64x2(value, curr->index); + } + WASM_UNREACHABLE(); + } + Flow visitSIMDShuffle(SIMDShuffle *curr) { + NOTE_ENTER("SIMDShuffle"); + Flow flow = this->visit(curr->left); + if (flow.breaking()) return flow; + Literal left = flow.value; + flow = this->visit(curr->right); + if (flow.breaking()) return flow; + Literal right = flow.value; + return left.shuffleV8x16(right, curr->mask); + } + Flow visitSIMDBitselect(SIMDBitselect *curr) { + NOTE_ENTER("SIMDBitselect"); + Flow flow = this->visit(curr->left); + if (flow.breaking()) return flow; + Literal left = flow.value; + flow = this->visit(curr->right); + if (flow.breaking()) return flow; + Literal right = flow.value; + flow = this->visit(curr->cond); + if (flow.breaking()) return flow; + Literal cond = flow.value; + return cond.bitselectV128(left, right); + } + Flow visitSIMDShift(SIMDShift *curr) { + NOTE_ENTER("SIMDShift"); + Flow flow = this->visit(curr->vec); + if (flow.breaking()) return flow; + Literal vec = flow.value; + flow = this->visit(curr->shift); + if (flow.breaking()) return flow; + Literal shift = flow.value; + switch (curr->op) { + case ShlVecI8x16: return vec.shlI8x16(shift); + case ShrSVecI8x16: return vec.shrSI8x16(shift); + case ShrUVecI8x16: return vec.shrUI8x16(shift); + case ShlVecI16x8: return vec.shlI16x8(shift); + case ShrSVecI16x8: return vec.shrSI16x8(shift); + case ShrUVecI16x8: return vec.shrUI16x8(shift); + case 
ShlVecI32x4: return vec.shlI32x4(shift); + case ShrSVecI32x4: return vec.shrSI32x4(shift); + case ShrUVecI32x4: return vec.shrUI32x4(shift); + case ShlVecI64x2: return vec.shlI64x2(shift); + case ShrSVecI64x2: return vec.shrSI64x2(shift); + case ShrUVecI64x2: return vec.shrUI64x2(shift); + } + WASM_UNREACHABLE(); + } Flow visitSelect(Select *curr) { NOTE_ENTER("Select"); Flow ifTrue = visit(curr->ifTrue); @@ -586,7 +780,7 @@ public: } case f32: return Literal(load32u(addr)).castToF32(); case f64: return Literal(load64u(addr)).castToF64(); - case v128: assert(false && "v128 not implemented yet"); + case v128: return Literal(load128(addr).data()); case none: case unreachable: WASM_UNREACHABLE(); } @@ -616,7 +810,7 @@ public: // write floats carefully, ensuring all bits reach memory case f32: store32(addr, value.reinterpreti32()); break; case f64: store64(addr, value.reinterpreti64()); break; - case v128: assert(false && "v128 not implemented yet"); + case v128: store128(addr, value.getv128()); break; case none: case unreachable: WASM_UNREACHABLE(); } @@ -630,11 +824,13 @@ public: virtual uint32_t load32u(Address addr) { WASM_UNREACHABLE(); } virtual int64_t load64s(Address addr) { WASM_UNREACHABLE(); } virtual uint64_t load64u(Address addr) { WASM_UNREACHABLE(); } + virtual std::array<uint8_t, 16> load128(Address addr) { WASM_UNREACHABLE(); } virtual void store8(Address addr, int8_t value) { WASM_UNREACHABLE(); } virtual void store16(Address addr, int16_t value) { WASM_UNREACHABLE(); } virtual void store32(Address addr, int32_t value) { WASM_UNREACHABLE(); } virtual void store64(Address addr, int64_t value) { WASM_UNREACHABLE(); } + virtual void store128(Address addr, const std::array<uint8_t, 16>&) { WASM_UNREACHABLE(); } }; SubType* self() { diff --git a/src/wasm-js.cpp b/src/wasm-js.cpp deleted file mode 100644 index a6e751dcc..000000000 --- a/src/wasm-js.cpp +++ /dev/null @@ -1,546 +0,0 @@ -/* - * Copyright 2015 WebAssembly Community Group participants - * - * 
Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// -// WebAssembly intepreter for asm2wasm output, in a js environment. -// -// Receives asm.js, generates a runnable module that executes the code in a WebAssembly -// interpreter. This is suitable as a polyfill for WebAssembly support in browsers. -// - -#include <emscripten.h> - -#include "asm2wasm.h" -#include "wasm-interpreter.h" -#include "wasm-s-parser.h" -#include "wasm-binary.h" -#include "wasm-printing.h" -#include "ir/module-utils.h" - -using namespace cashew; -using namespace wasm; - -namespace wasm { -int debug = 0; -} - -// global singletons -Asm2WasmBuilder* asm2wasm = nullptr; -SExpressionParser* sExpressionParser = nullptr; -SExpressionWasmBuilder* sExpressionWasmBuilder = nullptr; -ModuleInstance* instance = nullptr; -Module* module = nullptr; -bool wasmJSDebug = false; - -static void prepare2wasm() { - assert(asm2wasm == nullptr && sExpressionParser == nullptr && sExpressionWasmBuilder == nullptr && instance == nullptr); // singletons -#if WASM_JS_DEBUG - wasmJSDebug = 1; -#else - wasmJSDebug = EM_ASM_INT_V({ return !!Module['outside']['WASM_JS_DEBUG'] }); // Set WASM_JS_DEBUG on the outside Module to get debugging -#endif -} - -// receives asm.js code, parses into wasm. -// note: this modifies the input. 
-extern "C" void EMSCRIPTEN_KEEPALIVE load_asm2wasm(char *input) { - prepare2wasm(); - - Asm2WasmPreProcessor pre; - pre.debugInfo = true; // FIXME: we must do this, as the input asm.js might have debug info - input = pre.process(input); - - // proceed to parse and wasmify - if (wasmJSDebug) std::cerr << "asm parsing...\n"; - - cashew::Parser<Ref, DotZeroValueBuilder> builder; - Ref asmjs = builder.parseToplevel(input); - - module = new Module(); - uint32_t providedMemory = EM_ASM_INT_V({ - return Module['providedTotalMemory']; // we receive the size of memory from emscripten - }); - if (providedMemory & ~Memory::kPageMask) { - std::cerr << "Error: provided memory is not a multiple of the 64k wasm page size\n"; - exit(EXIT_FAILURE); - } - module->memory.initial = Address(providedMemory / Memory::kPageSize); - module->memory.max = pre.memoryGrowth ? Address(Memory::kUnlimitedSize) : module->memory.initial; - - if (wasmJSDebug) std::cerr << "wasming...\n"; - asm2wasm = new Asm2WasmBuilder(*module, pre, debug, TrapMode::JS, PassOptions(), true /* runJSFFIPass */, false /* TODO: support optimizing? */, false /* TODO: support asm2wasm-i64? */); - asm2wasm->processAsm(asmjs); -} - -void finalizeModule() { - uint32_t providedMemory = EM_ASM_INT_V({ - return Module['providedTotalMemory']; // we receive the size of memory from emscripten - }); - if (providedMemory & ~Memory::kPageMask) { - std::cerr << "Error: provided memory is not a multiple of the 64k wasm page size\n"; - exit(EXIT_FAILURE); - } - module->memory.initial = Address(providedMemory / Memory::kPageSize); - module->memory.max = module->getExportOrNull(GROW_WASM_MEMORY) ? 
Address(Memory::kUnlimitedSize) : module->memory.initial; - - // global mapping is done in js in post.js -} - -// loads wasm code in s-expression format -extern "C" void EMSCRIPTEN_KEEPALIVE load_s_expr2wasm(char *input) { - prepare2wasm(); - - if (wasmJSDebug) std::cerr << "wasm-s-expression parsing...\n"; - - sExpressionParser = new SExpressionParser(input); - Element& root = *sExpressionParser->root; - if (wasmJSDebug) std::cout << root << '\n'; - - if (wasmJSDebug) std::cerr << "wasming...\n"; - - module = new Module(); - // A .wast may have multiple modules, with some asserts after them, but we just read the first here. - sExpressionWasmBuilder = new SExpressionWasmBuilder(*module, *root[0]); - - finalizeModule(); -} - -// loads wasm code in binary format -extern "C" void EMSCRIPTEN_KEEPALIVE load_binary2wasm(char *raw, int32_t size) { - prepare2wasm(); - - if (wasmJSDebug) std::cerr << "wasm-binary parsing...\n"; - - module = new Module(); - std::vector<char> input; - input.resize(size); - for (int32_t i = 0; i < size; i++) { - input[i] = raw[i]; - } - WasmBinaryBuilder parser(*module, input, debug); - parser.read(); - - finalizeModule(); -} - -// instantiates the loaded wasm (which might be from asm2wasm, or -// s-expressions, or something else) with a JS external interface. 
-extern "C" void EMSCRIPTEN_KEEPALIVE instantiate() { - if (wasmJSDebug) std::cerr << "instantiating module: \n" << module << '\n'; - - if (wasmJSDebug) std::cerr << "generating exports...\n"; - - EM_ASM({ - Module['asmExports'] = {}; - }); - for (auto& curr : module->exports) { - if (curr->kind == ExternalKind::Function) { - EM_ASM_({ - var name = Pointer_stringify($0); - Module['asmExports'][name] = function() { - Module['tempArguments'] = Array.prototype.slice.call(arguments); - Module['_call_from_js']($0); - return Module['tempReturn']; - }; - }, curr->name.str); - } - } - - auto verifyImportIsProvided = [&](Importable* import) { - EM_ASM_({ - var mod = Pointer_stringify($0); - var base = Pointer_stringify($1); - assert(Module['lookupImport'](mod, base) !== undefined, 'checking import ' + mod + '.' + base); - }, import->module.str, import->base.str); - }; - ModuleUtils::iterImportedFunctions(*module, verifyImportIsProvided); - ModuleUtils::iterImportedGlobals(*module, verifyImportIsProvided); - - if (wasmJSDebug) std::cerr << "creating instance...\n"; - - struct JSExternalInterface : ModuleInstance::ExternalInterface { - Module* module = nullptr; - - void init(Module& wasm, ModuleInstance& instance) override { - module = &wasm; - // look for imported memory - if (wasm.memory.imported()) { - EM_ASM({ - Module['asmExports']['memory'] = Module['lookupImport']('env', 'memory'); - }); - } else { - // no memory import; create a new buffer here, just like native wasm support would. 
- EM_ASM_({ - Module['asmExports']['memory'] = Module['outside']['newBuffer'] = new ArrayBuffer($0); - }, wasm.memory.initial * Memory::kPageSize); - } - for (auto segment : wasm.memory.segments) { - EM_ASM_({ - var source = Module['HEAP8'].subarray($1, $1 + $2); - var target = new Int8Array(Module['asmExports']['memory']); - target.set(source, $0); - }, ConstantExpressionRunner<TrivialGlobalManager>(instance.globals).visit(segment.offset).value.geti32(), &segment.data[0], segment.data.size()); - } - // look for imported table - if (wasm.table.imported()) { - EM_ASM({ - Module['outside']['wasmTable'] = Module['lookupImport']('env', 'table'); - }); - } else { - // no table import; create a new one here, just like native wasm support would. - EM_ASM_({ - Module['outside']['wasmTable'] = new Array($0); - }, wasm.table.initial); - } - EM_ASM({ - Module['asmExports']['table'] = Module['outside']['wasmTable']; - }); - // Emulated table support is in a JS array. If the entry is a number, it's a function pointer. If not, it's a JS method to be called directly - // TODO: make them all JS methods, wrapping a dynCall where necessary? 
- for (auto segment : wasm.table.segments) { - Address offset = ConstantExpressionRunner<TrivialGlobalManager>(instance.globals).visit(segment.offset).value.geti32(); - assert(offset + segment.data.size() <= wasm.table.initial); - for (size_t i = 0; i != segment.data.size(); ++i) { - Name name = segment.data[i]; - auto* func = wasm.getFunction(name); - if (!func->imported()) { - EM_ASM_({ - Module['outside']['wasmTable'][$0] = $1; - }, offset + i, func); - } else { - EM_ASM_({ - Module['outside']['wasmTable'][$0] = Module['lookupImport'](Pointer_stringify($1), Pointer_stringify($2)); - }, offset + i, func->module.str, func->base.str); - } - } - } - } - - void prepareTempArgments(LiteralList& arguments) { - EM_ASM({ - Module['tempArguments'] = []; - }); - for (auto& argument : arguments) { - if (argument.type == i32) { - EM_ASM_({ Module['tempArguments'].push($0) }, argument.geti32()); - } else if (argument.type == f32) { - EM_ASM_({ Module['tempArguments'].push($0) }, argument.getf32()); - } else if (argument.type == f64) { - EM_ASM_({ Module['tempArguments'].push($0) }, argument.getf64()); - } else { - abort(); - } - } - } - - Literal getResultFromJS(double ret, Type type) { - switch (type) { - case none: return Literal(); - case i32: return Literal((int32_t)ret); - case i64: WASM_UNREACHABLE(); - case f32: return Literal((float)ret); - case f64: return Literal((double)ret); - case v128: assert(false && "v128 not implemented yet"); - case unreachable: WASM_UNREACHABLE(); - } - WASM_UNREACHABLE(); - } - - void importGlobals(std::map<Name, Literal>& globals, Module& wasm) override { - ModuleUtils::iterImportedGlobals(wasm, [&](Global* import) { - double ret = EM_ASM_DOUBLE({ - var mod = Pointer_stringify($0); - var base = Pointer_stringify($1); - var lookup = Module['lookupImport'](mod, base); - return lookup; - }, import->module.str, import->base.str); - - if (wasmJSDebug) std::cout << "calling importGlobal for " << import->name << " returning " << ret << '\n'; - - 
globals[import->name] = getResultFromJS(ret, import->type); - }); - } - - Literal callImport(Function *import, LiteralList& arguments) override { - if (wasmJSDebug) std::cout << "calling import " << import->name.str << '\n'; - prepareTempArgments(arguments); - double ret = EM_ASM_DOUBLE({ - var mod = Pointer_stringify($0); - var base = Pointer_stringify($1); - var tempArguments = Module['tempArguments']; - Module['tempArguments'] = null; - var lookup = Module['lookupImport'](mod, base); - return lookup.apply(null, tempArguments); - }, import->module.str, import->base.str); - - if (wasmJSDebug) std::cout << "calling import returning " << ret << " and function type is " << module->getFunctionType(import->type)->result << '\n'; - - return getResultFromJS(ret, module->getFunctionType(import->type)->result); - } - - Literal callTable(Index index, LiteralList& arguments, Type result, ModuleInstance& instance) override { - void* ptr = (void*)EM_ASM_INT({ - var value = Module['outside']['wasmTable'][$0]; - return typeof value === "number" ? 
value : -1; - }, index); - if (ptr == nullptr) trap("callTable overflow"); - if (ptr != (void*)-1) { - // a Function we can call - Function* func = (Function*)ptr; - if (func->params.size() != arguments.size()) trap("callIndirect: bad # of arguments"); - for (size_t i = 0; i < func->params.size(); i++) { - if (func->params[i] != arguments[i].type) { - trap("callIndirect: bad argument type"); - } - } - return instance.callFunctionInternal(func->name, arguments); - } else { - // A JS function JS can call - prepareTempArgments(arguments); - double ret = EM_ASM_DOUBLE({ - var func = Module['outside']['wasmTable'][$0]; - var tempArguments = Module['tempArguments']; - Module['tempArguments'] = null; - return func.apply(null, tempArguments); - }, index); - return getResultFromJS(ret, result); - } - } - - Literal load(Load* load, Address address) override { - uint32_t addr = address; - if (load->align < load->bytes || (addr & (load->bytes-1))) { - int64_t out64; - double ret = EM_ASM_DOUBLE({ - var addr = $0; - var bytes = $1; - var isFloat = $2; - var isSigned = $3; - var out64 = $4; - var save0 = HEAP32[0]; - var save1 = HEAP32[1]; - for (var i = 0; i < bytes; i++) { - HEAPU8[i] = Module["info"].parent["HEAPU8"][addr + i]; - } - var ret; - if (!isFloat) { - if (bytes === 1) ret = isSigned ? HEAP8[0] : HEAPU8[0]; - else if (bytes === 2) ret = isSigned ? HEAP16[0] : HEAPU16[0]; - else if (bytes === 4) ret = isSigned ? 
HEAP32[0] : HEAPU32[0]; - else if (bytes === 8) { - for (var i = 0; i < bytes; i++) { - HEAPU8[out64 + i] = HEAPU8[i]; - } - } else abort(); - } else { - if (bytes === 4) ret = HEAPF32[0]; - else if (bytes === 8) ret = HEAPF64[0]; - else abort(); - } - HEAP32[0] = save0; HEAP32[1] = save1; - return ret; - }, (uint32_t)addr, load->bytes, isFloatType(load->type), load->signed_, &out64); - if (!isFloatType(load->type)) { - if (load->type == i64) { - if (load->bytes == 8) { - return Literal(out64); - } else { - if (load->signed_) { - return Literal(int64_t(int32_t(ret))); - } else { - return Literal(int64_t(uint32_t(ret))); - } - } - } - return Literal((int32_t)ret); - } else if (load->bytes == 4) { - return Literal((float)ret); - } else if (load->bytes == 8) { - return Literal((double)ret); - } - abort(); - } - // nicely aligned - if (!isFloatType(load->type)) { - int64_t ret; - if (load->bytes == 1) { - if (load->signed_) { - ret = EM_ASM_INT({ return Module['info'].parent['HEAP8'][$0] }, addr); - } else { - ret = EM_ASM_INT({ return Module['info'].parent['HEAPU8'][$0] }, addr); - } - } else if (load->bytes == 2) { - if (load->signed_) { - ret = EM_ASM_INT({ return Module['info'].parent['HEAP16'][$0 >> 1] }, addr); - } else { - ret = EM_ASM_INT({ return Module['info'].parent['HEAPU16'][$0 >> 1] }, addr); - } - } else if (load->bytes == 4) { - if (load->signed_) { - ret = EM_ASM_INT({ return Module['info'].parent['HEAP32'][$0 >> 2] }, addr); - } else { - ret = uint32_t(EM_ASM_INT({ return Module['info'].parent['HEAPU32'][$0 >> 2] }, addr)); - } - } else if (load->bytes == 8) { - uint32_t low = EM_ASM_INT({ return Module['info'].parent['HEAP32'][$0 >> 2] }, addr); - uint32_t high = EM_ASM_INT({ return Module['info'].parent['HEAP32'][$0 >> 2] }, addr + 4); - ret = uint64_t(low) | (uint64_t(high) << 32); - } else abort(); - return load->type == i32 ? 
Literal(int32_t(ret)) : Literal(ret); - } else { - if (load->bytes == 4) { - return Literal((float)EM_ASM_DOUBLE({ return Module['info'].parent['HEAPF32'][$0 >> 2] }, addr)); - } else if (load->bytes == 8) { - return Literal(EM_ASM_DOUBLE({ return Module['info'].parent['HEAPF64'][$0 >> 3] }, addr)); - } - abort(); - } - } - - void store(Store* store_, Address address, Literal value) override { - uint32_t addr = address; - // support int64 stores - if (value.type == Type::i64 && store_->bytes == 8) { - Store fake = *store_; - fake.bytes = 4; - fake.type = i32; - uint64_t v = value.geti64(); - store(&fake, addr, Literal(uint32_t(v))); - v >>= 32; - store(&fake, addr + 4, Literal(uint32_t(v))); - return; - } - // normal non-int64 value - if (store_->align < store_->bytes || (addr & (store_->bytes-1))) { - EM_ASM_DOUBLE({ - var addr = $0; - var bytes = $1; - var isFloat = $2; - var value = $3; - var save0 = HEAP32[0]; - var save1 = HEAP32[1]; - if (!isFloat) { - if (bytes === 1) HEAPU8[0] = value; - else if (bytes === 2) HEAPU16[0] = value; - else if (bytes === 4) HEAPU32[0] = value; - else abort(); - } else { - if (bytes === 4) HEAPF32[0] = value; - else if (bytes === 8) HEAPF64[0] = value; - else abort(); - } - for (var i = 0; i < bytes; i++) { - Module["info"].parent["HEAPU8"][addr + i] = HEAPU8[i]; - } - HEAP32[0] = save0; HEAP32[1] = save1; - }, (uint32_t)addr, store_->bytes, isFloatType(store_->valueType), isFloatType(store_->valueType) ? 
value.getFloat() : (double)value.getInteger()); - return; - } - // nicely aligned - if (!isFloatType(store_->valueType)) { - if (store_->bytes == 1) { - EM_ASM_INT({ Module['info'].parent['HEAP8'][$0] = $1 }, addr, (uint32_t)value.getInteger()); - } else if (store_->bytes == 2) { - EM_ASM_INT({ Module['info'].parent['HEAP16'][$0 >> 1] = $1 }, addr, (uint32_t)value.getInteger()); - } else if (store_->bytes == 4) { - EM_ASM_INT({ Module['info'].parent['HEAP32'][$0 >> 2] = $1 }, addr, (uint32_t)value.getInteger()); - } else { - abort(); - } - } else { - if (store_->bytes == 4) { - EM_ASM_DOUBLE({ Module['info'].parent['HEAPF32'][$0 >> 2] = $1 }, addr, value.getf32()); - } else if (store_->bytes == 8) { - EM_ASM_DOUBLE({ Module['info'].parent['HEAPF64'][$0 >> 3] = $1 }, addr, value.getf64()); - } else { - abort(); - } - } - } - - void growMemory(Address oldSize, Address newSize) override { - EM_ASM_({ - var size = $0; - var buffer; - try { - buffer = new ArrayBuffer(size); - } catch(e) { - // fail to grow memory. post.js notices this since the buffer is unchanged - return; - } - var oldHEAP8 = Module['outside']['HEAP8']; - var temp = new Int8Array(buffer); - temp.set(oldHEAP8); - Module['outside']['buffer'] = buffer; - }, (uint32_t)newSize); - } - - void trap(const char* why) override { - EM_ASM_({ - abort("wasm trap: " + Pointer_stringify($0)); - }, why); - } - }; - - instance = new ModuleInstance(*module, new JSExternalInterface()); - - // stack trace hooks - EM_ASM({ - Module['outside']['extraStackTrace'] = function() { - return Pointer_stringify(Module['_interpreter_stack_trace']()); - }; - }); -} - -extern "C" int EMSCRIPTEN_KEEPALIVE interpreter_stack_trace() { - std::string stack = instance->printFunctionStack(); - return (int)strdup(stack.c_str()); // XXX leak -} - -// Does a call from js into an export of the module. 
-extern "C" void EMSCRIPTEN_KEEPALIVE call_from_js(const char *target) { - if (wasmJSDebug) std::cout << "call_from_js " << target << '\n'; - - IString exportName(target); - IString functionName = instance->wasm.getExport(exportName)->value; - Function *function = instance->wasm.getFunction(functionName); - assert(function); - size_t seen = EM_ASM_INT_V({ return Module['tempArguments'].length }); - size_t actual = function->params.size(); - LiteralList arguments; - for (size_t i = 0; i < actual; i++) { - Type type = function->params[i]; - // add the parameter, with a zero value if JS did not provide it. - if (type == i32) { - arguments.push_back(Literal(i < seen ? EM_ASM_INT({ return Module['tempArguments'][$0] }, i) : (int32_t)0)); - } else if (type == f32) { - arguments.push_back(Literal(i < seen ? (float)EM_ASM_DOUBLE({ return Module['tempArguments'][$0] }, i) : (float)0.0)); - } else if (type == f64) { - arguments.push_back(Literal(i < seen ? EM_ASM_DOUBLE({ return Module['tempArguments'][$0] }, i) : (double)0.0)); - } else { - abort(); - } - } - Literal ret = instance->callExport(exportName, arguments); - - if (wasmJSDebug) std::cout << "call_from_js returning " << ret << '\n'; - - if (ret.type == none) EM_ASM({ Module['tempReturn'] = undefined }); - else if (ret.type == i32) EM_ASM_({ Module['tempReturn'] = $0 }, ret.geti32()); - else if (ret.type == f32) EM_ASM_({ Module['tempReturn'] = $0 }, ret.getf32()); - else if (ret.type == f64) EM_ASM_({ Module['tempReturn'] = $0 }, ret.getf64()); - else abort(); -} diff --git a/src/wasm-s-parser.h b/src/wasm-s-parser.h index 517398c5c..0845fa70e 100644 --- a/src/wasm-s-parser.h +++ b/src/wasm-s-parser.h @@ -190,6 +190,11 @@ private: Expression* makeAtomicCmpxchg(Element& s, Type type, uint8_t bytes, const char* extra); Expression* makeAtomicWait(Element& s, Type type); Expression* makeAtomicWake(Element& s); + Expression* makeSIMDExtract(Element& s, SIMDExtractOp op, size_t lanes); + Expression* 
makeSIMDReplace(Element& s, SIMDReplaceOp op, size_t lanes); + Expression* makeSIMDShuffle(Element& s); + Expression* makeSIMDBitselect(Element& s); + Expression* makeSIMDShift(Element& s, SIMDShiftOp); Expression* makeIf(Element& s); Expression* makeMaybeBlock(Element& s, size_t i, Type type); Expression* makeLoop(Element& s); diff --git a/src/wasm-stack.h b/src/wasm-stack.h index 6e1150981..c64fd2759 100644 --- a/src/wasm-stack.h +++ b/src/wasm-stack.h @@ -135,6 +135,11 @@ public: void visitAtomicCmpxchg(AtomicCmpxchg* curr); void visitAtomicWait(AtomicWait* curr); void visitAtomicWake(AtomicWake* curr); + void visitSIMDExtract(SIMDExtract* curr); + void visitSIMDReplace(SIMDReplace* curr); + void visitSIMDShuffle(SIMDShuffle* curr); + void visitSIMDBitselect(SIMDBitselect* curr); + void visitSIMDShift(SIMDShift* curr); void visitConst(Const* curr); void visitUnary(Unary* curr); void visitBinary(Binary* curr); @@ -634,7 +639,7 @@ void StackWriter<Mode, Parent>::visitLoad(Load* curr) { } case f32: o << int8_t(BinaryConsts::F32LoadMem); break; case f64: o << int8_t(BinaryConsts::F64LoadMem); break; - case v128: assert(false && "v128 not implemented yet"); + case v128: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::V128Load); break; case unreachable: return; // the pointer is unreachable, so we are never reached; just don't emit a load case none: WASM_UNREACHABLE(); } @@ -701,7 +706,7 @@ void StackWriter<Mode, Parent>::visitStore(Store* curr) { } case f32: o << int8_t(BinaryConsts::F32StoreMem); break; case f64: o << int8_t(BinaryConsts::F64StoreMem); break; - case v128: assert(false && "v128 not implemented yet"); + case v128: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::V128Store); break; case none: case unreachable: WASM_UNREACHABLE(); } @@ -872,6 +877,84 @@ void StackWriter<Mode, Parent>::visitAtomicWake(AtomicWake* curr) { } template<StackWriterMode Mode, typename Parent> +void StackWriter<Mode, 
Parent>::visitSIMDExtract(SIMDExtract* curr) { + visitChild(curr->vec); + if (justAddToStack(curr)) return; + o << int8_t(BinaryConsts::SIMDPrefix); + switch (curr->op) { + case ExtractLaneSVecI8x16: o << U32LEB(BinaryConsts::I8x16ExtractLaneS); break; + case ExtractLaneUVecI8x16: o << U32LEB(BinaryConsts::I8x16ExtractLaneU); break; + case ExtractLaneSVecI16x8: o << U32LEB(BinaryConsts::I16x8ExtractLaneS); break; + case ExtractLaneUVecI16x8: o << U32LEB(BinaryConsts::I16x8ExtractLaneU); break; + case ExtractLaneVecI32x4: o << U32LEB(BinaryConsts::I32x4ExtractLane); break; + case ExtractLaneVecI64x2: o << U32LEB(BinaryConsts::I64x2ExtractLane); break; + case ExtractLaneVecF32x4: o << U32LEB(BinaryConsts::F32x4ExtractLane); break; + case ExtractLaneVecF64x2: o << U32LEB(BinaryConsts::F64x2ExtractLane); break; + } + o << uint8_t(curr->index); +} + +template<StackWriterMode Mode, typename Parent> +void StackWriter<Mode, Parent>::visitSIMDReplace(SIMDReplace* curr) { + visitChild(curr->vec); + visitChild(curr->value); + if (justAddToStack(curr)) return; + o << int8_t(BinaryConsts::SIMDPrefix); + switch (curr->op) { + case ReplaceLaneVecI8x16: o << U32LEB(BinaryConsts::I8x16ReplaceLane); break; + case ReplaceLaneVecI16x8: o << U32LEB(BinaryConsts::I16x8ReplaceLane); break; + case ReplaceLaneVecI32x4: o << U32LEB(BinaryConsts::I32x4ReplaceLane); break; + case ReplaceLaneVecI64x2: o << U32LEB(BinaryConsts::I64x2ReplaceLane); break; + case ReplaceLaneVecF32x4: o << U32LEB(BinaryConsts::F32x4ReplaceLane); break; + case ReplaceLaneVecF64x2: o << U32LEB(BinaryConsts::F64x2ReplaceLane); break; + } + assert(curr->index < 16); + o << uint8_t(curr->index); +} + +template<StackWriterMode Mode, typename Parent> +void StackWriter<Mode, Parent>::visitSIMDShuffle(SIMDShuffle* curr) { + visitChild(curr->left); + visitChild(curr->right); + if (justAddToStack(curr)) return; + o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::V8x16Shuffle); + for (uint8_t m : curr->mask) { + o 
<< m; + } +} + +template<StackWriterMode Mode, typename Parent> +void StackWriter<Mode, Parent>::visitSIMDBitselect(SIMDBitselect* curr) { + visitChild(curr->left); + visitChild(curr->right); + visitChild(curr->cond); + if (justAddToStack(curr)) return; + o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::V128Bitselect); +} + +template<StackWriterMode Mode, typename Parent> +void StackWriter<Mode, Parent>::visitSIMDShift(SIMDShift* curr) { + visitChild(curr->vec); + visitChild(curr->shift); + if (justAddToStack(curr)) return; + o << int8_t(BinaryConsts::SIMDPrefix); + switch (curr->op) { + case ShlVecI8x16: o << U32LEB(BinaryConsts::I8x16Shl); break; + case ShrSVecI8x16: o << U32LEB(BinaryConsts::I8x16ShrS); break; + case ShrUVecI8x16: o << U32LEB(BinaryConsts::I8x16ShrU); break; + case ShlVecI16x8: o << U32LEB(BinaryConsts::I16x8Shl); break; + case ShrSVecI16x8: o << U32LEB(BinaryConsts::I16x8ShrS); break; + case ShrUVecI16x8: o << U32LEB(BinaryConsts::I16x8ShrU); break; + case ShlVecI32x4: o << U32LEB(BinaryConsts::I32x4Shl); break; + case ShrSVecI32x4: o << U32LEB(BinaryConsts::I32x4ShrS); break; + case ShrUVecI32x4: o << U32LEB(BinaryConsts::I32x4ShrU); break; + case ShlVecI64x2: o << U32LEB(BinaryConsts::I64x2Shl); break; + case ShrSVecI64x2: o << U32LEB(BinaryConsts::I64x2ShrS); break; + case ShrUVecI64x2: o << U32LEB(BinaryConsts::I64x2ShrU); break; + } +} + +template<StackWriterMode Mode, typename Parent> void StackWriter<Mode, Parent>::visitConst(Const* curr) { if (debug) std::cerr << "zz node: Const" << curr << " : " << curr->type << std::endl; if (justAddToStack(curr)) return; @@ -892,9 +975,17 @@ void StackWriter<Mode, Parent>::visitConst(Const* curr) { o << int8_t(BinaryConsts::F64Const) << curr->value.reinterpreti64(); break; } - case v128: assert(false && "v128 not implemented yet"); + case v128: { + o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::V128Const); + std::array<uint8_t, 16> v = curr->value.getv128(); + for (size_t 
i = 0; i < 16; ++i) { + o << uint8_t(v[i]); + } + break; + } case none: - case unreachable: WASM_UNREACHABLE(); + case unreachable: + WASM_UNREACHABLE(); } if (debug) std::cerr << "zz const node done.\n"; } @@ -969,6 +1060,39 @@ void StackWriter<Mode, Parent>::visitUnary(Unary* curr) { case TruncSatUFloat32ToInt64: o << int8_t(BinaryConsts::TruncSatPrefix) << U32LEB(BinaryConsts::I64UTruncSatF32); break; case TruncSatSFloat64ToInt64: o << int8_t(BinaryConsts::TruncSatPrefix) << U32LEB(BinaryConsts::I64STruncSatF64); break; case TruncSatUFloat64ToInt64: o << int8_t(BinaryConsts::TruncSatPrefix) << U32LEB(BinaryConsts::I64UTruncSatF64); break; + case SplatVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16Splat); break; + case SplatVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8Splat); break; + case SplatVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4Splat); break; + case SplatVecI64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I64x2Splat); break; + case SplatVecF32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4Splat); break; + case SplatVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F64x2Splat); break; + case NotVec128: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::V128Not); break; + case NegVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16Neg); break; + case AnyTrueVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16AnyTrue); break; + case AllTrueVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16AllTrue); break; + case NegVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8Neg); break; + case AnyTrueVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8AnyTrue); break; + case AllTrueVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8AllTrue); 
break; + case NegVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4Neg); break; + case AnyTrueVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4AnyTrue); break; + case AllTrueVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4AllTrue); break; + case NegVecI64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I64x2Neg); break; + case AnyTrueVecI64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I64x2AnyTrue); break; + case AllTrueVecI64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I64x2AllTrue); break; + case AbsVecF32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4Abs); break; + case NegVecF32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4Neg); break; + case SqrtVecF32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4Sqrt); break; + case AbsVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F64x2Abs); break; + case NegVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F64x2Neg); break; + case SqrtVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F64x2Sqrt); break; + case TruncSatSVecF32x4ToVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4TruncSatSF32x4); break; + case TruncSatUVecF32x4ToVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4TruncSatUF32x4); break; + case TruncSatSVecF64x2ToVecI64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I64x2TruncSatSF64x2); break; + case TruncSatUVecF64x2ToVecI64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I64x2TruncSatUF64x2); break; + case ConvertSVecI32x4ToVecF32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4ConvertSI32x4); break; + case ConvertUVecI32x4ToVecF32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4ConvertUI32x4); 
break; + case ConvertSVecI64x2ToVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F64x2ConvertSI64x2); break; + case ConvertUVecI64x2ToVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F64x2ConvertUI64x2); break; case InvalidUnary: WASM_UNREACHABLE(); } } @@ -1063,6 +1187,85 @@ void StackWriter<Mode, Parent>::visitBinary(Binary* curr) { case LeFloat64: o << int8_t(BinaryConsts::F64Le); break; case GtFloat64: o << int8_t(BinaryConsts::F64Gt); break; case GeFloat64: o << int8_t(BinaryConsts::F64Ge); break; + + case EqVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16Eq); break; + case NeVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16Ne); break; + case LtSVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16LtS); break; + case LtUVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16LtU); break; + case GtSVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16GtS); break; + case GtUVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16GtU); break; + case LeSVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16LeS); break; + case LeUVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16LeU); break; + case GeSVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16GeS); break; + case GeUVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16GeU); break; + case EqVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8Eq); break; + case NeVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8Ne); break; + case LtSVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8LtS); break; + case LtUVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8LtU); break; + case GtSVecI16x8: o << 
int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8GtS); break; + case GtUVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8GtU); break; + case LeSVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8LeS); break; + case LeUVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8LeU); break; + case GeSVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8GeS); break; + case GeUVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8GeU); break; + case EqVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4Eq); break; + case NeVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4Ne); break; + case LtSVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4LtS); break; + case LtUVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4LtU); break; + case GtSVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4GtS); break; + case GtUVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4GtU); break; + case LeSVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4LeS); break; + case LeUVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4LeU); break; + case GeSVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4GeS); break; + case GeUVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4GeU); break; + case EqVecF32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4Eq); break; + case NeVecF32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4Ne); break; + case LtVecF32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4Lt); break; + case GtVecF32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4Gt); break; + case LeVecF32x4: o << 
int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4Le); break; + case GeVecF32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4Ge); break; + case EqVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F64x2Eq); break; + case NeVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F64x2Ne); break; + case LtVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F64x2Lt); break; + case GtVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F64x2Gt); break; + case LeVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F64x2Le); break; + case GeVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F64x2Ge); break; + case AndVec128: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::V128And); break; + case OrVec128: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::V128Or); break; + case XorVec128: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::V128Xor); break; + + case AddVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16Add); break; + case AddSatSVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16AddSatS); break; + case AddSatUVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16AddSatU); break; + case SubVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16Sub); break; + case SubSatSVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16SubSatS); break; + case SubSatUVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16SubSatU); break; + case MulVecI8x16: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I8x16Mul); break; + case AddVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8Add); break; + case AddSatSVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8AddSatS); break; + case 
AddSatUVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8AddSatU); break; + case SubVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8Sub); break; + case SubSatSVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8SubSatS); break; + case SubSatUVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8SubSatU); break; + case MulVecI16x8: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I16x8Mul); break; + case AddVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4Add); break; + case SubVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4Sub); break; + case MulVecI32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I32x4Mul); break; + case AddVecI64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I64x2Add); break; + case SubVecI64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::I64x2Sub); break; + + case AddVecF32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4Add); break; + case SubVecF32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4Sub); break; + case MulVecF32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4Mul); break; + case DivVecF32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4Div); break; + case MinVecF32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4Min); break; + case MaxVecF32x4: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F32x4Max); break; + case AddVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F64x2Add); break; + case SubVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F64x2Sub); break; + case MulVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F64x2Mul); break; + case DivVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << 
U32LEB(BinaryConsts::F64x2Div); break; + case MinVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F64x2Min); break; + case MaxVecF64x2: o << int8_t(BinaryConsts::SIMDPrefix) << U32LEB(BinaryConsts::F64x2Max); break; case InvalidBinary: WASM_UNREACHABLE(); } } diff --git a/src/wasm-traversal.h b/src/wasm-traversal.h index 9ea1124d0..c79be1c10 100644 --- a/src/wasm-traversal.h +++ b/src/wasm-traversal.h @@ -54,6 +54,11 @@ struct Visitor { ReturnType visitAtomicCmpxchg(AtomicCmpxchg* curr) { return ReturnType(); } ReturnType visitAtomicWait(AtomicWait* curr) { return ReturnType(); } ReturnType visitAtomicWake(AtomicWake* curr) { return ReturnType(); } + ReturnType visitSIMDExtract(SIMDExtract* curr) { return ReturnType(); } + ReturnType visitSIMDReplace(SIMDReplace* curr) { return ReturnType(); } + ReturnType visitSIMDShuffle(SIMDShuffle* curr) { return ReturnType(); } + ReturnType visitSIMDBitselect(SIMDBitselect* curr) { return ReturnType(); } + ReturnType visitSIMDShift(SIMDShift* curr) { return ReturnType(); } ReturnType visitConst(Const* curr) { return ReturnType(); } ReturnType visitUnary(Unary* curr) { return ReturnType(); } ReturnType visitBinary(Binary* curr) { return ReturnType(); } @@ -97,6 +102,11 @@ struct Visitor { case Expression::Id::AtomicCmpxchgId: DELEGATE(AtomicCmpxchg); case Expression::Id::AtomicWaitId: DELEGATE(AtomicWait); case Expression::Id::AtomicWakeId: DELEGATE(AtomicWake); + case Expression::Id::SIMDExtractId: DELEGATE(SIMDExtract); + case Expression::Id::SIMDReplaceId: DELEGATE(SIMDReplace); + case Expression::Id::SIMDShuffleId: DELEGATE(SIMDShuffle); + case Expression::Id::SIMDBitselectId: DELEGATE(SIMDBitselect); + case Expression::Id::SIMDShiftId: DELEGATE(SIMDShift); case Expression::Id::ConstId: DELEGATE(Const); case Expression::Id::UnaryId: DELEGATE(Unary); case Expression::Id::BinaryId: DELEGATE(Binary); @@ -142,6 +152,11 @@ struct OverriddenVisitor { UNIMPLEMENTED(AtomicCmpxchg); UNIMPLEMENTED(AtomicWait); 
UNIMPLEMENTED(AtomicWake); + UNIMPLEMENTED(SIMDExtract); + UNIMPLEMENTED(SIMDReplace); + UNIMPLEMENTED(SIMDShuffle); + UNIMPLEMENTED(SIMDBitselect); + UNIMPLEMENTED(SIMDShift); UNIMPLEMENTED(Const); UNIMPLEMENTED(Unary); UNIMPLEMENTED(Binary); @@ -186,6 +201,11 @@ struct OverriddenVisitor { case Expression::Id::AtomicCmpxchgId: DELEGATE(AtomicCmpxchg); case Expression::Id::AtomicWaitId: DELEGATE(AtomicWait); case Expression::Id::AtomicWakeId: DELEGATE(AtomicWake); + case Expression::Id::SIMDExtractId: DELEGATE(SIMDExtract); + case Expression::Id::SIMDReplaceId: DELEGATE(SIMDReplace); + case Expression::Id::SIMDShuffleId: DELEGATE(SIMDShuffle); + case Expression::Id::SIMDBitselectId: DELEGATE(SIMDBitselect); + case Expression::Id::SIMDShiftId: DELEGATE(SIMDShift); case Expression::Id::ConstId: DELEGATE(Const); case Expression::Id::UnaryId: DELEGATE(Unary); case Expression::Id::BinaryId: DELEGATE(Binary); @@ -229,6 +249,11 @@ struct UnifiedExpressionVisitor : public Visitor<SubType, ReturnType> { ReturnType visitAtomicCmpxchg(AtomicCmpxchg* curr) { return static_cast<SubType*>(this)->visitExpression(curr); } ReturnType visitAtomicWait(AtomicWait* curr) { return static_cast<SubType*>(this)->visitExpression(curr); } ReturnType visitAtomicWake(AtomicWake* curr) { return static_cast<SubType*>(this)->visitExpression(curr); } + ReturnType visitSIMDExtract(SIMDExtract* curr) { return static_cast<SubType*>(this)->visitExpression(curr); } + ReturnType visitSIMDReplace(SIMDReplace* curr) { return static_cast<SubType*>(this)->visitExpression(curr); } + ReturnType visitSIMDShuffle(SIMDShuffle* curr) { return static_cast<SubType*>(this)->visitExpression(curr); } + ReturnType visitSIMDBitselect(SIMDBitselect* curr) { return static_cast<SubType*>(this)->visitExpression(curr); } + ReturnType visitSIMDShift(SIMDShift* curr) { return static_cast<SubType*>(this)->visitExpression(curr); } ReturnType visitConst(Const* curr) { return static_cast<SubType*>(this)->visitExpression(curr); } 
ReturnType visitUnary(Unary* curr) { return static_cast<SubType*>(this)->visitExpression(curr); } ReturnType visitBinary(Binary* curr) { return static_cast<SubType*>(this)->visitExpression(curr); } @@ -414,6 +439,11 @@ struct Walker : public VisitorType { static void doVisitAtomicCmpxchg(SubType* self, Expression** currp){ self->visitAtomicCmpxchg((*currp)->cast<AtomicCmpxchg>()); } static void doVisitAtomicWait(SubType* self, Expression** currp) { self->visitAtomicWait((*currp)->cast<AtomicWait>()); } static void doVisitAtomicWake(SubType* self, Expression** currp) { self->visitAtomicWake((*currp)->cast<AtomicWake>()); } + static void doVisitSIMDExtract(SubType* self, Expression** currp) { self->visitSIMDExtract((*currp)->cast<SIMDExtract>()); } + static void doVisitSIMDReplace(SubType* self, Expression** currp) { self->visitSIMDReplace((*currp)->cast<SIMDReplace>()); } + static void doVisitSIMDShuffle(SubType* self, Expression** currp) { self->visitSIMDShuffle((*currp)->cast<SIMDShuffle>()); } + static void doVisitSIMDBitselect(SubType* self, Expression** currp) { self->visitSIMDBitselect((*currp)->cast<SIMDBitselect>()); } + static void doVisitSIMDShift(SubType* self, Expression** currp) { self->visitSIMDShift((*currp)->cast<SIMDShift>()); } static void doVisitConst(SubType* self, Expression** currp) { self->visitConst((*currp)->cast<Const>()); } static void doVisitUnary(SubType* self, Expression** currp) { self->visitUnary((*currp)->cast<Unary>()); } static void doVisitBinary(SubType* self, Expression** currp) { self->visitBinary((*currp)->cast<Binary>()); } @@ -554,6 +584,36 @@ struct PostWalker : public Walker<SubType, VisitorType> { self->pushTask(SubType::scan, &curr->cast<AtomicWake>()->ptr); break; } + case Expression::Id::SIMDExtractId: { + self->pushTask(SubType::doVisitSIMDExtract, currp); + self->pushTask(SubType::scan, &curr->cast<SIMDExtract>()->vec); + break; + } + case Expression::Id::SIMDReplaceId: { + self->pushTask(SubType::doVisitSIMDReplace, 
currp); + self->pushTask(SubType::scan, &curr->cast<SIMDReplace>()->value); + self->pushTask(SubType::scan, &curr->cast<SIMDReplace>()->vec); + break; + } + case Expression::Id::SIMDShuffleId: { + self->pushTask(SubType::doVisitSIMDShuffle, currp); + self->pushTask(SubType::scan, &curr->cast<SIMDShuffle>()->right); + self->pushTask(SubType::scan, &curr->cast<SIMDShuffle>()->left); + break; + } + case Expression::Id::SIMDBitselectId: { + self->pushTask(SubType::doVisitSIMDBitselect, currp); + self->pushTask(SubType::scan, &curr->cast<SIMDBitselect>()->cond); + self->pushTask(SubType::scan, &curr->cast<SIMDBitselect>()->right); + self->pushTask(SubType::scan, &curr->cast<SIMDBitselect>()->left); + break; + } + case Expression::Id::SIMDShiftId: { + self->pushTask(SubType::doVisitSIMDShift, currp); + self->pushTask(SubType::scan, &curr->cast<SIMDShift>()->shift); + self->pushTask(SubType::scan, &curr->cast<SIMDShift>()->vec); + break; + } case Expression::Id::ConstId: { self->pushTask(SubType::doVisitConst, currp); break; diff --git a/src/wasm.h b/src/wasm.h index b09b4d7d3..27a302d3c 100644 --- a/src/wasm.h +++ b/src/wasm.h @@ -25,6 +25,7 @@ #define wasm_wasm_h #include <algorithm> +#include <array> #include <cassert> #include <map> #include <string> @@ -43,7 +44,8 @@ struct FeatureSet { Atomics = 1 << 0, MutableGlobals = 1 << 1, TruncSat = 1 << 2, - All = Atomics | MutableGlobals | TruncSat + SIMD = 1 << 3, + All = Atomics | MutableGlobals | TruncSat | SIMD }; FeatureSet() : features(MVP) {} @@ -54,16 +56,22 @@ struct FeatureSet { bool hasAtomics() const { return features & Atomics; } bool hasMutableGlobals() const { return features & MutableGlobals; } bool hasTruncSat() const { return features & TruncSat; } - bool hasAll() const { return features & (Atomics | MutableGlobals | TruncSat); } + bool hasSIMD() const { return features & SIMD; } + bool hasAll() const { return features & All; } void makeMVP() { features = MVP; } void set(Feature f, bool v = true) { features 
= v ? (features | f) : (features & ~f); } void setAtomics(bool v = true) { set(Atomics, v); } void setMutableGlobals(bool v = true) { set(MutableGlobals, v); } void setTruncSat(bool v = true) { set(TruncSat, v); } + void setSIMD(bool v = true) { set(SIMD, v); } void setAll(bool v = true) { features = v ? All : MVP; } - private: + bool operator<=(const FeatureSet& other) { + return !(features & ~other.features); + } + +private: uint32_t features; }; @@ -116,6 +124,15 @@ enum UnaryOp { // Saturating float-to-int TruncSatSFloat32ToInt32, TruncSatUFloat32ToInt32, TruncSatSFloat64ToInt32, TruncSatUFloat64ToInt32, TruncSatSFloat32ToInt64, TruncSatUFloat32ToInt64, TruncSatSFloat64ToInt64, TruncSatUFloat64ToInt64, + // SIMD splats + SplatVecI8x16, SplatVecI16x8, SplatVecI32x4, SplatVecI64x2, SplatVecF32x4, SplatVecF64x2, + // SIMD arithmetic + NotVec128, + NegVecI8x16, AnyTrueVecI8x16, AllTrueVecI8x16, NegVecI16x8, AnyTrueVecI16x8, AllTrueVecI16x8, + NegVecI32x4, AnyTrueVecI32x4, AllTrueVecI32x4, NegVecI64x2, AnyTrueVecI64x2, AllTrueVecI64x2, + AbsVecF32x4, NegVecF32x4, SqrtVecF32x4, AbsVecF64x2, NegVecF64x2, SqrtVecF64x2, + TruncSatSVecF32x4ToVecI32x4, TruncSatUVecF32x4ToVecI32x4, TruncSatSVecF64x2ToVecI64x2, TruncSatUVecF64x2ToVecI64x2, + ConvertSVecI32x4ToVecF32x4, ConvertUVecI32x4ToVecF32x4, ConvertSVecI64x2ToVecF64x2, ConvertUVecI64x2ToVecF64x2, InvalidUnary }; @@ -144,6 +161,19 @@ enum BinaryOp { // relational ops EqFloat64, NeFloat64, // int or float LtFloat64, LeFloat64, GtFloat64, GeFloat64, // float + // SIMD relational ops (return vectors) + EqVecI8x16, NeVecI8x16, LtSVecI8x16, LtUVecI8x16, GtSVecI8x16, GtUVecI8x16, LeSVecI8x16, LeUVecI8x16, GeSVecI8x16, GeUVecI8x16, + EqVecI16x8, NeVecI16x8, LtSVecI16x8, LtUVecI16x8, GtSVecI16x8, GtUVecI16x8, LeSVecI16x8, LeUVecI16x8, GeSVecI16x8, GeUVecI16x8, + EqVecI32x4, NeVecI32x4, LtSVecI32x4, LtUVecI32x4, GtSVecI32x4, GtUVecI32x4, LeSVecI32x4, LeUVecI32x4, GeSVecI32x4, GeUVecI32x4, + EqVecF32x4, NeVecF32x4, LtVecF32x4, 
// IR node for a SIMD extract_lane operation: reads one scalar lane out of a
// 128-bit vector operand.
class SIMDExtract : public SpecificExpression<Expression::SIMDExtractId> {
 public:
  SIMDExtract() = default;
  SIMDExtract(MixedArena& allocator) : SIMDExtract() {}

  SIMDExtractOp op;    // which lane shape/signedness to extract (e.g. ExtractLaneSVecI8x16)
  Expression* vec;     // the v128 operand
  uint8_t index;       // lane index; must be < lane count for `op`

  void finalize();
};
// IR node for v8x16.shuffle: byte-wise shuffle of two v128 operands.
class SIMDShuffle : public SpecificExpression<Expression::SIMDShuffleId> {
 public:
  SIMDShuffle() = default;
  SIMDShuffle(MixedArena& allocator) : SIMDShuffle() {}

  Expression* left;
  Expression* right;
  // Per-output-byte selector: values 0-15 pick a byte of `left`,
  // 16-31 pick byte (mask[i] - 16) of `right` (see Literal::shuffleV8x16).
  std::array<uint8_t, 16> mask;

  void finalize();
};
// Writes this literal's raw bit pattern into `buf`, zero-filling any bytes
// beyond the value's width (so i32/f32 fill 4 bytes, i64/f64 fill 8, v128
// fills all 16). f32/f64 share storage with i32/i64, so their cases copy the
// integer members directly.
void Literal::getBits(uint8_t (&buf)[16]) const {
  memset(buf, 0, 16);  // high bytes of narrow types must compare equal
  switch (type) {
    case Type::i32:
    case Type::f32: memcpy(buf, &i32, sizeof(i32)); break;
    case Type::i64:
    case Type::f64: memcpy(buf, &i64, sizeof(i64)); break;
    case Type::v128: memcpy(buf, &v128, sizeof(v128)); break;
    case Type::none:
    case Type::unreachable: WASM_UNREACHABLE();
  }
}
// Signed saturating addition: a + b clamped to [min(T), max(T)].
// The sum is computed in T's unsigned counterpart so wraparound is
// well-defined, then the classic sign-bit trick detects overflow: the result
// overflowed iff its sign differs from the sign of BOTH operands.
template<typename T>
static T add_sat_s(T a, T b) {
  static_assert(std::is_signed<T>::value, "Trying to instantiate add_sat_s with unsigned type");
  using UT = typename std::make_unsigned<T>::type;
  const UT ua = static_cast<UT>(a);
  const UT ub = static_cast<UT>(b);
  const UT wrapped = ua + ub;
  const bool overflowed = static_cast<T>((wrapped ^ ua) & (wrapped ^ ub)) < 0;
  if (!overflowed) {
    return static_cast<T>(wrapped);
  }
  // Saturate toward the sign of the (equal-signed) operands.
  return a < 0 ? std::numeric_limits<T>::min() : std::numeric_limits<T>::max();
}
// Unsigned saturating subtraction: a - b, clamped below at zero.
template<typename T>
static T sub_sat_u(T a, T b) {
  static_assert(std::is_unsigned<T>::value, "Trying to instantiate sub_sat_u with signed type");
  // If b exceeds a the true difference would be negative; saturate to 0.
  if (b > a) {
    return T(0);
  }
  return static_cast<T>(a - b);
}
// Decodes this v128 as four f32 lanes: first split into i32 lanes, then
// reinterpret each lane's bits as f32 (castToF32 is a bit cast, not a
// numeric conversion).
LaneArray<4> Literal::getLanesF32x4() const {
  auto lanes = getLanesI32x4();
  for (size_t i = 0; i < lanes.size(); ++i) {
    lanes[i] = lanes[i].castToF32();
  }
  return lanes;
}
// Builds a v128 by replicating the scalar `val` (which must have type `Ty`)
// into all `Lanes` lanes; the Literal(LaneArray) constructor repacks the
// lanes into 16 bytes.
template<Type Ty, int Lanes>
static Literal splat(const Literal& val) {
  assert(val.type == Ty);
  LaneArray<Lanes> lanes;
  lanes.fill(val);
  return Literal(lanes);
}
// Lane-wise unary operation on a v128: `IntoLanes` decodes the vector into
// scalar lanes, the scalar member function `UnaryOp` is applied to each lane,
// and the results are repacked into a new v128.
template<int Lanes, LaneArray<Lanes> (Literal::*IntoLanes)() const,
         Literal (Literal::*UnaryOp)(void) const>
static Literal unary(const Literal& val) {
  LaneArray<Lanes> lanes = (val.*IntoLanes)();
  for (size_t i = 0; i < Lanes; ++i) {
    lanes[i] = (lanes[i].*UnaryOp)();
  }
  return Literal(lanes);
}
// Implements the SIMD any_true reduction: returns i32 1 if at least one lane
// of the decoded vector is non-zero, else i32 0.
template<int Lanes, LaneArray<Lanes> (Literal::*IntoLanes)() const>
static Literal any_true(const Literal& val) {
  LaneArray<Lanes> lanes = (val.*IntoLanes)();
  for (size_t i = 0; i < Lanes; ++i) {
    if (lanes[i] != Literal::makeZero(lanes[i].type)) {
      return Literal(int32_t(1));
    }
  }
  return Literal(int32_t(0));
}
// Lane-wise shift of a v128 by a scalar i32 amount. Per wasm SIMD semantics
// the shift count is taken modulo the lane width in bits (128 / Lanes), so
// over-wide counts wrap instead of producing undefined behavior.
template<int Lanes, LaneArray<Lanes> (Literal::*IntoLanes)() const,
         Literal (Literal::*ShiftOp)(const Literal&) const>
static Literal shift(const Literal& vec, const Literal& shift) {
  assert(shift.type == Type::i32);
  size_t lane_bits = 128 / Lanes;
  LaneArray<Lanes> lanes = (vec.*IntoLanes)();
  for (size_t i = 0; i < Lanes; ++i) {
    lanes[i] = (lanes[i].*ShiftOp)(Literal(int32_t(shift.geti32() % lane_bits)));
  }
  return Literal(lanes);
}
// Lane-wise comparison of two v128 values. Each lane of the result is an
// all-ones mask (LaneT(-1)) where the scalar comparison `CompareOp` returned
// i32 1, and all-zeros otherwise. `LaneT` defaults to int32_t; f64x2
// comparisons pass int64_t so the 64-bit lanes are fully set/cleared.
template<int Lanes, LaneArray<Lanes> (Literal::*IntoLanes)() const,
         Literal (Literal::*CompareOp)(const Literal&) const,
         typename LaneT = int32_t>
static Literal compare(const Literal& val, const Literal& other) {
  LaneArray<Lanes> lanes = (val.*IntoLanes)();
  LaneArray<Lanes> other_lanes = (other.*IntoLanes)();
  for (size_t i = 0; i < Lanes; ++i) {
    lanes[i] = (lanes[i].*CompareOp)(other_lanes[i]) == Literal(int32_t(1))
                   ? Literal(LaneT(-1))
                   : Literal(LaneT(0));
  }
  return Literal(lanes);
}
// Lane-wise binary operation on two v128 values: decode both operands with
// `IntoLanes`, apply the scalar member function `BinaryOp` pairwise, and
// repack the results into a new v128. Used by all the addX/subX/mulX/etc.
// wrappers that follow.
template<int Lanes, LaneArray<Lanes> (Literal::*IntoLanes)() const,
         Literal (Literal::*BinaryOp)(const Literal&) const>
static Literal binary(const Literal& val, const Literal& other) {
  LaneArray<Lanes> lanes = (val.*IntoLanes)();
  LaneArray<Lanes> other_lanes = (other.*IntoLanes)();
  for (size_t i = 0; i < Lanes; ++i) {
    lanes[i] = (lanes[i].*BinaryOp)(other_lanes[i]);
  }
  return Literal(lanes);
}
other); +} +Literal Literal::maxF32x4(const Literal& other) const { + return binary<4, &Literal::getLanesF32x4, &Literal::max>(*this, other); +} +Literal Literal::addF64x2(const Literal& other) const { + return binary<2, &Literal::getLanesF64x2, &Literal::add>(*this, other); +} +Literal Literal::subF64x2(const Literal& other) const { + return binary<2, &Literal::getLanesF64x2, &Literal::sub>(*this, other); +} +Literal Literal::mulF64x2(const Literal& other) const { + return binary<2, &Literal::getLanesF64x2, &Literal::mul>(*this, other); +} +Literal Literal::divF64x2(const Literal& other) const { + return binary<2, &Literal::getLanesF64x2, &Literal::div>(*this, other); +} +Literal Literal::minF64x2(const Literal& other) const { + return binary<2, &Literal::getLanesF64x2, &Literal::min>(*this, other); +} +Literal Literal::maxF64x2(const Literal& other) const { + return binary<2, &Literal::getLanesF64x2, &Literal::max>(*this, other); +} + +Literal Literal::bitselectV128(const Literal& left, const Literal& right) const { + return andV128(left).orV128(notV128().andV128(right)); +} + } // namespace wasm diff --git a/src/wasm/wasm-binary.cpp b/src/wasm/wasm-binary.cpp index 2a7bff51f..af42ed8a4 100644 --- a/src/wasm/wasm-binary.cpp +++ b/src/wasm/wasm-binary.cpp @@ -757,6 +757,14 @@ uint64_t WasmBinaryBuilder::getInt64() { return ret; } +uint8_t WasmBinaryBuilder::getLaneIndex(size_t lanes) { + if (debug) std::cerr << "<==" << std::endl; + auto ret = getInt8(); + if (ret >= lanes) throwError("Illegal lane index"); + if (debug) std::cerr << "getLaneIndex(" << lanes << "): " << ret << " ==>" << std::endl; + return ret; +} + Literal WasmBinaryBuilder::getFloat32Literal() { if (debug) std::cerr << "<==" << std::endl; auto ret = Literal(getInt32()); @@ -773,6 +781,17 @@ Literal WasmBinaryBuilder::getFloat64Literal() { return ret; } +Literal WasmBinaryBuilder::getVec128Literal() { + if (debug) std::cerr << "<==" << std::endl; + std::array<uint8_t, 16> bytes; + for (auto i = 0; 
i < 16; ++i) { + bytes[i] = getInt8(); + } + auto ret = Literal(bytes.data()); + if (debug) std::cerr << "getVec128: " << ret << " ==>" << std::endl; + return ret; +} + uint32_t WasmBinaryBuilder::getU32LEB() { if (debug) std::cerr << "<==" << std::endl; U32LEB ret; @@ -822,6 +841,7 @@ Type WasmBinaryBuilder::getType() { case BinaryConsts::EncodedType::i64: return i64; case BinaryConsts::EncodedType::f32: return f32; case BinaryConsts::EncodedType::f64: return f64; + case BinaryConsts::EncodedType::v128: return v128; case BinaryConsts::EncodedType::AnyFunc: case BinaryConsts::EncodedType::Func: throwError("invalid wasm type: " + std::to_string(type)); @@ -1677,7 +1697,7 @@ BinaryConsts::ASTNodes WasmBinaryBuilder::readExpression(Expression*& curr) { case BinaryConsts::End: case BinaryConsts::Else: curr = nullptr; break; case BinaryConsts::AtomicPrefix: { - code = getInt8(); + code = static_cast<uint8_t>(getU32LEB()); if (maybeVisitLoad(curr, code, /*isAtomic=*/true)) break; if (maybeVisitStore(curr, code, /*isAtomic=*/true)) break; if (maybeVisitAtomicRMW(curr, code)) break; @@ -1688,11 +1708,26 @@ BinaryConsts::ASTNodes WasmBinaryBuilder::readExpression(Expression*& curr) { break; } case BinaryConsts::TruncSatPrefix: { - uint32_t code = getU32LEB(); - if (maybeVisitTruncSat(curr, code)) break; + auto opcode = getU32LEB(); + if (maybeVisitTruncSat(curr, opcode)) break; throwError("invalid code after nontrapping float-to-int prefix: " + std::to_string(code)); break; } + case BinaryConsts::SIMDPrefix: { + auto opcode = getU32LEB(); + if (maybeVisitSIMDBinary(curr, opcode)) break; + if (maybeVisitSIMDUnary(curr, opcode)) break; + if (maybeVisitSIMDConst(curr, opcode)) break; + if (maybeVisitSIMDLoad(curr, opcode)) break; + if (maybeVisitSIMDStore(curr, opcode)) break; + if (maybeVisitSIMDExtract(curr, opcode)) break; + if (maybeVisitSIMDReplace(curr, opcode)) break; + if (maybeVisitSIMDShuffle(curr, opcode)) break; + if (maybeVisitSIMDBitselect(curr, opcode)) break; + 
if (maybeVisitSIMDShift(curr, opcode)) break; + throwError("invalid code after SIMD prefix: " + std::to_string(opcode)); + break; + } default: { // otherwise, the code is a subcode TODO: optimize if (maybeVisitBinary(curr, code)) break; @@ -1951,10 +1986,10 @@ void WasmBinaryBuilder::visitCallIndirect(CallIndirect* curr) { void WasmBinaryBuilder::visitGetLocal(GetLocal* curr) { if (debug) std::cerr << "zz node: GetLocal " << pos << std::endl; - requireFunctionContext("get_local"); + requireFunctionContext("local.get"); curr->index = getU32LEB(); if (curr->index >= currFunction->getNumLocals()) { - throwError("bad get_local index"); + throwError("bad local.get index"); } curr->type = currFunction->getLocalType(curr->index); curr->finalize(); @@ -1962,10 +1997,10 @@ void WasmBinaryBuilder::visitGetLocal(GetLocal* curr) { void WasmBinaryBuilder::visitSetLocal(SetLocal *curr, uint8_t code) { if (debug) std::cerr << "zz node: Set|TeeLocal" << std::endl; - requireFunctionContext("set_local outside of function"); + requireFunctionContext("local.set outside of function"); curr->index = getU32LEB(); if (curr->index >= currFunction->getNumLocals()) { - throwError("bad set_local index"); + throwError("bad local.set index"); } curr->value = popNonVoidExpression(); curr->type = curr->value->type; @@ -2077,7 +2112,6 @@ bool WasmBinaryBuilder::maybeVisitStore(Expression*& out, uint8_t code, bool isA return true; } - bool WasmBinaryBuilder::maybeVisitAtomicRMW(Expression*& out, uint8_t code) { if (code < BinaryConsts::AtomicRMWOps_Begin || code > BinaryConsts::AtomicRMWOps_End) return false; auto* curr = allocator.alloc<AtomicRMW>(); @@ -2359,6 +2393,269 @@ bool WasmBinaryBuilder::maybeVisitBinary(Expression*& out, uint8_t code) { #undef FLOAT_TYPED_CODE } +bool WasmBinaryBuilder::maybeVisitSIMDBinary(Expression*& out, uint32_t code) { + Binary* curr; + switch (code) { + case BinaryConsts::I8x16Eq: curr = allocator.alloc<Binary>(); curr->op = EqVecI8x16; break; + case 
BinaryConsts::I8x16Ne: curr = allocator.alloc<Binary>(); curr->op = NeVecI8x16; break; + case BinaryConsts::I8x16LtS: curr = allocator.alloc<Binary>(); curr->op = LtSVecI8x16; break; + case BinaryConsts::I8x16LtU: curr = allocator.alloc<Binary>(); curr->op = LtUVecI8x16; break; + case BinaryConsts::I8x16GtS: curr = allocator.alloc<Binary>(); curr->op = GtSVecI8x16; break; + case BinaryConsts::I8x16GtU: curr = allocator.alloc<Binary>(); curr->op = GtUVecI8x16; break; + case BinaryConsts::I8x16LeS: curr = allocator.alloc<Binary>(); curr->op = LeSVecI8x16; break; + case BinaryConsts::I8x16LeU: curr = allocator.alloc<Binary>(); curr->op = LeUVecI8x16; break; + case BinaryConsts::I8x16GeS: curr = allocator.alloc<Binary>(); curr->op = GeSVecI8x16; break; + case BinaryConsts::I8x16GeU: curr = allocator.alloc<Binary>(); curr->op = GeUVecI8x16; break; + case BinaryConsts::I16x8Eq: curr = allocator.alloc<Binary>(); curr->op = EqVecI16x8; break; + case BinaryConsts::I16x8Ne: curr = allocator.alloc<Binary>(); curr->op = NeVecI16x8; break; + case BinaryConsts::I16x8LtS: curr = allocator.alloc<Binary>(); curr->op = LtSVecI16x8; break; + case BinaryConsts::I16x8LtU: curr = allocator.alloc<Binary>(); curr->op = LtUVecI16x8; break; + case BinaryConsts::I16x8GtS: curr = allocator.alloc<Binary>(); curr->op = GtSVecI16x8; break; + case BinaryConsts::I16x8GtU: curr = allocator.alloc<Binary>(); curr->op = GtUVecI16x8; break; + case BinaryConsts::I16x8LeS: curr = allocator.alloc<Binary>(); curr->op = LeSVecI16x8; break; + case BinaryConsts::I16x8LeU: curr = allocator.alloc<Binary>(); curr->op = LeUVecI16x8; break; + case BinaryConsts::I16x8GeS: curr = allocator.alloc<Binary>(); curr->op = GeSVecI16x8; break; + case BinaryConsts::I16x8GeU: curr = allocator.alloc<Binary>(); curr->op = GeUVecI16x8; break; + case BinaryConsts::I32x4Eq: curr = allocator.alloc<Binary>(); curr->op = EqVecI32x4; break; + case BinaryConsts::I32x4Ne: curr = allocator.alloc<Binary>(); curr->op = NeVecI32x4; break; 
+ case BinaryConsts::I32x4LtS: curr = allocator.alloc<Binary>(); curr->op = LtSVecI32x4; break; + case BinaryConsts::I32x4LtU: curr = allocator.alloc<Binary>(); curr->op = LtUVecI32x4; break; + case BinaryConsts::I32x4GtS: curr = allocator.alloc<Binary>(); curr->op = GtSVecI32x4; break; + case BinaryConsts::I32x4GtU: curr = allocator.alloc<Binary>(); curr->op = GtUVecI32x4; break; + case BinaryConsts::I32x4LeS: curr = allocator.alloc<Binary>(); curr->op = LeSVecI32x4; break; + case BinaryConsts::I32x4LeU: curr = allocator.alloc<Binary>(); curr->op = LeUVecI32x4; break; + case BinaryConsts::I32x4GeS: curr = allocator.alloc<Binary>(); curr->op = GeSVecI32x4; break; + case BinaryConsts::I32x4GeU: curr = allocator.alloc<Binary>(); curr->op = GeUVecI32x4; break; + case BinaryConsts::F32x4Eq: curr = allocator.alloc<Binary>(); curr->op = EqVecF32x4; break; + case BinaryConsts::F32x4Ne: curr = allocator.alloc<Binary>(); curr->op = NeVecF32x4; break; + case BinaryConsts::F32x4Lt: curr = allocator.alloc<Binary>(); curr->op = LtVecF32x4; break; + case BinaryConsts::F32x4Gt: curr = allocator.alloc<Binary>(); curr->op = GtVecF32x4; break; + case BinaryConsts::F32x4Le: curr = allocator.alloc<Binary>(); curr->op = LeVecF32x4; break; + case BinaryConsts::F32x4Ge: curr = allocator.alloc<Binary>(); curr->op = GeVecF32x4; break; + case BinaryConsts::F64x2Eq: curr = allocator.alloc<Binary>(); curr->op = EqVecF64x2; break; + case BinaryConsts::F64x2Ne: curr = allocator.alloc<Binary>(); curr->op = NeVecF64x2; break; + case BinaryConsts::F64x2Lt: curr = allocator.alloc<Binary>(); curr->op = LtVecF64x2; break; + case BinaryConsts::F64x2Gt: curr = allocator.alloc<Binary>(); curr->op = GtVecF64x2; break; + case BinaryConsts::F64x2Le: curr = allocator.alloc<Binary>(); curr->op = LeVecF64x2; break; + case BinaryConsts::F64x2Ge: curr = allocator.alloc<Binary>(); curr->op = GeVecF64x2; break; + case BinaryConsts::V128And: curr = allocator.alloc<Binary>(); curr->op = AndVec128; break; + case 
BinaryConsts::V128Or: curr = allocator.alloc<Binary>(); curr->op = OrVec128; break; + case BinaryConsts::V128Xor: curr = allocator.alloc<Binary>(); curr->op = XorVec128; break; + case BinaryConsts::I8x16Add: curr = allocator.alloc<Binary>(); curr->op = AddVecI8x16; break; + case BinaryConsts::I8x16AddSatS: curr = allocator.alloc<Binary>(); curr->op = AddSatSVecI8x16; break; + case BinaryConsts::I8x16AddSatU: curr = allocator.alloc<Binary>(); curr->op = AddSatUVecI8x16; break; + case BinaryConsts::I8x16Sub: curr = allocator.alloc<Binary>(); curr->op = SubVecI8x16; break; + case BinaryConsts::I8x16SubSatS: curr = allocator.alloc<Binary>(); curr->op = SubSatSVecI8x16; break; + case BinaryConsts::I8x16SubSatU: curr = allocator.alloc<Binary>(); curr->op = SubSatUVecI8x16; break; + case BinaryConsts::I8x16Mul: curr = allocator.alloc<Binary>(); curr->op = MulVecI8x16; break; + case BinaryConsts::I16x8Add: curr = allocator.alloc<Binary>(); curr->op = AddVecI16x8; break; + case BinaryConsts::I16x8AddSatS: curr = allocator.alloc<Binary>(); curr->op = AddSatSVecI16x8; break; + case BinaryConsts::I16x8AddSatU: curr = allocator.alloc<Binary>(); curr->op = AddSatUVecI16x8; break; + case BinaryConsts::I16x8Sub: curr = allocator.alloc<Binary>(); curr->op = SubVecI16x8; break; + case BinaryConsts::I16x8SubSatS: curr = allocator.alloc<Binary>(); curr->op = SubSatSVecI16x8; break; + case BinaryConsts::I16x8SubSatU: curr = allocator.alloc<Binary>(); curr->op = SubSatUVecI16x8; break; + case BinaryConsts::I16x8Mul: curr = allocator.alloc<Binary>(); curr->op = MulVecI16x8; break; + case BinaryConsts::I32x4Add: curr = allocator.alloc<Binary>(); curr->op = AddVecI32x4; break; + case BinaryConsts::I32x4Sub: curr = allocator.alloc<Binary>(); curr->op = SubVecI32x4; break; + case BinaryConsts::I32x4Mul: curr = allocator.alloc<Binary>(); curr->op = MulVecI32x4; break; + case BinaryConsts::I64x2Add: curr = allocator.alloc<Binary>(); curr->op = AddVecI64x2; break; + case BinaryConsts::I64x2Sub: 
curr = allocator.alloc<Binary>(); curr->op = SubVecI64x2; break; + case BinaryConsts::F32x4Add: curr = allocator.alloc<Binary>(); curr->op = AddVecF32x4; break; + case BinaryConsts::F32x4Sub: curr = allocator.alloc<Binary>(); curr->op = SubVecF32x4; break; + case BinaryConsts::F32x4Mul: curr = allocator.alloc<Binary>(); curr->op = MulVecF32x4; break; + case BinaryConsts::F32x4Div: curr = allocator.alloc<Binary>(); curr->op = DivVecF32x4; break; + case BinaryConsts::F32x4Min: curr = allocator.alloc<Binary>(); curr->op = MinVecF32x4; break; + case BinaryConsts::F32x4Max: curr = allocator.alloc<Binary>(); curr->op = MaxVecF32x4; break; + case BinaryConsts::F64x2Add: curr = allocator.alloc<Binary>(); curr->op = AddVecF64x2; break; + case BinaryConsts::F64x2Sub: curr = allocator.alloc<Binary>(); curr->op = SubVecF64x2; break; + case BinaryConsts::F64x2Mul: curr = allocator.alloc<Binary>(); curr->op = MulVecF64x2; break; + case BinaryConsts::F64x2Div: curr = allocator.alloc<Binary>(); curr->op = DivVecF64x2; break; + case BinaryConsts::F64x2Min: curr = allocator.alloc<Binary>(); curr->op = MinVecF64x2; break; + case BinaryConsts::F64x2Max: curr = allocator.alloc<Binary>(); curr->op = MaxVecF64x2; break; + default: return false; + } + if (debug) std::cerr << "zz node: Binary" << std::endl; + curr->right = popNonVoidExpression(); + curr->left = popNonVoidExpression(); + curr->finalize(); + out = curr; + return true; +} +bool WasmBinaryBuilder::maybeVisitSIMDUnary(Expression*& out, uint32_t code) { + Unary* curr; + switch (code) { + case BinaryConsts::I8x16Splat: curr = allocator.alloc<Unary>(); curr->op = SplatVecI8x16; break; + case BinaryConsts::I16x8Splat: curr = allocator.alloc<Unary>(); curr->op = SplatVecI16x8; break; + case BinaryConsts::I32x4Splat: curr = allocator.alloc<Unary>(); curr->op = SplatVecI32x4; break; + case BinaryConsts::I64x2Splat: curr = allocator.alloc<Unary>(); curr->op = SplatVecI64x2; break; + case BinaryConsts::F32x4Splat: curr = 
allocator.alloc<Unary>(); curr->op = SplatVecF32x4; break; + case BinaryConsts::F64x2Splat: curr = allocator.alloc<Unary>(); curr->op = SplatVecF64x2; break; + case BinaryConsts::V128Not: curr = allocator.alloc<Unary>(); curr->op = NotVec128; break; + case BinaryConsts::I8x16Neg: curr = allocator.alloc<Unary>(); curr->op = NegVecI8x16; break; + case BinaryConsts::I8x16AnyTrue: curr = allocator.alloc<Unary>(); curr->op = AnyTrueVecI8x16; break; + case BinaryConsts::I8x16AllTrue: curr = allocator.alloc<Unary>(); curr->op = AllTrueVecI8x16; break; + case BinaryConsts::I16x8Neg: curr = allocator.alloc<Unary>(); curr->op = NegVecI16x8; break; + case BinaryConsts::I16x8AnyTrue: curr = allocator.alloc<Unary>(); curr->op = AnyTrueVecI16x8; break; + case BinaryConsts::I16x8AllTrue: curr = allocator.alloc<Unary>(); curr->op = AllTrueVecI16x8; break; + case BinaryConsts::I32x4Neg: curr = allocator.alloc<Unary>(); curr->op = NegVecI32x4; break; + case BinaryConsts::I32x4AnyTrue: curr = allocator.alloc<Unary>(); curr->op = AnyTrueVecI32x4; break; + case BinaryConsts::I32x4AllTrue: curr = allocator.alloc<Unary>(); curr->op = AllTrueVecI32x4; break; + case BinaryConsts::I64x2Neg: curr = allocator.alloc<Unary>(); curr->op = NegVecI64x2; break; + case BinaryConsts::I64x2AnyTrue: curr = allocator.alloc<Unary>(); curr->op = AnyTrueVecI64x2; break; + case BinaryConsts::I64x2AllTrue: curr = allocator.alloc<Unary>(); curr->op = AllTrueVecI64x2; break; + case BinaryConsts::F32x4Abs: curr = allocator.alloc<Unary>(); curr->op = AbsVecF32x4; break; + case BinaryConsts::F32x4Neg: curr = allocator.alloc<Unary>(); curr->op = NegVecF32x4; break; + case BinaryConsts::F32x4Sqrt: curr = allocator.alloc<Unary>(); curr->op = SqrtVecF32x4; break; + case BinaryConsts::F64x2Abs: curr = allocator.alloc<Unary>(); curr->op = AbsVecF64x2; break; + case BinaryConsts::F64x2Neg: curr = allocator.alloc<Unary>(); curr->op = NegVecF64x2; break; + case BinaryConsts::F64x2Sqrt: curr = allocator.alloc<Unary>(); 
curr->op = SqrtVecF64x2; break; + case BinaryConsts::I32x4TruncSatSF32x4: curr = allocator.alloc<Unary>(); curr->op = TruncSatSVecF32x4ToVecI32x4; break; + case BinaryConsts::I32x4TruncSatUF32x4: curr = allocator.alloc<Unary>(); curr->op = TruncSatUVecF32x4ToVecI32x4; break; + case BinaryConsts::I64x2TruncSatSF64x2: curr = allocator.alloc<Unary>(); curr->op = TruncSatSVecF64x2ToVecI64x2; break; + case BinaryConsts::I64x2TruncSatUF64x2: curr = allocator.alloc<Unary>(); curr->op = TruncSatUVecF64x2ToVecI64x2; break; + case BinaryConsts::F32x4ConvertSI32x4: curr = allocator.alloc<Unary>(); curr->op = ConvertSVecI32x4ToVecF32x4; break; + case BinaryConsts::F32x4ConvertUI32x4: curr = allocator.alloc<Unary>(); curr->op = ConvertUVecI32x4ToVecF32x4; break; + case BinaryConsts::F64x2ConvertSI64x2: curr = allocator.alloc<Unary>(); curr->op = ConvertSVecI64x2ToVecF64x2; break; + case BinaryConsts::F64x2ConvertUI64x2: curr = allocator.alloc<Unary>(); curr->op = ConvertUVecI64x2ToVecF64x2; break; + default: return false; + } + curr->value = popNonVoidExpression(); + curr->finalize(); + out = curr; + return true; +} + +bool WasmBinaryBuilder::maybeVisitSIMDConst(Expression*& out, uint32_t code) { + if (code != BinaryConsts::V128Const) { + return false; + } + auto* curr = allocator.alloc<Const>(); + curr->value = getVec128Literal(); + curr->finalize(); + out = curr; + return true; +} + +bool WasmBinaryBuilder::maybeVisitSIMDLoad(Expression*& out, uint32_t code) { + if (code != BinaryConsts::V128Load) { + return false; + } + auto* curr = allocator.alloc<Load>(); + curr->type = v128; + curr->bytes = 16; + readMemoryAccess(curr->align, curr->offset); + curr->isAtomic = false; + curr->ptr = popNonVoidExpression(); + curr->finalize(); + out = curr; + return true; +} + +bool WasmBinaryBuilder::maybeVisitSIMDStore(Expression*& out, uint32_t code) { + if (code != BinaryConsts::V128Store) { + return false; + } + auto* curr = allocator.alloc<Store>(); + curr->bytes = 16; + curr->valueType 
= v128; + readMemoryAccess(curr->align, curr->offset); + curr->isAtomic = false; + curr->value = popNonVoidExpression(); + curr->ptr = popNonVoidExpression(); + curr->finalize(); + out = curr; + return true; +} + +bool WasmBinaryBuilder::maybeVisitSIMDExtract(Expression*& out, uint32_t code) { + SIMDExtract* curr; + switch (code) { + case BinaryConsts::I8x16ExtractLaneS: curr = allocator.alloc<SIMDExtract>(); curr->op = ExtractLaneSVecI8x16; curr->index = getLaneIndex(16); break; + case BinaryConsts::I8x16ExtractLaneU: curr = allocator.alloc<SIMDExtract>(); curr->op = ExtractLaneUVecI8x16; curr->index = getLaneIndex(16); break; + case BinaryConsts::I16x8ExtractLaneS: curr = allocator.alloc<SIMDExtract>(); curr->op = ExtractLaneSVecI16x8; curr->index = getLaneIndex(8); break; + case BinaryConsts::I16x8ExtractLaneU: curr = allocator.alloc<SIMDExtract>(); curr->op = ExtractLaneUVecI16x8; curr->index = getLaneIndex(8); break; + case BinaryConsts::I32x4ExtractLane: curr = allocator.alloc<SIMDExtract>(); curr->op = ExtractLaneVecI32x4; curr->index = getLaneIndex(4); break; + case BinaryConsts::I64x2ExtractLane: curr = allocator.alloc<SIMDExtract>(); curr->op = ExtractLaneVecI64x2; curr->index = getLaneIndex(2); break; + case BinaryConsts::F32x4ExtractLane: curr = allocator.alloc<SIMDExtract>(); curr->op = ExtractLaneVecF32x4; curr->index = getLaneIndex(4); break; + case BinaryConsts::F64x2ExtractLane: curr = allocator.alloc<SIMDExtract>(); curr->op = ExtractLaneVecF64x2; curr->index = getLaneIndex(2); break; + default: return false; + } + curr->vec = popNonVoidExpression(); + curr->finalize(); + out = curr; + return true; +} + +bool WasmBinaryBuilder::maybeVisitSIMDReplace(Expression*& out, uint32_t code) { + SIMDReplace* curr; + switch (code) { + case BinaryConsts::I8x16ReplaceLane: curr = allocator.alloc<SIMDReplace>(); curr->op = ReplaceLaneVecI8x16; curr->index = getLaneIndex(16); break; + case BinaryConsts::I16x8ReplaceLane: curr = allocator.alloc<SIMDReplace>(); 
curr->op = ReplaceLaneVecI16x8; curr->index = getLaneIndex(8); break; + case BinaryConsts::I32x4ReplaceLane: curr = allocator.alloc<SIMDReplace>(); curr->op = ReplaceLaneVecI32x4; curr->index = getLaneIndex(4); break; + case BinaryConsts::I64x2ReplaceLane: curr = allocator.alloc<SIMDReplace>(); curr->op = ReplaceLaneVecI64x2; curr->index = getLaneIndex(2); break; + case BinaryConsts::F32x4ReplaceLane: curr = allocator.alloc<SIMDReplace>(); curr->op = ReplaceLaneVecF32x4; curr->index = getLaneIndex(4); break; + case BinaryConsts::F64x2ReplaceLane: curr = allocator.alloc<SIMDReplace>(); curr->op = ReplaceLaneVecF64x2; curr->index = getLaneIndex(2); break; + default: return false; + } + curr->value = popNonVoidExpression(); + curr->vec = popNonVoidExpression(); + curr->finalize(); + out = curr; + return true; +} + +bool WasmBinaryBuilder::maybeVisitSIMDShuffle(Expression*& out, uint32_t code) { + if (code != BinaryConsts::V8x16Shuffle) { + return false; + } + auto* curr = allocator.alloc<SIMDShuffle>(); + for (auto i = 0; i < 16; ++i) { + curr->mask[i] = getLaneIndex(32); + } + curr->right = popNonVoidExpression(); + curr->left = popNonVoidExpression(); + curr->finalize(); + out = curr; + return true; +} + +bool WasmBinaryBuilder::maybeVisitSIMDBitselect(Expression*& out, uint32_t code) { + if (code != BinaryConsts::V128Bitselect) { + return false; + } + auto* curr = allocator.alloc<SIMDBitselect>(); + curr->cond = popNonVoidExpression(); + curr->right = popNonVoidExpression(); + curr->left = popNonVoidExpression(); + curr->finalize(); + out = curr; + return true; +} + +bool WasmBinaryBuilder::maybeVisitSIMDShift(Expression*& out, uint32_t code) { + SIMDShift* curr; + switch (code) { + case BinaryConsts::I8x16Shl: curr = allocator.alloc<SIMDShift>(); curr->op = ShlVecI8x16; break; + case BinaryConsts::I8x16ShrS: curr = allocator.alloc<SIMDShift>(); curr->op = ShrSVecI8x16; break; + case BinaryConsts::I8x16ShrU: curr = allocator.alloc<SIMDShift>(); curr->op = 
ShrUVecI8x16; break; + case BinaryConsts::I16x8Shl: curr = allocator.alloc<SIMDShift>(); curr->op = ShlVecI16x8; break; + case BinaryConsts::I16x8ShrS: curr = allocator.alloc<SIMDShift>(); curr->op = ShrSVecI16x8; break; + case BinaryConsts::I16x8ShrU: curr = allocator.alloc<SIMDShift>(); curr->op = ShrUVecI16x8; break; + case BinaryConsts::I32x4Shl: curr = allocator.alloc<SIMDShift>(); curr->op = ShlVecI32x4; break; + case BinaryConsts::I32x4ShrS: curr = allocator.alloc<SIMDShift>(); curr->op = ShrSVecI32x4; break; + case BinaryConsts::I32x4ShrU: curr = allocator.alloc<SIMDShift>(); curr->op = ShrUVecI32x4; break; + case BinaryConsts::I64x2Shl: curr = allocator.alloc<SIMDShift>(); curr->op = ShlVecI64x2; break; + case BinaryConsts::I64x2ShrS: curr = allocator.alloc<SIMDShift>(); curr->op = ShrSVecI64x2; break; + case BinaryConsts::I64x2ShrU: curr = allocator.alloc<SIMDShift>(); curr->op = ShrUVecI64x2; break; + default: return false; + } + curr->shift = popNonVoidExpression(); + curr->vec = popNonVoidExpression(); + curr->finalize(); + out = curr; + return true; +} + void WasmBinaryBuilder::visitSelect(Select* curr) { if (debug) std::cerr << "zz node: Select" << std::endl; curr->condition = popNonVoidExpression(); diff --git a/src/wasm/wasm-emscripten.cpp b/src/wasm/wasm-emscripten.cpp index 3c78ce866..b18fe0c76 100644 --- a/src/wasm/wasm-emscripten.cpp +++ b/src/wasm/wasm-emscripten.cpp @@ -247,7 +247,7 @@ struct RemoveStackPointer : public PostWalker<RemoveStackPointer> { void visitGetGlobal(GetGlobal* curr) { if (getModule()->getGlobalOrNull(curr->name) == stackPointer) { - ensureFunctionImport(getModule(), STACK_SAVE, "i"); + needStackSave = true; if (!builder) builder = make_unique<Builder>(*getModule()); replaceCurrent(builder->makeCall(STACK_SAVE, {}, i32)); } @@ -255,12 +255,15 @@ struct RemoveStackPointer : public PostWalker<RemoveStackPointer> { void visitSetGlobal(SetGlobal* curr) { if (getModule()->getGlobalOrNull(curr->name) == stackPointer) { - 
ensureFunctionImport(getModule(), STACK_RESTORE, "vi"); + needStackRestore = true; if (!builder) builder = make_unique<Builder>(*getModule()); replaceCurrent(builder->makeCall(STACK_RESTORE, {curr->value}, none)); } } + bool needStackSave = false; + bool needStackRestore = false; + private: std::unique_ptr<Builder> builder; Global* stackPointer; @@ -272,6 +275,12 @@ void EmscriptenGlueGenerator::replaceStackPointerGlobal() { // Replace all uses of stack pointer global RemoveStackPointer walker(stackPointer); walker.walkModule(&wasm); + if (walker.needStackSave) { + ensureFunctionImport(&wasm, STACK_SAVE, "i"); + } + if (walker.needStackRestore) { + ensureFunctionImport(&wasm, STACK_RESTORE, "vi"); + } // Finally remove the stack pointer global itself. This avoids importing // a mutable global. @@ -331,6 +340,7 @@ void EmscriptenGlueGenerator::generateJSCallThunks( JSCallWalker walker = getJSCallWalker(wasm); auto& tableSegmentData = wasm.table.segments[0].data; + unsigned numEntriesAdded = 0; for (std::string sig : walker.indirectlyCallableSigs) { // Add imports for jsCall_sig (e.g. jsCall_vi). 
// Imported jsCall_sig functions have their first parameter as an index to @@ -371,11 +381,13 @@ void EmscriptenGlueGenerator::generateJSCallThunks( f->body = call; wasm.addFunction(f); tableSegmentData.push_back(f->name); + numEntriesAdded++; } } - wasm.table.initial = wasm.table.max = - wasm.table.segments[0].offset->cast<Const>()->value.getInteger() + - tableSegmentData.size(); + wasm.table.initial.addr += numEntriesAdded; + if (wasm.table.max != Table::kUnlimitedSize) { + wasm.table.max.addr += numEntriesAdded; + } } std::vector<Address> getSegmentOffsets(Module& wasm) { @@ -800,6 +812,7 @@ std::string EmscriptenGlueGenerator::generateEmscriptenMetadata( } meta << " \"staticBump\": " << staticBump << ",\n"; + meta << " \"tableSize\": " << wasm.table.initial.addr << ",\n"; if (!initializerFunctions.empty()) { meta << " \"initializers\": ["; diff --git a/src/wasm/wasm-s-parser.cpp b/src/wasm/wasm-s-parser.cpp index de1e4f2e9..0dfea962b 100644 --- a/src/wasm/wasm-s-parser.cpp +++ b/src/wasm/wasm-s-parser.cpp @@ -632,6 +632,11 @@ Type SExpressionWasmBuilder::stringToType(const char* str, bool allowError, bool if (str[1] == '3' && str[2] == '2' && (prefix || str[3] == 0)) return f32; if (str[1] == '6' && str[2] == '4' && (prefix || str[3] == 0)) return f64; } + if (str[0] == 'v') { + if (str[1] == '1' && str[2] == '2' && str[3] == '8' && (prefix || str[4] == 0)) { + return v128; + } + } if (allowError) return none; throw ParseException("invalid wasm type"); } @@ -764,7 +769,7 @@ Expression* SExpressionWasmBuilder::makeGetGlobal(Element& s) { ret->name = getGlobalName(*s[1]); auto* global = wasm.getGlobalOrNull(ret->name); if (!global) { - throw ParseException("bad get_global name", s.line, s.col); + throw ParseException("bad global.get name", s.line, s.col); } ret->type = global->type; return ret; @@ -773,7 +778,7 @@ Expression* SExpressionWasmBuilder::makeGetGlobal(Element& s) { Expression* SExpressionWasmBuilder::makeSetGlobal(Element& s) { auto ret = 
allocator.alloc<SetGlobal>(); ret->name = getGlobalName(*s[1]); - if (wasm.getGlobalOrNull(ret->name) && !wasm.getGlobalOrNull(ret->name)->mutable_) throw ParseException("set_global of immutable", s.line, s.col); + if (wasm.getGlobalOrNull(ret->name) && !wasm.getGlobalOrNull(ret->name)->mutable_) throw ParseException("global.set of immutable", s.line, s.col); ret->value = parseExpression(s[2]); ret->finalize(); return ret; @@ -859,8 +864,69 @@ Expression* SExpressionWasmBuilder::makeThenOrElse(Element& s) { } Expression* SExpressionWasmBuilder::makeConst(Element& s, Type type) { - auto ret = parseConst(s[1]->str(), type, allocator); - if (!ret) throw ParseException("bad const"); + if (type != v128) { + auto ret = parseConst(s[1]->str(), type, allocator); + if (!ret) throw ParseException("bad const"); + return ret; + } + + auto ret = allocator.alloc<Const>(); + auto getLiteral = [](Expression* expr) { + if (expr == nullptr) { + throw ParseException("Could not parse v128 lane"); + } + return expr->cast<Const>()->value; + }; + Type lane_t = stringToType(s[1]->str()); + size_t lanes = s.size() - 2; + switch (lanes) { + case 2: { + if (lane_t != i64 && lane_t != f64) { + throw ParseException("Unexpected v128 literal lane type"); + } + std::array<Literal, 2> lanes; + for (size_t i = 0; i < 2; ++i) { + lanes[i] = getLiteral(parseConst(s[i+2]->str(), lane_t, allocator)); + } + ret->value = Literal(lanes); + break; + } + case 4: { + if (lane_t != i32 && lane_t != f32) { + throw ParseException("Unexpected v128 literal lane type"); + } + std::array<Literal, 4> lanes; + for (size_t i = 0; i < 4; ++i) { + lanes[i] = getLiteral(parseConst(s[i+2]->str(), lane_t, allocator)); + } + ret->value = Literal(lanes); + break; + } + case 8: { + if (lane_t != i32) { + throw ParseException("Unexpected v128 literal lane type"); + } + std::array<Literal, 8> lanes; + for (size_t i = 0; i < 8; ++i) { + lanes[i] = getLiteral(parseConst(s[i+2]->str(), lane_t, allocator)); + } + ret->value = 
Literal(lanes); + break; + } + case 16: { + if (lane_t != i32) { + throw ParseException("Unexpected v128 literal lane type"); + } + std::array<Literal, 16> lanes; + for (size_t i = 0; i < 16; ++i) { + lanes[i] = getLiteral(parseConst(s[i+2]->str(), lane_t, allocator)); + } + ret->value = Literal(lanes); + break; + } + default: throw ParseException("Unexpected number of lanes in v128 literal"); + } + ret->finalize(); return ret; } @@ -1011,6 +1077,63 @@ Expression* SExpressionWasmBuilder::makeAtomicWake(Element& s) { return ret; } +static uint8_t parseLaneIndex(const Element* s, size_t lanes) { + const char *str = s->c_str(); + char *end; + auto n = static_cast<unsigned long long>(strtoll(str, &end, 10)); + if (end == str || *end != '\0') throw ParseException("Expected lane index"); + if (n > lanes) throw ParseException("lane index must be less than " + std::to_string(lanes)); + return uint8_t(n); +} + +Expression* SExpressionWasmBuilder::makeSIMDExtract(Element& s, SIMDExtractOp op, size_t lanes) { + auto ret = allocator.alloc<SIMDExtract>(); + ret->op = op; + ret->index = parseLaneIndex(s[1], lanes); + ret->vec = parseExpression(s[2]); + ret->finalize(); + return ret; +} + +Expression* SExpressionWasmBuilder::makeSIMDReplace(Element& s, SIMDReplaceOp op, size_t lanes) { + auto ret = allocator.alloc<SIMDReplace>(); + ret->op = op; + ret->index = parseLaneIndex(s[1], lanes); + ret->vec = parseExpression(s[2]); + ret->value = parseExpression(s[3]); + ret->finalize(); + return ret; +} + +Expression* SExpressionWasmBuilder::makeSIMDShuffle(Element& s) { + auto ret = allocator.alloc<SIMDShuffle>(); + for (size_t i = 0; i < 16; ++i) { + ret->mask[i] = parseLaneIndex(s[i+1], 32); + } + ret->left = parseExpression(s[17]); + ret->right = parseExpression(s[18]); + ret->finalize(); + return ret; +} + +Expression* SExpressionWasmBuilder::makeSIMDBitselect(Element& s) { + auto ret = allocator.alloc<SIMDBitselect>(); + ret->left = parseExpression(s[1]); + ret->right = 
parseExpression(s[2]); + ret->cond = parseExpression(s[3]); + ret->finalize(); + return ret; +} + +Expression* SExpressionWasmBuilder::makeSIMDShift(Element& s, SIMDShiftOp op) { + auto ret = allocator.alloc<SIMDShift>(); + ret->op = op; + ret->vec = parseExpression(s[1]); + ret->shift = parseExpression(s[2]); + ret->finalize(); + return ret; +} + Expression* SExpressionWasmBuilder::makeIf(Element& s) { auto ret = allocator.alloc<If>(); Index i = 1; @@ -1628,7 +1751,7 @@ void SExpressionWasmBuilder::parseTable(Element& s, bool preParseImport) { } if (i == s.size()) return; if (!s[i]->dollared()) { - if (s[i]->str() == ANYFUNC) { + if (s[i]->str() == FUNCREF) { // (table type (elem ..)) parseInnerElem(*s[i + 1]); if (wasm.table.segments.size() > 0) { @@ -1638,8 +1761,8 @@ void SExpressionWasmBuilder::parseTable(Element& s, bool preParseImport) { } return; } - // first element isn't dollared, and isn't anyfunc. this could be old syntax for (table 0 1) which means function 0 and 1, or it could be (table initial max? type), look for type - if (s[s.size() - 1]->str() == ANYFUNC) { + // first element isn't dollared, and isn't funcref. this could be old syntax for (table 0 1) which means function 0 and 1, or it could be (table initial max? type), look for type + if (s[s.size() - 1]->str() == FUNCREF) { // (table initial max? 
type) if (i < s.size() - 1) { wasm.table.initial = atoi(s[i++]->c_str()); diff --git a/src/wasm/wasm-type.cpp b/src/wasm/wasm-type.cpp index b7ed12947..f9371ffab 100644 --- a/src/wasm/wasm-type.cpp +++ b/src/wasm/wasm-type.cpp @@ -36,12 +36,12 @@ const char* printType(Type type) { unsigned getTypeSize(Type type) { switch (type) { - case Type::none: abort(); case Type::i32: return 4; case Type::i64: return 8; case Type::f32: return 4; case Type::f64: return 8; case Type::v128: return 16; + case Type::none: case Type::unreachable: WASM_UNREACHABLE(); } WASM_UNREACHABLE(); diff --git a/src/wasm/wasm-validator.cpp b/src/wasm/wasm-validator.cpp index 3f65c9f7a..0b07be802 100644 --- a/src/wasm/wasm-validator.cpp +++ b/src/wasm/wasm-validator.cpp @@ -24,10 +24,10 @@ #include "wasm-validator.h" #include "ir/utils.h" #include "ir/branch-utils.h" +#include "ir/features.h" #include "ir/module-utils.h" #include "support/colors.h" - namespace wasm { // Print anything that can be streamed to an ostream @@ -245,6 +245,11 @@ public: void visitAtomicCmpxchg(AtomicCmpxchg* curr); void visitAtomicWait(AtomicWait* curr); void visitAtomicWake(AtomicWake* curr); + void visitSIMDExtract(SIMDExtract* curr); + void visitSIMDReplace(SIMDReplace* curr); + void visitSIMDShuffle(SIMDShuffle* curr); + void visitSIMDBitselect(SIMDBitselect* curr); + void visitSIMDShift(SIMDShift* curr); void visitBinary(Binary* curr); void visitUnary(Unary* curr); void visitSelect(Select* curr); @@ -467,37 +472,41 @@ void FunctionValidator::visitCallIndirect(CallIndirect* curr) { } void FunctionValidator::visitGetLocal(GetLocal* curr) { - shouldBeTrue(curr->index < getFunction()->getNumLocals(), curr, "get_local index must be small enough"); - shouldBeTrue(isConcreteType(curr->type), curr, "get_local must have a valid type - check what you provided when you constructed the node"); - shouldBeTrue(curr->type == getFunction()->getLocalType(curr->index), curr, "get_local must have proper type"); + 
shouldBeTrue(curr->index < getFunction()->getNumLocals(), curr, "local.get index must be small enough"); + shouldBeTrue(isConcreteType(curr->type), curr, "local.get must have a valid type - check what you provided when you constructed the node"); + shouldBeTrue(curr->type == getFunction()->getLocalType(curr->index), curr, "local.get must have proper type"); } void FunctionValidator::visitSetLocal(SetLocal* curr) { - shouldBeTrue(curr->index < getFunction()->getNumLocals(), curr, "set_local index must be small enough"); + shouldBeTrue(curr->index < getFunction()->getNumLocals(), curr, "local.set index must be small enough"); if (curr->value->type != unreachable) { if (curr->type != none) { // tee is ok anyhow - shouldBeEqualOrFirstIsUnreachable(curr->value->type, curr->type, curr, "set_local type must be correct"); + shouldBeEqualOrFirstIsUnreachable(curr->value->type, curr->type, curr, "local.set type must be correct"); } - shouldBeEqual(getFunction()->getLocalType(curr->index), curr->value->type, curr, "set_local type must match function"); + shouldBeEqual(getFunction()->getLocalType(curr->index), curr->value->type, curr, "local.set type must match function"); } } void FunctionValidator::visitGetGlobal(GetGlobal* curr) { if (!info.validateGlobally) return; - shouldBeTrue(getModule()->getGlobalOrNull(curr->name), curr, "get_global name must be valid"); + shouldBeTrue(getModule()->getGlobalOrNull(curr->name), curr, "global.get name must be valid"); } void FunctionValidator::visitSetGlobal(SetGlobal* curr) { if (!info.validateGlobally) return; auto* global = getModule()->getGlobalOrNull(curr->name); - if (shouldBeTrue(global, curr, "set_global name must be valid (and not an import; imports can't be modified)")) { - shouldBeTrue(global->mutable_, curr, "set_global global must be mutable"); - shouldBeEqualOrFirstIsUnreachable(curr->value->type, global->type, curr, "set_global value must have right type"); + if (shouldBeTrue(global, curr, "global.set name must be valid 
(and not an import; imports can't be modified)")) { + shouldBeTrue(global->mutable_, curr, "global.set global must be mutable"); + shouldBeEqualOrFirstIsUnreachable(curr->value->type, global->type, curr, "global.set value must have right type"); } } void FunctionValidator::visitLoad(Load* curr) { - if (curr->isAtomic) shouldBeTrue(info.features.hasAtomics(), curr, "Atomic operation (atomics are disabled)"); + if (curr->isAtomic) { + shouldBeTrue(info.features.hasAtomics(), curr, "Atomic operation (atomics are disabled)"); + shouldBeTrue(curr->type == i32 || curr->type == i64 || curr->type == unreachable, curr, "Atomic load should be i32 or i64"); + } + if (curr->type == v128) shouldBeTrue(info.features.hasSIMD(), curr, "SIMD operation (SIMD is disabled)"); shouldBeFalse(curr->isAtomic && !getModule()->memory.shared, curr, "Atomic operation with non-shared memory"); validateMemBytes(curr->bytes, curr->type, curr); validateAlignment(curr->align, curr->type, curr->bytes, curr->isAtomic, curr); @@ -509,10 +518,14 @@ void FunctionValidator::visitLoad(Load* curr) { } void FunctionValidator::visitStore(Store* curr) { - if (curr->isAtomic) shouldBeTrue(info.features.hasAtomics(), curr, "Atomic operation (atomics are disabled)"); + if (curr->isAtomic) { + shouldBeTrue(info.features.hasAtomics(), curr, "Atomic operation (atomics are disabled)"); + shouldBeTrue(curr->valueType == i32 || curr->valueType == i64 || curr->valueType == unreachable, curr, "Atomic store should be i32 or i64"); + } + if (curr->valueType == v128) shouldBeTrue(info.features.hasSIMD(), curr, "SIMD operation (SIMD is disabled)"); shouldBeFalse(curr->isAtomic && !getModule()->memory.shared, curr, "Atomic operation with non-shared memory"); validateMemBytes(curr->bytes, curr->valueType, curr); - validateAlignment(curr->align, curr->type, curr->bytes, curr->isAtomic, curr); + validateAlignment(curr->align, curr->valueType, curr->bytes, curr->isAtomic, curr); 
shouldBeEqualOrFirstIsUnreachable(curr->ptr->type, i32, curr, "store pointer type must be i32"); shouldBeUnequal(curr->value->type, none, curr, "store value type must not be none"); shouldBeEqualOrFirstIsUnreachable(curr->value->type, curr->valueType, curr, "store value type must match"); @@ -561,20 +574,77 @@ void FunctionValidator::visitAtomicWake(AtomicWake* curr) { shouldBeEqualOrFirstIsUnreachable(curr->wakeCount->type, i32, curr, "AtomicWake wakeCount type must be i32"); } +void FunctionValidator::visitSIMDExtract(SIMDExtract* curr) { + shouldBeTrue(info.features.hasSIMD(), curr, "SIMD operation (SIMD is disabled)"); + shouldBeEqualOrFirstIsUnreachable(curr->vec->type, v128, curr, "extract_lane must operate on a v128"); + Type lane_t = none; + size_t lanes = 0; + switch (curr->op) { + case ExtractLaneSVecI8x16: + case ExtractLaneUVecI8x16: lane_t = i32; lanes = 16; break; + case ExtractLaneSVecI16x8: + case ExtractLaneUVecI16x8: lane_t = i32; lanes = 8; break; + case ExtractLaneVecI32x4: lane_t = i32; lanes = 4; break; + case ExtractLaneVecI64x2: lane_t = i64; lanes = 2; break; + case ExtractLaneVecF32x4: lane_t = f32; lanes = 4; break; + case ExtractLaneVecF64x2: lane_t = f64; lanes = 2; break; + } + shouldBeEqualOrFirstIsUnreachable(curr->type, lane_t, curr, "extract_lane must have same type as vector lane"); + shouldBeTrue(curr->index < lanes, curr, "invalid lane index"); +} + +void FunctionValidator::visitSIMDReplace(SIMDReplace* curr) { + shouldBeTrue(info.features.hasSIMD(), curr, "SIMD operation (SIMD is disabled)"); + shouldBeEqualOrFirstIsUnreachable(curr->type, v128, curr, "replace_lane must have type v128"); + shouldBeEqualOrFirstIsUnreachable(curr->vec->type, v128, curr, "replace_lane must operate on a v128"); + Type lane_t = none; + size_t lanes = 0; + switch (curr->op) { + case ReplaceLaneVecI8x16: lane_t = i32; lanes = 16; break; + case ReplaceLaneVecI16x8: lane_t = i32; lanes = 8; break; + case ReplaceLaneVecI32x4: lane_t = i32; lanes = 4; 
break; + case ReplaceLaneVecI64x2: lane_t = i64; lanes = 2; break; + case ReplaceLaneVecF32x4: lane_t = f32; lanes = 4; break; + case ReplaceLaneVecF64x2: lane_t = f64; lanes = 2; break; + } + shouldBeEqualOrFirstIsUnreachable(curr->value->type, lane_t, curr, "unexpected value type"); + shouldBeTrue(curr->index < lanes, curr, "invalid lane index"); +} + +void FunctionValidator::visitSIMDShuffle(SIMDShuffle* curr) { + shouldBeTrue(info.features.hasSIMD(), curr, "SIMD operation (SIMD is disabled)"); + shouldBeEqualOrFirstIsUnreachable(curr->type, v128, curr, "v128.shuffle must have type v128"); + shouldBeEqualOrFirstIsUnreachable(curr->left->type, v128, curr, "expected operand of type v128"); + shouldBeEqualOrFirstIsUnreachable(curr->right->type, v128, curr, "expected operand of type v128"); + for (uint8_t index : curr->mask) { + shouldBeTrue(index < 32, curr, "Invalid lane index in mask"); + } +} + +void FunctionValidator::visitSIMDBitselect(SIMDBitselect* curr) { + shouldBeTrue(info.features.hasSIMD(), curr, "SIMD operation (SIMD is disabled)"); + shouldBeEqualOrFirstIsUnreachable(curr->type, v128, curr, "v128.bitselect must have type v128"); + shouldBeEqualOrFirstIsUnreachable(curr->left->type, v128, curr, "expected operand of type v128"); + shouldBeEqualOrFirstIsUnreachable(curr->right->type, v128, curr, "expected operand of type v128"); + shouldBeEqualOrFirstIsUnreachable(curr->cond->type, v128, curr, "expected operand of type v128"); +} + +void FunctionValidator::visitSIMDShift(SIMDShift* curr) { + shouldBeTrue(info.features.hasSIMD(), curr, "SIMD operation (SIMD is disabled)"); + shouldBeEqualOrFirstIsUnreachable(curr->type, v128, curr, "vector shift must have type v128"); + shouldBeEqualOrFirstIsUnreachable(curr->vec->type, v128, curr, "expected operand of type v128"); + shouldBeEqualOrFirstIsUnreachable(curr->shift->type, i32, curr, "expected shift amount to have type i32"); +} + void FunctionValidator::validateMemBytes(uint8_t bytes, Type type, Expression* 
curr) { - switch (bytes) { - case 1: - case 2: - case 4: break; - case 8: { - // if we have a concrete type for the load, then we know the size of the mem operation and - // can validate it - if (type != unreachable) { - shouldBeEqual(getTypeSize(type), 8U, curr, "8-byte mem operations are only allowed with 8-byte wasm types"); - } - break; - } - default: info.fail("Memory operations must be 1,2,4, or 8 bytes", curr, getFunction()); + switch (type) { + case i32: shouldBeTrue(bytes == 1 || bytes == 2 || bytes == 4, curr, "expected i32 operation to touch 1, 2, or 4 bytes"); break; + case i64: shouldBeTrue(bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8, curr, "expected i64 operation to touch 1, 2, 4, or 8 bytes"); break; + case f32: shouldBeEqual(bytes, uint8_t(4), curr, "expected f32 operation to touch 4 bytes"); break; + case f64: shouldBeEqual(bytes, uint8_t(8), curr, "expected f64 operation to touch 8 bytes"); break; + case v128: shouldBeEqual(bytes, uint8_t(16), curr, "expected v128 operation to touch 16 bytes"); break; + case none: WASM_UNREACHABLE(); + case unreachable: break; } } @@ -671,8 +741,89 @@ void FunctionValidator::visitBinary(Binary* curr) { shouldBeEqualOrFirstIsUnreachable(curr->left->type, f64, curr, "f64 op"); break; } + case EqVecI8x16: + case NeVecI8x16: + case LtSVecI8x16: + case LtUVecI8x16: + case LeSVecI8x16: + case LeUVecI8x16: + case GtSVecI8x16: + case GtUVecI8x16: + case GeSVecI8x16: + case GeUVecI8x16: + case EqVecI16x8: + case NeVecI16x8: + case LtSVecI16x8: + case LtUVecI16x8: + case LeSVecI16x8: + case LeUVecI16x8: + case GtSVecI16x8: + case GtUVecI16x8: + case GeSVecI16x8: + case GeUVecI16x8: + case EqVecI32x4: + case NeVecI32x4: + case LtSVecI32x4: + case LtUVecI32x4: + case LeSVecI32x4: + case LeUVecI32x4: + case GtSVecI32x4: + case GtUVecI32x4: + case GeSVecI32x4: + case GeUVecI32x4: + case EqVecF32x4: + case NeVecF32x4: + case LtVecF32x4: + case LeVecF32x4: + case GtVecF32x4: + case GeVecF32x4: + case EqVecF64x2: + case 
NeVecF64x2: + case LtVecF64x2: + case LeVecF64x2: + case GtVecF64x2: + case GeVecF64x2: + case AndVec128: + case OrVec128: + case XorVec128: + case AddVecI8x16: + case AddSatSVecI8x16: + case AddSatUVecI8x16: + case SubVecI8x16: + case SubSatSVecI8x16: + case SubSatUVecI8x16: + case MulVecI8x16: + case AddVecI16x8: + case AddSatSVecI16x8: + case AddSatUVecI16x8: + case SubVecI16x8: + case SubSatSVecI16x8: + case SubSatUVecI16x8: + case MulVecI16x8: + case AddVecI32x4: + case SubVecI32x4: + case MulVecI32x4: + case AddVecI64x2: + case SubVecI64x2: + case AddVecF32x4: + case SubVecF32x4: + case MulVecF32x4: + case DivVecF32x4: + case MinVecF32x4: + case MaxVecF32x4: + case AddVecF64x2: + case SubVecF64x2: + case MulVecF64x2: + case DivVecF64x2: + case MinVecF64x2: + case MaxVecF64x2: { + shouldBeEqualOrFirstIsUnreachable(curr->left->type, v128, curr, "v128 op"); + shouldBeEqualOrFirstIsUnreachable(curr->right->type, v128, curr, "v128 op"); + break; + } case InvalidBinary: WASM_UNREACHABLE(); } + shouldBeTrue(Features::get(curr->op) <= info.features, curr, "all used features should be allowed"); } void FunctionValidator::visitUnary(Unary* curr) { @@ -747,7 +898,6 @@ void FunctionValidator::visitUnary(Unary* curr) { case TruncSatSFloat32ToInt64: case TruncSatUFloat32ToInt32: case TruncSatUFloat32ToInt64: { - shouldBeTrue(info.features.hasTruncSat(), curr, "nontrapping float-to-int conversions are disabled"); shouldBeEqual(curr->value->type, f32, curr, "trunc type must be correct"); break; } @@ -762,7 +912,6 @@ void FunctionValidator::visitUnary(Unary* curr) { case TruncSatSFloat64ToInt64: case TruncSatUFloat64ToInt32: case TruncSatUFloat64ToInt64: { - shouldBeTrue(info.features.hasTruncSat(), curr, "nontrapping float-to-int conversions are disabled"); shouldBeEqual(curr->value->type, f64, curr, "trunc type must be correct"); break; } @@ -804,8 +953,60 @@ void FunctionValidator::visitUnary(Unary* curr) { shouldBeEqual(curr->value->type, i64, curr, "reinterpret/i64 type 
must be correct"); break; } + case SplatVecI8x16: + case SplatVecI16x8: + case SplatVecI32x4: + shouldBeEqual(curr->type, v128, curr, "expected splat to have v128 type"); + shouldBeEqual(curr->value->type, i32, curr, "expected i32 splat value"); + break; + case SplatVecI64x2: + shouldBeEqual(curr->type, v128, curr, "expected splat to have v128 type"); + shouldBeEqual(curr->value->type, i64, curr, "expected i64 splat value"); + break; + case SplatVecF32x4: + shouldBeEqual(curr->type, v128, curr, "expected splat to have v128 type"); + shouldBeEqual(curr->value->type, f32, curr, "expected f32 splat value"); + break; + case SplatVecF64x2: + shouldBeEqual(curr->type, v128, curr, "expected splat to have v128 type"); + shouldBeEqual(curr->value->type, f64, curr, "expected i64 splat value"); + break; + case NotVec128: + case NegVecI8x16: + case NegVecI16x8: + case NegVecI32x4: + case NegVecI64x2: + case AbsVecF32x4: + case NegVecF32x4: + case SqrtVecF32x4: + case AbsVecF64x2: + case NegVecF64x2: + case SqrtVecF64x2: + case TruncSatSVecF32x4ToVecI32x4: + case TruncSatUVecF32x4ToVecI32x4: + case TruncSatSVecF64x2ToVecI64x2: + case TruncSatUVecF64x2ToVecI64x2: + case ConvertSVecI32x4ToVecF32x4: + case ConvertUVecI32x4ToVecF32x4: + case ConvertSVecI64x2ToVecF64x2: + case ConvertUVecI64x2ToVecF64x2: + shouldBeEqual(curr->type, v128, curr, "expected v128 type"); + shouldBeEqual(curr->value->type, v128, curr, "expected v128 operand"); + break; + case AnyTrueVecI8x16: + case AllTrueVecI8x16: + case AnyTrueVecI16x8: + case AllTrueVecI16x8: + case AnyTrueVecI32x4: + case AllTrueVecI32x4: + case AnyTrueVecI64x2: + case AllTrueVecI64x2: + shouldBeEqual(curr->type, i32, curr, "expected boolean reduction to have i32 type"); + shouldBeEqual(curr->value->type, v128, curr, "expected v128 operand"); + break; case InvalidUnary: WASM_UNREACHABLE(); } + shouldBeTrue(Features::get(curr->op) <= info.features, curr, "all used features should be allowed"); } void 
FunctionValidator::visitSelect(Select* curr) { @@ -868,6 +1069,9 @@ void FunctionValidator::visitFunction(Function* curr) { shouldBeTrue(ft->params == curr->params, curr->name, "function params must match its declared type"); shouldBeTrue(ft->result == curr->result, curr->name, "function result must match its declared type"); } + if (curr->imported()) { + shouldBeTrue(curr->type.is(), curr->name, "imported functions must have a function type"); + } } static bool checkOffset(Expression* curr, Address add, Address max) { @@ -895,7 +1099,8 @@ void FunctionValidator::validateAlignment(size_t align, Type type, Index bytes, case 1: case 2: case 4: - case 8: break; + case 8: + case 16: break; default:{ info.fail("bad alignment: " + std::to_string(align), curr, getFunction()); break; @@ -913,9 +1118,9 @@ void FunctionValidator::validateAlignment(size_t align, Type type, Index bytes, shouldBeTrue(align <= 8, curr, "alignment must not exceed natural"); break; } - case v128: assert(false && "v128 not implemented yet"); - case none: - case unreachable: {} + case v128: + case unreachable: break; + case none: WASM_UNREACHABLE(); } } diff --git a/src/wasm/wasm.cpp b/src/wasm/wasm.cpp index fe7927870..cfee4f3c4 100644 --- a/src/wasm/wasm.cpp +++ b/src/wasm/wasm.cpp @@ -66,7 +66,7 @@ Name GROW_WASM_MEMORY("__growWasmMemory"), NEG_NAN("-nan"), CASE("case"), BR("br"), - ANYFUNC("anyfunc"), + FUNCREF("funcref"), FAKE_RETURN("fake_return_waka123"), MUT("mut"), SPECTEST("spectest"), @@ -85,10 +85,10 @@ const char* getExpressionName(Expression* curr) { case Expression::Id::SwitchId: return "switch"; case Expression::Id::CallId: return "call"; case Expression::Id::CallIndirectId: return "call_indirect"; - case Expression::Id::GetLocalId: return "get_local"; - case Expression::Id::SetLocalId: return "set_local"; - case Expression::Id::GetGlobalId: return "get_global"; - case Expression::Id::SetGlobalId: return "set_global"; + case Expression::Id::GetLocalId: return "local.get"; + case 
Expression::Id::SetLocalId: return "local.set"; + case Expression::Id::GetGlobalId: return "global.get"; + case Expression::Id::SetGlobalId: return "global.set"; case Expression::Id::LoadId: return "load"; case Expression::Id::StoreId: return "store"; case Expression::Id::ConstId: return "const"; @@ -104,6 +104,11 @@ const char* getExpressionName(Expression* curr) { case Expression::Id::AtomicRMWId: return "atomic_rmw"; case Expression::Id::AtomicWaitId: return "atomic_wait"; case Expression::Id::AtomicWakeId: return "atomic_wake"; + case Expression::Id::SIMDExtractId: return "simd_extract"; + case Expression::Id::SIMDReplaceId: return "simd_replace"; + case Expression::Id::SIMDShuffleId: return "simd_shuffle"; + case Expression::Id::SIMDBitselectId: return "simd_bitselect"; + case Expression::Id::SIMDShiftId: return "simd_shift"; case Expression::Id::NumExpressionIds: WASM_UNREACHABLE(); } WASM_UNREACHABLE(); @@ -416,6 +421,56 @@ void AtomicWake::finalize() { } } +void SIMDExtract::finalize() { + assert(vec); + switch (op) { + case ExtractLaneSVecI8x16: + case ExtractLaneUVecI8x16: + case ExtractLaneSVecI16x8: + case ExtractLaneUVecI16x8: + case ExtractLaneVecI32x4: type = i32; break; + case ExtractLaneVecI64x2: type = i64; break; + case ExtractLaneVecF32x4: type = f32; break; + case ExtractLaneVecF64x2: type = f64; break; + default: WASM_UNREACHABLE(); + } + if (vec->type == unreachable) { + type = unreachable; + } +} + +void SIMDReplace::finalize() { + assert(vec && value); + type = v128; + if (vec->type == unreachable || value->type == unreachable) { + type = unreachable; + } +} + +void SIMDShuffle::finalize() { + assert(left && right); + type = v128; + if (left->type == unreachable || right->type == unreachable) { + type = unreachable; + } +} + +void SIMDBitselect::finalize() { + assert(left && right && cond); + type = v128; + if (left->type == unreachable || right->type == unreachable || cond->type == unreachable) { + type = unreachable; + } +} + +void 
SIMDShift::finalize() { + assert(vec && shift); + type = v128; + if (vec->type == unreachable || shift->type == unreachable) { + type = unreachable; + } +} + Const* Const::set(Literal value_) { value = value_; type = value.type; @@ -491,6 +546,39 @@ void Unary::finalize() { case ConvertUInt32ToFloat64: case ConvertSInt64ToFloat64: case ConvertUInt64ToFloat64: type = f64; break; + case SplatVecI8x16: + case SplatVecI16x8: + case SplatVecI32x4: + case SplatVecI64x2: + case SplatVecF32x4: + case SplatVecF64x2: + case NotVec128: + case NegVecI8x16: + case NegVecI16x8: + case NegVecI32x4: + case NegVecI64x2: + case AbsVecF32x4: + case NegVecF32x4: + case SqrtVecF32x4: + case AbsVecF64x2: + case NegVecF64x2: + case SqrtVecF64x2: + case TruncSatSVecF32x4ToVecI32x4: + case TruncSatUVecF32x4ToVecI32x4: + case TruncSatSVecF64x2ToVecI64x2: + case TruncSatUVecF64x2ToVecI64x2: + case ConvertSVecI32x4ToVecF32x4: + case ConvertUVecI32x4ToVecF32x4: + case ConvertSVecI64x2ToVecF64x2: + case ConvertUVecI64x2ToVecF64x2: type = v128; break; + case AnyTrueVecI8x16: + case AllTrueVecI8x16: + case AnyTrueVecI16x8: + case AllTrueVecI16x8: + case AnyTrueVecI32x4: + case AllTrueVecI32x4: + case AnyTrueVecI64x2: + case AllTrueVecI64x2: type = i32; break; case InvalidUnary: WASM_UNREACHABLE(); } } |