-rw-r--r--   CHANGELOG.md                                         |  2
-rw-r--r--   src/pass.h                                           |  5
-rw-r--r--   src/passes/OptimizeInstructions.cpp                  | 19
-rw-r--r--   src/tools/optimization-options.h                     |  8
-rw-r--r--   src/wasm/literal.cpp                                 | 41
-rw-r--r--   test/passes/O_fast-math.txt                          | 21
-rw-r--r--   test/passes/O_fast-math.wast                         | 57
-rw-r--r--   test/passes/fuzz-exec_O.txt                          | 52
-rw-r--r--   test/passes/fuzz-exec_O.wast                         | 45
-rw-r--r--   test/passes/optimize-instructions_all-features.txt   | 54
-rw-r--r--   test/passes/optimize-instructions_all-features.wast  |  4
-rw-r--r--   test/spec/old_float_exprs.wast                       | 12
12 files changed, 241 insertions, 79 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 590d756fd..63d887b69 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,8 @@ full changeset diff at the end of each section.
 Current Trunk
 -------------
 
+- Add `--fast-math` mode. (#3155)
+
 v97
 ---
 
diff --git a/src/pass.h b/src/pass.h
index a3ee41d61..27e7ee37f 100644
--- a/src/pass.h
+++ b/src/pass.h
@@ -102,6 +102,11 @@ struct PassOptions {
   // many cases.
   bool lowMemoryUnused = false;
   enum { LowMemoryBound = 1024 };
+  // Whether to allow "loose" math semantics, ignoring corner cases with NaNs
+  // and assuming math follows the algebraic rules for associativity and so
+  // forth (which IEEE floats do not, strictly speaking). This is inspired by
+  // gcc/clang's -ffast-math flag.
+  bool fastMath = false;
   // Whether to try to preserve debug info through, which are special calls.
   bool debugInfo = false;
   // Arbitrary string arguments from the commandline, which we forward to
diff --git a/src/passes/OptimizeInstructions.cpp b/src/passes/OptimizeInstructions.cpp
index 36d92e81f..55af9a34d 100644
--- a/src/passes/OptimizeInstructions.cpp
+++ b/src/passes/OptimizeInstructions.cpp
@@ -161,7 +161,10 @@ struct OptimizeInstructions
 #endif
   }
 
+  bool fastMath;
+
   void doWalkFunction(Function* func) {
+    fastMath = getPassOptions().fastMath;
     // first, scan locals
     {
       LocalScanner scanner(localInfo, getPassOptions());
@@ -1414,14 +1417,15 @@ private:
     }
     {
       double value;
-      if (matches(curr, binary(Abstract::Sub, any(), fval(&value))) &&
+      if (fastMath &&
+          matches(curr, binary(Abstract::Sub, any(), fval(&value))) &&
           value == 0.0) {
         // x - (-0.0)   ==>   x + 0.0
         if (std::signbit(value)) {
           curr->op = Abstract::getBinary(type, Abstract::Add);
           right->value = right->value.neg();
           return curr;
-        } else {
+        } else if (fastMath) {
           // x - 0.0   ==>   x
           return curr->left;
         }
@@ -1430,19 +1434,18 @@ private:
     {
       // x + (-0.0)   ==>   x
       double value;
-      if (matches(curr, binary(Abstract::Add, any(), fval(&value))) &&
+      if (fastMath &&
+          matches(curr, binary(Abstract::Add, any(), fval(&value))) &&
           value == 0.0 && std::signbit(value)) {
         return curr->left;
       }
     }
-    // Note that this is correct even on floats with a NaN on the left,
-    // as a NaN would skip the computation and just return the NaN,
-    // and that is precisely what we do here. but, the same with -1
-    // (change to a negation) would be incorrect for that reason.
     if (matches(curr, binary(Abstract::Mul, any(&left), constant(1))) ||
         matches(curr, binary(Abstract::DivS, any(&left), constant(1))) ||
         matches(curr, binary(Abstract::DivU, any(&left), constant(1)))) {
-      return left;
+      if (curr->type.isInteger() || fastMath) {
+        return left;
+      }
    }
    return nullptr;
  }
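The gating above exists because wasm lets a floating-point arithmetic instruction return a NaN whose payload bits differ from its input's (in practice, engines set the quiet bit), so folding away an f32.sub/f32.mul/f32.div that might receive a NaN can change observable bits. Here is a minimal standalone C++ sketch of the effect, assuming typical IEEE hardware that quiets signaling NaNs; the constant mirrors the -nan:0x34546d used in the tests below, and none of this code is part of the patch:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      // -nan:0x34546d as raw f32 bits: sign bit, all-ones exponent,
      // payload 0x34546d (the quiet bit 0x400000 is clear: signaling).
      uint32_t inBits = 0x80000000u | 0x7f800000u | 0x34546d;
      float x;
      std::memcpy(&x, &inBits, sizeof x);

      float folded = x;          // what the fold "x - 0.0 ==> x" yields
      float computed = x - 0.0f; // what executing the f32.sub may yield

      uint32_t foldedBits, computedBits;
      std::memcpy(&foldedBits, &folded, sizeof foldedBits);
      std::memcpy(&computedBits, &computed, sizeof computedBits);
      // Typically prints ffb4546d vs fff4546d: performing the subtraction
      // sets the quiet bit, turning -nan:0x34546d into -nan:0x74546d.
      std::printf("folded=%08x computed=%08x\n", foldedBits, computedBits);
    }

Only under --fast-math may the optimizer assume the two results agree; integer x * 1 and x / 1 need no such assumption, hence the curr->type.isInteger() escape hatch.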
diff --git a/src/tools/optimization-options.h b/src/tools/optimization-options.h
index 72f478329..5b6a643e6 100644
--- a/src/tools/optimization-options.h
+++ b/src/tools/optimization-options.h
@@ -187,7 +187,13 @@ struct OptimizationOptions : public ToolOptions {
         Options::Arguments::Zero,
         [this](Options*, const std::string&) {
           passOptions.lowMemoryUnused = true;
-        });
+        })
+      .add(
+        "--fast-math",
+        "-ffm",
+        "Optimize floats without handling corner cases of NaNs and rounding",
+        Options::Arguments::Zero,
+        [this](Options*, const std::string&) { passOptions.fastMath = true; });
     // add passes in registry
     for (const auto& p : PassRegistry::get()->getRegisteredNames()) {
       (*this).add(
diff --git a/src/wasm/literal.cpp b/src/wasm/literal.cpp
index d309be308..54453b356 100644
--- a/src/wasm/literal.cpp
+++ b/src/wasm/literal.cpp
@@ -934,35 +934,10 @@ Literal Literal::mul(const Literal& other) const {
       return Literal(uint32_t(i32) * uint32_t(other.i32));
     case Type::i64:
       return Literal(uint64_t(i64) * uint64_t(other.i64));
-    case Type::f32: {
-      // Special-case multiplication by 1. nan * 1 can change nan bits per the
-      // wasm spec, but it is ok to just return that original nan, and we
-      // do that here so that we are consistent with the optimization of
-      // removing the * 1 and leaving just the nan. That is, if we just
-      // do a normal multiply and the CPU decides to change the bits, we'd
-      // give a different result on optimized code, which would look like
-      // it was a bad optimization. So out of all the valid results to
-      // return here, return the simplest one that is consistent with
-      // our optimization for the case of 1.
-      float lhs = getf32(), rhs = other.getf32();
-      if (rhs == 1) {
-        return Literal(lhs);
-      }
-      if (lhs == 1) {
-        return Literal(rhs);
-      }
-      return Literal(lhs * rhs);
-    }
-    case Type::f64: {
-      double lhs = getf64(), rhs = other.getf64();
-      if (rhs == 1) {
-        return Literal(lhs);
-      }
-      if (lhs == 1) {
-        return Literal(rhs);
-      }
-      return Literal(lhs * rhs);
-    }
+    case Type::f32:
+      return Literal(getf32() * other.getf32());
+    case Type::f64:
+      return Literal(getf64() * other.getf64());
     case Type::v128:
     case Type::funcref:
     case Type::externref:
@@ -1002,10 +977,6 @@ Literal Literal::div(const Literal& other) const {
         case FP_INFINITE: // fallthrough
         case FP_NORMAL:   // fallthrough
         case FP_SUBNORMAL:
-          // Special-case division by 1, similar to multiply from earlier.
-          if (rhs == 1) {
-            return Literal(lhs);
-          }
           return Literal(lhs / rhs);
         default:
           WASM_UNREACHABLE("invalid fp classification");
@@ -1034,10 +1005,6 @@ Literal Literal::div(const Literal& other) const {
        case FP_INFINITE: // fallthrough
        case FP_NORMAL:   // fallthrough
        case FP_SUBNORMAL:
-          // See above comment on f32.
-          if (rhs == 1) {
-            return Literal(lhs);
-          }
          return Literal(lhs / rhs);
        default:
          WASM_UNREACHABLE("invalid fp classification");
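With the flag wired into the shared options header above, tools built on OptimizationOptions (wasm-opt, for example) accept it on the command line. A typical invocation would look something like the following, where the file names are placeholders:

    wasm-opt -O --fast-math input.wasm -o output.wasm

The short form -ffm echoes gcc/clang's -ffast-math, which the pass.h comment cites as the inspiration.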
diff --git a/test/passes/O_fast-math.txt b/test/passes/O_fast-math.txt
new file mode 100644
index 000000000..1b454c68e
--- /dev/null
+++ b/test/passes/O_fast-math.txt
@@ -0,0 +1,21 @@
+(module
+ (type $none_=>_f32 (func (result f32)))
+ (export "div" (func $0))
+ (export "mul1" (func $1))
+ (export "mul2" (func $2))
+ (export "add1" (func $1))
+ (export "add2" (func $2))
+ (export "add3" (func $2))
+ (export "add4" (func $2))
+ (export "sub1" (func $1))
+ (export "sub2" (func $2))
+ (func $0 (; has Stack IR ;) (result f32)
+  (f32.const -nan:0x23017a)
+ )
+ (func $1 (; has Stack IR ;) (result f32)
+  (f32.const -nan:0x34546d)
+ )
+ (func $2 (; has Stack IR ;) (result f32)
+  (f32.const -nan:0x74546d)
+ )
+)
diff --git a/test/passes/O_fast-math.wast b/test/passes/O_fast-math.wast
new file mode 100644
index 000000000..2317f782d
--- /dev/null
+++ b/test/passes/O_fast-math.wast
@@ -0,0 +1,57 @@
+;; with fast-math we can optimize some of these patterns
+(module
+  (func "div" (result f32)
+    (f32.div
+      (f32.const -nan:0x23017a)
+      (f32.const 1)
+    )
+  )
+  (func "mul1" (result f32)
+    (f32.mul
+      (f32.const -nan:0x34546d)
+      (f32.const 1)
+    )
+  )
+  (func "mul2" (result f32)
+    (f32.mul
+      (f32.const 1)
+      (f32.const -nan:0x34546d)
+    )
+  )
+  (func "add1" (result f32)
+    (f32.add
+      (f32.const -nan:0x34546d)
+      (f32.const -0)
+    )
+  )
+  (func "add2" (result f32)
+    (f32.add
+      (f32.const -0)
+      (f32.const -nan:0x34546d)
+    )
+  )
+  (func "add3" (result f32)
+    (f32.add
+      (f32.const -nan:0x34546d)
+      (f32.const 0)
+    )
+  )
+  (func "add4" (result f32)
+    (f32.add
+      (f32.const 0)
+      (f32.const -nan:0x34546d)
+    )
+  )
+  (func "sub1" (result f32)
+    (f32.sub
+      (f32.const -nan:0x34546d)
+      (f32.const 0)
+    )
+  )
+  (func "sub2" (result f32)
+    (f32.sub
+      (f32.const -nan:0x34546d)
+      (f32.const -0)
+    )
+  )
+)
diff --git a/test/passes/fuzz-exec_O.txt b/test/passes/fuzz-exec_O.txt
index ef8e165bb..f17b04650 100644
--- a/test/passes/fuzz-exec_O.txt
+++ b/test/passes/fuzz-exec_O.txt
@@ -31,29 +31,65 @@
 [fuzz-exec] comparing func_0
 [fuzz-exec] comparing func_1
 [fuzz-exec] calling div
-[fuzz-exec] note result: div => -nan:0x23017a
+[fuzz-exec] note result: div => -nan:0x63017a
 [fuzz-exec] calling mul1
-[fuzz-exec] note result: mul1 => -nan:0x34546d
+[fuzz-exec] note result: mul1 => -nan:0x74546d
 [fuzz-exec] calling mul2
-[fuzz-exec] note result: mul2 => -nan:0x34546d
+[fuzz-exec] note result: mul2 => -nan:0x74546d
+[fuzz-exec] calling add1
+[fuzz-exec] note result: add1 => -nan:0x74546d
+[fuzz-exec] calling add2
+[fuzz-exec] note result: add2 => -nan:0x74546d
+[fuzz-exec] calling add3
+[fuzz-exec] note result: add3 => -nan:0x74546d
+[fuzz-exec] calling add4
+[fuzz-exec] note result: add4 => -nan:0x74546d
+[fuzz-exec] calling sub1
+[fuzz-exec] note result: sub1 => -nan:0x74546d
+[fuzz-exec] calling sub2
+[fuzz-exec] note result: sub2 => -nan:0x74546d
 (module
  (type $none_=>_f32 (func (result f32)))
  (export "div" (func $0))
  (export "mul1" (func $1))
  (export "mul2" (func $1))
+ (export "add1" (func $1))
+ (export "add2" (func $1))
+ (export "add3" (func $1))
+ (export "add4" (func $1))
+ (export "sub1" (func $1))
+ (export "sub2" (func $1))
  (func $0 (; has Stack IR ;) (result f32)
-  (f32.const -nan:0x23017a)
+  (f32.const -nan:0x63017a)
  )
  (func $1 (; has Stack IR ;) (result f32)
-  (f32.const -nan:0x34546d)
+  (f32.const -nan:0x74546d)
  )
 )
 [fuzz-exec] calling div
-[fuzz-exec] note result: div => -nan:0x23017a
+[fuzz-exec] note result: div => -nan:0x63017a
 [fuzz-exec] calling mul1
-[fuzz-exec] note result: mul1 => -nan:0x34546d
+[fuzz-exec] note result: mul1 => -nan:0x74546d
 [fuzz-exec] calling mul2
-[fuzz-exec] note result: mul2 => -nan:0x34546d
+[fuzz-exec] note result: mul2 => -nan:0x74546d
+[fuzz-exec] calling add1
+[fuzz-exec] note result: add1 => -nan:0x74546d
+[fuzz-exec] calling add2
+[fuzz-exec] note result: add2 => -nan:0x74546d
+[fuzz-exec] calling add3
+[fuzz-exec] note result: add3 => -nan:0x74546d
+[fuzz-exec] calling add4
+[fuzz-exec] note result: add4 => -nan:0x74546d
+[fuzz-exec] calling sub1
+[fuzz-exec] note result: sub1 => -nan:0x74546d
+[fuzz-exec] calling sub2
+[fuzz-exec] note result: sub2 => -nan:0x74546d
+[fuzz-exec] comparing add1
+[fuzz-exec] comparing add2
+[fuzz-exec] comparing add3
+[fuzz-exec] comparing add4
 [fuzz-exec] comparing div
 [fuzz-exec] comparing mul1
 [fuzz-exec] comparing mul2
+[fuzz-exec] comparing sub1
+[fuzz-exec] comparing sub2
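A note on the expected values above: the input NaNs are signaling (payloads 0x23017a and 0x34546d both have the f32 quiet bit 0x400000 clear). Now that the interpreter no longer special-cases *1 and /1, actually performing the arithmetic quiets them: 0x23017a | 0x400000 = 0x63017a, and 0x34546d | 0x400000 = 0x74546d. Both the old and new payloads are permitted by wasm's nondeterministic NaN semantics; what fuzz-exec checks is that optimized and unoptimized runs agree, and the matching "comparing" lines show that they now do.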
diff --git a/test/passes/fuzz-exec_O.wast b/test/passes/fuzz-exec_O.wast
index 5c739c548..b34dc2e8f 100644
--- a/test/passes/fuzz-exec_O.wast
+++ b/test/passes/fuzz-exec_O.wast
@@ -22,10 +22,10 @@
 )
 (module
   (func "div" (result f32)
-    (f32.div ;; div by 1 can be removed, leaving this nan
-      (f32.const -nan:0x23017a) ;; as it is. wasm semantics allow nan bits to
-      (f32.const 1) ;; change, but the interpreter should not do so,
-    ) ;; so that it does not fail on that opt.
+    (f32.div
+      (f32.const -nan:0x23017a)
+      (f32.const 1)
+    )
   )
   (func "mul1" (result f32)
     (f32.mul
@@ -39,5 +39,40 @@
       (f32.const -nan:0x34546d)
     )
   )
+  (func "add1" (result f32)
+    (f32.add
+      (f32.const -nan:0x34546d)
+      (f32.const -0)
+    )
+  )
+  (func "add2" (result f32)
+    (f32.add
+      (f32.const -0)
+      (f32.const -nan:0x34546d)
+    )
+  )
+  (func "add3" (result f32)
+    (f32.add
+      (f32.const -nan:0x34546d)
+      (f32.const 0)
+    )
+  )
+  (func "add4" (result f32)
+    (f32.add
+      (f32.const 0)
+      (f32.const -nan:0x34546d)
+    )
+  )
+  (func "sub1" (result f32)
+    (f32.sub
+      (f32.const -nan:0x34546d)
+      (f32.const 0)
+    )
+  )
+  (func "sub2" (result f32)
+    (f32.sub
+      (f32.const -nan:0x34546d)
+      (f32.const -0)
+    )
+  )
 )
-
diff --git a/test/passes/optimize-instructions_all-features.txt b/test/passes/optimize-instructions_all-features.txt
index 873d550d7..5babd75de 100644
--- a/test/passes/optimize-instructions_all-features.txt
+++ b/test/passes/optimize-instructions_all-features.txt
@@ -2886,10 +2886,16 @@
   (local.get $x64)
  )
  (drop
-  (local.get $y32)
+  (f32.mul
+   (local.get $y32)
+   (f32.const 1)
+  )
 )
 (drop
-  (local.get $y64)
+  (f64.mul
+   (local.get $y64)
+   (f64.const 1)
+  )
 )
 (drop
  (i32.const 0)
@@ -2922,10 +2928,16 @@
   (local.get $x64)
  )
  (drop
-  (local.get $y32)
+  (f32.div
+   (local.get $y32)
+   (f32.const 1)
+  )
 )
 (drop
-  (local.get $y64)
+  (f64.div
+   (local.get $y64)
+   (f64.const 1)
+  )
 )
 (drop
  (f32.div
@@ -3703,27 +3715,39 @@
 )
 (func $const-float-zero (param $fx f32) (param $fy f64)
  (drop
-  (local.get $fx)
+  (f32.sub
+   (local.get $fx)
+   (f32.const 0)
+  )
 )
 (drop
-  (local.get $fy)
+  (f64.sub
+   (local.get $fy)
+   (f64.const 0)
+  )
 )
 (drop
-  (local.get $fx)
+  (f32.add
+   (local.get $fx)
+   (f32.const -0)
+  )
 )
 (drop
-  (local.get $fy)
+  (f64.add
+   (local.get $fy)
+   (f64.const -0)
+  )
 )
 (drop
-  (f32.add
+  (f32.sub
    (local.get $fx)
-   (f32.const 0)
+   (f32.const -0)
  )
 )
 (drop
-  (f64.add
+  (f64.sub
    (local.get $fy)
-   (f64.const 0)
+   (f64.const -0)
  )
 )
 (drop
@@ -3750,6 +3774,12 @@
    (f64.const 0)
   )
  )
+ (drop
+  (f32.sub
+   (f32.const -nan:0x34546d)
+   (f32.const 0)
+  )
+ )
 )
 (func $rhs-is-neg-one (param $x i32) (param $y i64) (param $fx f32) (param $fy f64)
  (drop
diff --git a/test/passes/optimize-instructions_all-features.wast b/test/passes/optimize-instructions_all-features.wast
index 59e7e21d8..246fd41a2 100644
--- a/test/passes/optimize-instructions_all-features.wast
+++ b/test/passes/optimize-instructions_all-features.wast
@@ -4284,6 +4284,10 @@
       (local.get $fy) ;; skip
       (f64.const 0)
     ))
+    (drop (f32.sub
+      (f32.const -nan:0x34546d) ;; skip
+      (f32.const 0)
+    ))
   )
   (func $rhs-is-neg-one (param $x i32) (param $y i64) (param $fx f32) (param $fy f64)
     (drop (i32.sub
diff --git a/test/spec/old_float_exprs.wast b/test/spec/old_float_exprs.wast
index ca031114f..7900832b0 100644
--- a/test/spec/old_float_exprs.wast
+++ b/test/spec/old_float_exprs.wast
@@ -103,10 +103,8 @@
     (f64.mul (local.get $x) (f64.const 1.0)))
 )
 
-;; XXX BINARYEN: disable this test, as we have testing for the more strict property
-;; of not changing the bits at all in our interpreter
-;; (assert_return (invoke "f32.no_fold_mul_one" (f32.const nan:0x200000)) (f32.const nan:0x600000))
-;; (assert_return (invoke "f64.no_fold_mul_one" (f64.const nan:0x4000000000000)) (f64.const nan:0xc000000000000))
+(assert_return (invoke "f32.no_fold_mul_one" (f32.const nan:0x200000)) (f32.const nan:0x600000))
+(assert_return (invoke "f64.no_fold_mul_one" (f64.const nan:0x4000000000000)) (f64.const nan:0xc000000000000))
 
 ;; Test that 0.0/x is not folded to 0.0.
 
@@ -135,10 +133,8 @@
     (f64.div (local.get $x) (f64.const 1.0)))
 )
 
-;; XXX BINARYEN: disable this test, as we have testing for the more strict property
-;; of not changing the bits at all in our interpreter
-;; (assert_return (invoke "f32.no_fold_div_one" (f32.const nan:0x200000)) (f32.const nan:arithmetic))
-;; (assert_return (invoke "f64.no_fold_div_one" (f64.const nan:0x4000000000000)) (f64.const nan:arithmetic))
+(assert_return (invoke "f32.no_fold_div_one" (f32.const nan:0x200000)) (f32.const nan:0x600000))
+(assert_return (invoke "f64.no_fold_div_one" (f64.const nan:0x4000000000000)) (f64.const nan:0xc000000000000))
 
 ;; Test that x/-1.0 is not folded to -x.
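The re-enabled asserts follow the same quieting arithmetic: the f32 quiet bit is 0x400000, so nan:0x200000 comes back as nan:0x600000, and the f64 quiet bit is 0x8000000000000, so nan:0x4000000000000 comes back as nan:0xc000000000000. These lines had been commented out because Binaryen's interpreter used to guarantee bit-identical NaNs through *1.0 and /1.0; with that special-casing removed, the asserts now pin down the quieted payloads the interpreter produces (for division by one, the upstream spec test only required an arithmetic NaN).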