Add --fast-math mode (#3155)

Similar to clang and gcc, --fast-math makes us ignore corner cases of floating-point math, such as changes to NaN bits and (not done yet) the lack of associativity. In the future we may want to have separate fast-math flags for each specific thing, as gcc and clang do. This undoes some changes (#2958 and #3096) in which we assumed it was OK for an optimization to leave NaN bits unchanged, but @binji corrected us: we can only do such things in fast-math mode. This commit puts those optimizations behind that flag, adds tests for it, and restores the interpreter to the simpler earlier code with no special cases.
author: Alon Zakai <azakai@google.com> 2020-09-30 12:39:05 -0700
committer: GitHub <noreply@github.com> 2020-09-30 12:39:05 -0700
commit: 07047103a26e1c17ee995ef3e1358ddb26d8e8c8 (patch)
tree: edaf22d19ab7c22c3cff58f9e02d8f41abcb5b3e
parent: 11de8894505d37b7b970a2103bc5b1cfd094b115 (diff)
download: binaryen-07047103a26e1c17ee995ef3e1358ddb26d8e8c8.tar.gz
binaryen-07047103a26e1c17ee995ef3e1358ddb26d8e8c8.tar.bz2
binaryen-07047103a26e1c17ee995ef3e1358ddb26d8e8c8.zip
12 files changed, 241 insertions, 79 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 590d756fd..63d887b69 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,8 @@ full changeset diff at the end of each section.
 Current Trunk
 -------------
 
+- Add `--fast-math` mode. (#3155)
+
 v97
 ---
 
diff --git a/src/pass.h b/src/pass.h
index a3ee41d61..27e7ee37f 100644
--- a/src/pass.h
+++ b/src/pass.h
@@ -102,6 +102,11 @@ struct PassOptions {
   // many cases.
   bool lowMemoryUnused = false;
   enum { LowMemoryBound = 1024 };
+  // Whether to allow "loose" math semantics, ignoring corner cases with NaNs
+  // and assuming math follows the algebraic rules for associativity and so
+  // forth (which IEEE floats do not, strictly speaking). This is inspired by
+  // gcc/clang's -ffast-math flag.
+  bool fastMath = false;
   // Whether to try to preserve debug info through, which are special calls.
   bool debugInfo = false;
   // Arbitrary string arguments from the commandline, which we forward to
diff --git a/src/passes/OptimizeInstructions.cpp b/src/passes/OptimizeInstructions.cpp
index 36d92e81f..55af9a34d 100644
--- a/src/passes/OptimizeInstructions.cpp
+++ b/src/passes/OptimizeInstructions.cpp
@@ -161,7 +161,10 @@ struct OptimizeInstructions
 #endif
   }
 
+  bool fastMath;
+
   void doWalkFunction(Function* func) {
+    fastMath = getPassOptions().fastMath;
     // first, scan locals
     {
       LocalScanner scanner(localInfo, getPassOptions());
@@ -1414,14 +1417,15 @@ private:
     }
     {
       double value;
-      if (matches(curr, binary(Abstract::Sub, any(), fval(&value))) &&
+      if (fastMath &&
+          matches(curr, binary(Abstract::Sub, any(), fval(&value))) &&
           value == 0.0) {
         // x - (-0.0)   ==>   x + 0.0
         if (std::signbit(value)) {
           curr->op = Abstract::getBinary(type, Abstract::Add);
           right->value = right->value.neg();
           return curr;
-        } else {
+        } else if (fastMath) {
           // x - 0.0   ==>   x
           return curr->left;
         }
@@ -1430,19 +1434,18 @@ private:
     {
       // x + (-0.0)   ==>   x
       double value;
-      if (matches(curr, binary(Abstract::Add, any(), fval(&value))) &&
+      if (fastMath &&
+          matches(curr, binary(Abstract::Add, any(), fval(&value))) &&
           value == 0.0 && std::signbit(value)) {
         return curr->left;
       }
     }
-    // Note that this is correct even on floats with a NaN on the left,
-    // as a NaN would skip the computation and just return the NaN,
-    // and that is precisely what we do here. but, the same with -1
-    // (change to a negation) would be incorrect for that reason.
     if (matches(curr, binary(Abstract::Mul, any(&left), constant(1))) ||
         matches(curr, binary(Abstract::DivS, any(&left), constant(1))) ||
         matches(curr, binary(Abstract::DivU, any(&left), constant(1)))) {
-      return left;
+      if (curr->type.isInteger() || fastMath) {
+        return left;
+      }
     }
     return nullptr;
   }
diff --git a/src/tools/optimization-options.h b/src/tools/optimization-options.h
index 72f478329..5b6a643e6 100644
--- a/src/tools/optimization-options.h
+++ b/src/tools/optimization-options.h
@@ -187,7 +187,13 @@ struct OptimizationOptions : public ToolOptions {
            Options::Arguments::Zero,
            [this](Options*, const std::string&) {
              passOptions.lowMemoryUnused = true;
-           });
+           })
+      .add(
+        "--fast-math",
+        "-ffm",
+        "Optimize floats without handling corner cases of NaNs and rounding",
+        Options::Arguments::Zero,
+        [this](Options*, const std::string&) { passOptions.fastMath = true; });
     // add passes in registry
     for (const auto& p : PassRegistry::get()->getRegisteredNames()) {
       (*this).add(
diff --git a/src/wasm/literal.cpp b/src/wasm/literal.cpp
index d309be308..54453b356 100644
--- a/src/wasm/literal.cpp
+++ b/src/wasm/literal.cpp
@@ -934,35 +934,10 @@ Literal Literal::mul(const Literal& other) const {
       return Literal(uint32_t(i32) * uint32_t(other.i32));
     case Type::i64:
       return Literal(uint64_t(i64) * uint64_t(other.i64));
-    case Type::f32: {
-      // Special-case multiplication by 1. nan * 1 can change nan bits per the
-      // wasm spec, but it is ok to just return that original nan, and we
-      // do that here so that we are consistent with the optimization of
-      // removing the * 1 and leaving just the nan. That is, if we just
-      // do a normal multiply and the CPU decides to change the bits, we'd
-      // give a different result on optimized code, which would look like
-      // it was a bad optimization. So out of all the valid results to
-      // return here, return the simplest one that is consistent with
-      // our optimization for the case of 1.
-      float lhs = getf32(), rhs = other.getf32();
-      if (rhs == 1) {
-        return Literal(lhs);
-      }
-      if (lhs == 1) {
-        return Literal(rhs);
-      }
-      return Literal(lhs * rhs);
-    }
-    case Type::f64: {
-      double lhs = getf64(), rhs = other.getf64();
-      if (rhs == 1) {
-        return Literal(lhs);
-      }
-      if (lhs == 1) {
-        return Literal(rhs);
-      }
-      return Literal(lhs * rhs);
-    }
+    case Type::f32:
+      return Literal(getf32() * other.getf32());
+    case Type::f64:
+      return Literal(getf64() * other.getf64());
     case Type::v128:
     case Type::funcref:
     case Type::externref:
@@ -1002,10 +977,6 @@ Literal Literal::div(const Literal& other) const {
         case FP_INFINITE: // fallthrough
         case FP_NORMAL:   // fallthrough
         case FP_SUBNORMAL:
-          // Special-case division by 1, similar to multiply from earlier.
-          if (rhs == 1) {
-            return Literal(lhs);
-          }
           return Literal(lhs / rhs);
         default:
           WASM_UNREACHABLE("invalid fp classification");
@@ -1034,10 +1005,6 @@ Literal Literal::div(const Literal& other) const {
         case FP_INFINITE: // fallthrough
         case FP_NORMAL:   // fallthrough
         case FP_SUBNORMAL:
-          // See above comment on f32.
-          if (rhs == 1) {
-            return Literal(lhs);
-          }
           return Literal(lhs / rhs);
         default:
           WASM_UNREACHABLE("invalid fp classification");
diff --git a/test/passes/O_fast-math.txt b/test/passes/O_fast-math.txt
new file mode 100644
index 000000000..1b454c68e
--- /dev/null
+++ b/test/passes/O_fast-math.txt
@@ -0,0 +1,21 @@
+(module
+ (type $none_=>_f32 (func (result f32)))
+ (export "div" (func $0))
+ (export "mul1" (func $1))
+ (export "mul2" (func $2))
+ (export "add1" (func $1))
+ (export "add2" (func $2))
+ (export "add3" (func $2))
+ (export "add4" (func $2))
+ (export "sub1" (func $1))
+ (export "sub2" (func $2))
+ (func $0 (; has Stack IR ;) (result f32)
+  (f32.const -nan:0x23017a)
+ )
+ (func $1 (; has Stack IR ;) (result f32)
+  (f32.const -nan:0x34546d)
+ )
+ (func $2 (; has Stack IR ;) (result f32)
+  (f32.const -nan:0x74546d)
+ )
+)
diff --git a/test/passes/O_fast-math.wast b/test/passes/O_fast-math.wast
new file mode 100644
index 000000000..2317f782d
--- /dev/null
+++ b/test/passes/O_fast-math.wast
@@ -0,0 +1,57 @@
+;; with fast-math we can optimize some of these patterns
+(module
+ (func "div" (result f32)
+  (f32.div
+   (f32.const -nan:0x23017a)
+   (f32.const 1)
+  )
+ )
+ (func "mul1" (result f32)
+  (f32.mul
+   (f32.const -nan:0x34546d)
+   (f32.const 1)
+  )
+ )
+ (func "mul2" (result f32)
+  (f32.mul
+   (f32.const 1)
+   (f32.const -nan:0x34546d)
+  )
+ )
+ (func "add1" (result f32)
+  (f32.add
+   (f32.const -nan:0x34546d)
+   (f32.const -0)
+  )
+ )
+ (func "add2" (result f32)
+  (f32.add
+   (f32.const -0)
+   (f32.const -nan:0x34546d)
+  )
+ )
+ (func "add3" (result f32)
+  (f32.add
+   (f32.const -nan:0x34546d)
+   (f32.const 0)
+  )
+ )
+ (func "add4" (result f32)
+  (f32.add
+   (f32.const 0)
+   (f32.const -nan:0x34546d)
+  )
+ )
+ (func "sub1" (result f32)
+  (f32.sub
+   (f32.const -nan:0x34546d)
+   (f32.const 0)
+  )
+ )
+ (func "sub2" (result f32)
+  (f32.sub
+   (f32.const -nan:0x34546d)
+   (f32.const -0)
+  )
+ )
+)
diff --git a/test/passes/fuzz-exec_O.txt b/test/passes/fuzz-exec_O.txt
index ef8e165bb..f17b04650 100644
--- a/test/passes/fuzz-exec_O.txt
+++ b/test/passes/fuzz-exec_O.txt
@@ -31,29 +31,65 @@
 [fuzz-exec] comparing func_0
 [fuzz-exec] comparing func_1
 [fuzz-exec] calling div
-[fuzz-exec] note result: div => -nan:0x23017a
+[fuzz-exec] note result: div => -nan:0x63017a
 [fuzz-exec] calling mul1
-[fuzz-exec] note result: mul1 => -nan:0x34546d
+[fuzz-exec] note result: mul1 => -nan:0x74546d
 [fuzz-exec] calling mul2
-[fuzz-exec] note result: mul2 => -nan:0x34546d
+[fuzz-exec] note result: mul2 => -nan:0x74546d
+[fuzz-exec] calling add1
+[fuzz-exec] note result: add1 => -nan:0x74546d
+[fuzz-exec] calling add2
+[fuzz-exec] note result: add2 => -nan:0x74546d
+[fuzz-exec] calling add3
+[fuzz-exec] note result: add3 => -nan:0x74546d
+[fuzz-exec] calling add4
+[fuzz-exec] note result: add4 => -nan:0x74546d
+[fuzz-exec] calling sub1
+[fuzz-exec] note result: sub1 => -nan:0x74546d
+[fuzz-exec] calling sub2
+[fuzz-exec] note result: sub2 => -nan:0x74546d
 (module
  (type $none_=>_f32 (func (result f32)))
  (export "div" (func $0))
  (export "mul1" (func $1))
  (export "mul2" (func $1))
+ (export "add1" (func $1))
+ (export "add2" (func $1))
+ (export "add3" (func $1))
+ (export "add4" (func $1))
+ (export "sub1" (func $1))
+ (export "sub2" (func $1))
  (func $0 (; has Stack IR ;) (result f32)
-  (f32.const -nan:0x23017a)
+  (f32.const -nan:0x63017a)
  )
  (func $1 (; has Stack IR ;) (result f32)
-  (f32.const -nan:0x34546d)
+  (f32.const -nan:0x74546d)
  )
 )
 [fuzz-exec] calling div
-[fuzz-exec] note result: div => -nan:0x23017a
+[fuzz-exec] note result: div => -nan:0x63017a
 [fuzz-exec] calling mul1
-[fuzz-exec] note result: mul1 => -nan:0x34546d
+[fuzz-exec] note result: mul1 => -nan:0x74546d
 [fuzz-exec] calling mul2
-[fuzz-exec] note result: mul2 => -nan:0x34546d
+[fuzz-exec] note result: mul2 => -nan:0x74546d
+[fuzz-exec] calling add1
+[fuzz-exec] note result: add1 => -nan:0x74546d
+[fuzz-exec] calling add2
+[fuzz-exec] note result: add2 => -nan:0x74546d
+[fuzz-exec] calling add3
+[fuzz-exec] note result: add3 => -nan:0x74546d
+[fuzz-exec] calling add4
+[fuzz-exec] note result: add4 => -nan:0x74546d
+[fuzz-exec] calling sub1
+[fuzz-exec] note result: sub1 => -nan:0x74546d
+[fuzz-exec] calling sub2
+[fuzz-exec] note result: sub2 => -nan:0x74546d
+[fuzz-exec] comparing add1
+[fuzz-exec] comparing add2
+[fuzz-exec] comparing add3
+[fuzz-exec] comparing add4
 [fuzz-exec] comparing div
 [fuzz-exec] comparing mul1
 [fuzz-exec] comparing mul2
+[fuzz-exec] comparing sub1
+[fuzz-exec] comparing sub2
diff --git a/test/passes/fuzz-exec_O.wast b/test/passes/fuzz-exec_O.wast
index 5c739c548..b34dc2e8f 100644
--- a/test/passes/fuzz-exec_O.wast
+++ b/test/passes/fuzz-exec_O.wast
@@ -22,10 +22,10 @@
 )
 (module
  (func "div" (result f32)
-  (f32.div                   ;; div by 1 can be removed, leaving this nan
-   (f32.const -nan:0x23017a) ;; as it is. wasm semantics allow nan bits to
-   (f32.const 1)             ;; change, but the interpreter should not do so,
-  )                          ;; so that it does not fail on that opt.
+  (f32.div
+   (f32.const -nan:0x23017a)
+   (f32.const 1)
+  )
  )
  (func "mul1" (result f32)
   (f32.mul
@@ -39,5 +39,40 @@
    (f32.const -nan:0x34546d)
   )
  )
+ (func "add1" (result f32)
+  (f32.add
+   (f32.const -nan:0x34546d)
+   (f32.const -0)
+  )
+ )
+ (func "add2" (result f32)
+  (f32.add
+   (f32.const -0)
+   (f32.const -nan:0x34546d)
+  )
+ )
+ (func "add3" (result f32)
+  (f32.add
+   (f32.const -nan:0x34546d)
+   (f32.const 0)
+  )
+ )
+ (func "add4" (result f32)
+  (f32.add
+   (f32.const 0)
+   (f32.const -nan:0x34546d)
+  )
+ )
+ (func "sub1" (result f32)
+  (f32.sub
+   (f32.const -nan:0x34546d)
+   (f32.const 0)
+  )
+ )
+ (func "sub2" (result f32)
+  (f32.sub
+   (f32.const -nan:0x34546d)
+   (f32.const -0)
+  )
+ )
 )
-
diff --git a/test/passes/optimize-instructions_all-features.txt b/test/passes/optimize-instructions_all-features.txt
index 873d550d7..5babd75de 100644
--- a/test/passes/optimize-instructions_all-features.txt
+++ b/test/passes/optimize-instructions_all-features.txt
@@ -2886,10 +2886,16 @@
    (local.get $x64)
   )
   (drop
-   (local.get $y32)
+   (f32.mul
+    (local.get $y32)
+    (f32.const 1)
+   )
   )
   (drop
-   (local.get $y64)
+   (f64.mul
+    (local.get $y64)
+    (f64.const 1)
+   )
   )
   (drop
    (i32.const 0)
@@ -2922,10 +2928,16 @@
    (local.get $x64)
   )
   (drop
-   (local.get $y32)
+   (f32.div
+    (local.get $y32)
+    (f32.const 1)
+   )
   )
   (drop
-   (local.get $y64)
+   (f64.div
+    (local.get $y64)
+    (f64.const 1)
+   )
   )
   (drop
    (f32.div
@@ -3703,27 +3715,39 @@
  )
  (func $const-float-zero (param $fx f32) (param $fy f64)
   (drop
-   (local.get $fx)
+   (f32.sub
+    (local.get $fx)
+    (f32.const 0)
+   )
   )
   (drop
-   (local.get $fy)
+   (f64.sub
+    (local.get $fy)
+    (f64.const 0)
+   )
   )
   (drop
-   (local.get $fx)
+   (f32.add
+    (local.get $fx)
+    (f32.const -0)
+   )
   )
   (drop
-   (local.get $fy)
+   (f64.add
+    (local.get $fy)
+    (f64.const -0)
+   )
   )
   (drop
-   (f32.add
+   (f32.sub
     (local.get $fx)
-    (f32.const 0)
+    (f32.const -0)
    )
   )
   (drop
-   (f64.add
+   (f64.sub
     (local.get $fy)
-    (f64.const 0)
+    (f64.const -0)
    )
   )
   (drop
@@ -3750,6 +3774,12 @@
     (f64.const 0)
    )
   )
+  (drop
+   (f32.sub
+    (f32.const -nan:0x34546d)
+    (f32.const 0)
+   )
+  )
  )
  (func $rhs-is-neg-one (param $x i32) (param $y i64) (param $fx f32) (param $fy f64)
   (drop
diff --git a/test/passes/optimize-instructions_all-features.wast b/test/passes/optimize-instructions_all-features.wast
index 59e7e21d8..246fd41a2 100644
--- a/test/passes/optimize-instructions_all-features.wast
+++ b/test/passes/optimize-instructions_all-features.wast
@@ -4284,6 +4284,10 @@
       (local.get $fy) ;; skip
       (f64.const 0)
     ))
+    (drop (f32.sub
+      (f32.const -nan:0x34546d) ;; skip
+      (f32.const 0)
+    ))
   )
   (func $rhs-is-neg-one (param $x i32) (param $y i64) (param $fx f32) (param $fy f64)
     (drop (i32.sub
diff --git a/test/spec/old_float_exprs.wast b/test/spec/old_float_exprs.wast
index ca031114f..7900832b0 100644
--- a/test/spec/old_float_exprs.wast
+++ b/test/spec/old_float_exprs.wast
@@ -103,10 +103,8 @@
     (f64.mul (local.get $x) (f64.const 1.0)))
 )
 
-;; XXX BINARYEN: disable this test, as we have testing for the more strict property
-;;               of not changing the bits at all in our interpreter
-;; (assert_return (invoke "f32.no_fold_mul_one" (f32.const nan:0x200000)) (f32.const nan:0x600000))
-;; (assert_return (invoke "f64.no_fold_mul_one" (f64.const nan:0x4000000000000)) (f64.const nan:0xc000000000000))
+(assert_return (invoke "f32.no_fold_mul_one" (f32.const nan:0x200000)) (f32.const nan:0x600000))
+(assert_return (invoke "f64.no_fold_mul_one" (f64.const nan:0x4000000000000)) (f64.const nan:0xc000000000000))
 
 ;; Test that 0.0/x is not folded to 0.0.
 
@@ -135,10 +133,8 @@
     (f64.div (local.get $x) (f64.const 1.0)))
 )
 
-;; XXX BINARYEN: disable this test, as we have testing for the more strict property
-;;               of not changing the bits at all in our interpreter
-;; (assert_return (invoke "f32.no_fold_div_one" (f32.const nan:0x200000)) (f32.const nan:arithmetic))
-;; (assert_return (invoke "f64.no_fold_div_one" (f64.const nan:0x4000000000000)) (f64.const nan:arithmetic))
+(assert_return (invoke "f32.no_fold_div_one" (f32.const nan:0x200000)) (f32.const nan:0x600000))
+(assert_return (invoke "f64.no_fold_div_one" (f64.const nan:0x4000000000000)) (f64.const nan:0xc000000000000))
 
 ;; Test that x/-1.0 is not folded to -x.
author	Alon Zakai <azakai@google.com>	2020-09-30 12:39:05 -0700
committer	GitHub <noreply@github.com>	2020-09-30 12:39:05 -0700
commit	07047103a26e1c17ee995ef3e1358ddb26d8e8c8 (patch)
tree	edaf22d19ab7c22c3cff58f9e02d8f41abcb5b3e
parent	11de8894505d37b7b970a2103bc5b1cfd094b115 (diff)
download	binaryen-07047103a26e1c17ee995ef3e1358ddb26d8e8c8.tar.gz binaryen-07047103a26e1c17ee995ef3e1358ddb26d8e8c8.tar.bz2 binaryen-07047103a26e1c17ee995ef3e1358ddb26d8e8c8.zip