9 files changed, 864 insertions, 6 deletions
diff --git a/scripts/fuzz_opt.py b/scripts/fuzz_opt.py
index bb6e5723d..49f98fc1a 100755
--- a/scripts/fuzz_opt.py
+++ b/scripts/fuzz_opt.py
@@ -1294,6 +1294,8 @@ opt_choices = [
     ["--memory-packing"],
     ["--merge-blocks"],
     ['--merge-locals'],
+    ['--monomorphize'],
+    ['--monomorphize-always'],
     ['--once-reduction'],
     ["--optimize-instructions"],
     ["--optimize-stack-ir"],
diff --git a/src/ir/type-updating.h b/src/ir/type-updating.h
index f6df449ae..50ba9efca 100644
--- a/src/ir/type-updating.h
+++ b/src/ir/type-updating.h
@@ -409,12 +409,12 @@ Expression* fixLocalGet(LocalGet* get, Module& wasm);
 
 // Applies new types of parameters to a function. This does all the necessary
 // changes aside from altering the function type, which the caller is expected
-// to do (the caller might simply change the type, but in other cases the caller
-// might be rewriting the types and need to preserve their identity in terms of
-// nominal typing, so we don't change the type here). The specific things this
-// function does are to update the types of local.get/tee operations,
-// refinalize, etc., basically all operations necessary to ensure validation
-// with the new types.
+// to do after we run (the caller might simply change the type, but in other
+// cases the caller might be rewriting the types and need to preserve their
+// identity in terms of nominal typing, so we don't change the type here). The
+// specific things this function does are to update the types of local.get/tee
+// operations, refinalize, etc., basically all operations necessary to ensure
+// validation with the new types.
 //
 // While doing so, we can either update or not update the types of local.get and
 // local.tee operations. (We do not update them here if we'll be doing an update
diff --git a/src/passes/CMakeLists.txt b/src/passes/CMakeLists.txt
index cb180179c..621a8e68a 100644
--- a/src/passes/CMakeLists.txt
+++ b/src/passes/CMakeLists.txt
@@ -62,6 +62,7 @@ set(passes_SOURCES
   MergeLocals.cpp
   Metrics.cpp
   MinifyImportsAndExports.cpp
+  Monomorphize.cpp
   MultiMemoryLowering.cpp
   NameList.cpp
   NameTypes.cpp
diff --git a/src/passes/Monomorphize.cpp b/src/passes/Monomorphize.cpp
new file mode 100644
index 000000000..80e908a83
--- /dev/null
+++ b/src/passes/Monomorphize.cpp
@@ -0,0 +1,245 @@
+/*
+ * Copyright 2022 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//
+// When we see a call foo(arg1, arg2) and at least one of the arguments has a
+// more refined type than is declared in the function being called, create a
+// copy of the function with the refined type. That copy can then potentially be
+// optimized in useful ways later.
+//
+// Inlining also monomorphizes in effect. What this pass does is handle the
+// cases where inlining cannot be done.
+//
+// To see when monomorphizing makes sense, this optimizes the target function
+// both with and without the more refined types. If the refined types help then
+// the version with might remove a cast, for example. Note that while doing so
+// we keep the optimization results of the version without - there is no reason
+// to forget them since we've gone to the trouble anyhow. So this pass may have
+// the side effect of performing minor optimizations on functions. There is also
+// a variant of the pass that always monomorphizes, even when it does not seem
+// helpful, which is useful for testing, and possibly in cases where we need
+// more than just local optimizations to see the benefit - for example, perhaps
+// GUFA ends up more powerful later on.
+//
+// TODO: When we optimize we could run multiple cycles: A calls B calls C might
+//       end up with the refined+optimized B now having refined types in its
+//       call to C, which it did not have before. This is in fact the expected
+//       pattern of incremental monomorphization. Doing it in the pass could be
+//       more efficient as later cycles can focus only on what was just
+//       optimized and changed. Also, operating on functions just modified would
+//       help the case of A calls B and we end up optimizing A after we consider
+//       A->B, and the optimized version sends more refined types to B, which
+//       could unlock more potential.
+// TODO: We could sort the functions so that we start from root functions and
+//       end on leaves. That would make it more likely for a single iteration to
+//       do more work, as if A->B->C then we'd do A->B and optimize B and only
+//       then look at B->C.
+// TODO: Also run the result-refining part of SignatureRefining, as if we
+//       refine the result then callers of the function may benefit, even if
+//       there is no benefit in the function itself.
+// TODO: If this is too slow, we could "group" things, for example we could
+//       compute the LUB of a bunch of calls to a target and then investigate
+//       that one case and use it in all those callers.
+// TODO: Not just direct calls? But updating vtables is complex.
+// TODO: Not just types? We could monomorphize using Literal values. E.g. for
+//       function references, if we monomorphized we'd end up specializing qsort
+//       for the particular functions it is given.
+//
+
+#include "ir/cost.h"
+#include "ir/find_all.h"
+#include "ir/module-utils.h"
+#include "ir/names.h"
+#include "ir/type-updating.h"
+#include "ir/utils.h"
+#include "pass.h"
+#include "wasm-type.h"
+#include "wasm.h"
+
+namespace wasm {
+
+namespace {
+
+struct Monomorphize : public Pass {
+  // If set, we run some opts to see if monomorphization helps, and skip it if
+  // not.
+  bool onlyWhenHelpful;
+
+  Monomorphize(bool onlyWhenHelpful) : onlyWhenHelpful(onlyWhenHelpful) {}
+
+  void run(Module* module) override {
+    if (!module->features.hasGC()) {
+      return;
+    }
+
+    // TODO: parallelize, see comments below
+
+    // Note the list of all functions. We'll be adding more, and do not want to
+    // operate on those.
+    std::vector<Name> funcNames;
+    ModuleUtils::iterDefinedFunctions(
+      *module, [&](Function* func) { funcNames.push_back(func->name); });
+
+    // Find the calls in each function and optimize where we can, changing them
+    // to call more refined targets.
+    for (auto name : funcNames) {
+      auto* func = module->getFunction(name);
+      for (auto* call : FindAll<Call>(func->body).list) {
+        if (call->type == Type::unreachable) {
+          // Ignore unreachable code.
+          // TODO: return_call?
+          continue;
+        }
+
+        if (call->target == name) {
+          // Avoid recursion, which adds some complexity (as we'd be modifying
+          // ourselves if we apply optimizations).
+          continue;
+        }
+
+        call->target = getRefinedTarget(call, module);
+      }
+    }
+  }
+
+  // Given a call, return the name of the function it should call: either the
+  // original target, or a copy whose params exactly match the operand types.
+  Name getRefinedTarget(Call* call, Module* module) {
+    auto target = call->target;
+    auto* func = module->getFunction(target);
+    if (func->imported()) {
+      // Nothing to do since this calls outside of the module.
+      return target;
+    }
+    auto params = func->getParams();
+    bool hasRefinedParam = false;
+    for (Index i = 0; i < call->operands.size(); i++) {
+      if (call->operands[i]->type != params[i]) {
+        hasRefinedParam = true;
+        break;
+      }
+    }
+    if (!hasRefinedParam) {
+      // Nothing to do since every operand type already equals its param type.
+      return target;
+    }
+
+    std::vector<Type> refinedTypes;
+    for (auto* operand : call->operands) {
+      refinedTypes.push_back(operand->type);
+    }
+    auto refinedParams = Type(refinedTypes);
+    auto iter = funcParamMap.find({target, refinedParams});
+    if (iter != funcParamMap.end()) {
+      return iter->second;
+    }
+
+    // This is the first time we see this situation. Let's see if it is worth
+    // monomorphizing.
+
+    // Create a new function with refined parameters as a copy of the original.
+    // (Note we must clear stack IR on the original: atm we do not have the
+    // ability to copy stack IR, so we'd hit an internal error. But as we will
+    // be optimizing the function anyhow, we'd be throwing away stack IR later
+    // so this isn't a problem.)
+    func->stackIR.reset();
+    auto refinedTarget = Names::getValidFunctionName(*module, target);
+    auto* refinedFunc = ModuleUtils::copyFunction(func, *module, refinedTarget);
+    TypeUpdating::updateParamTypes(refinedFunc, refinedTypes, *module);
+    refinedFunc->type = HeapType(Signature(refinedParams, func->getResults()));
+
+    // Assume we'll choose to use the refined target, but if we are being
+    // careful then we might change our mind.
+    auto chosenTarget = refinedTarget;
+    if (onlyWhenHelpful) {
+      // Optimize both functions using minimal opts, hopefully enough to see if
+      // there is a benefit to the refined types (such as the new types allowing
+      // a cast to be removed).
+      // TODO: Atm this can be done many times per function as it is once per
+      //       function and per set of types sent to it. Perhaps have some
+      //       total limit to avoid slow runtimes.
+      // TODO: We can end up optimizing |func| more than once. It may be
+      //       different each time if the previous optimization helped, but
+      //       often it will be identical. We could save the original version
+      //       and use that as the starting point here (and cache the optimized
+      //       version), but then we'd be throwing away optimization results. Or
+      //       we could see if later optimizations do not further decrease the
+      //       cost, and if so, use a cached value for the cost on such
+      //       "already maximally optimized" functions. The former approach is
+      //       more amenable to parallelization, as it avoids path dependence -
+      //       the other approaches are deterministic but they depend on the
+      //       order in which we see things. But it does require saving a copy
+      //       of the function, which uses memory, which is avoided if we just
+      //       keep optimizing from the current contents as we go. It's not
+      //       obvious which approach is best here.
+      doMinimalOpts(func);
+      doMinimalOpts(refinedFunc);
+
+      auto costBefore = CostAnalyzer(func->body).cost;
+      auto costAfter = CostAnalyzer(refinedFunc->body).cost;
+      if (costAfter >= costBefore) {
+        // We failed to improve. Remove the new function and return the old
+        // target.
+        module->removeFunction(refinedTarget);
+        chosenTarget = target;
+      }
+    }
+
+    // Mark the chosen target in the map, so we don't do this work again: every
+    // pair of target and refinedParams is only considered once.
+    funcParamMap[{target, refinedParams}] = chosenTarget;
+
+    return chosenTarget;
+  }
+
+  // Run minimal function-level optimizations on a function. This optimizes at
+  // -O1 which is very fast and runs in linear time basically, and so it should
+  // be reasonable to run as part of this pass: -O1 is several times faster than
+  // a full -O2, in particular, and so if we run this as part of -O2 we should
+  // not be making it much slower overall.
+  // TODO: Perhaps we don't need all of -O1, and can focus on specific things we
+  //       expect to help. That would be faster, but we'd always run the risk of
+  //       missing things, especially as new passes are added later and we don't
+  //       think to add them here.
+  void doMinimalOpts(Function* func) {
+    PassRunner runner(getPassRunner());
+    runner.options.optimizeLevel = 1;
+    // Local subtyping is not run in -O1, but we really do want it here since
+    // the entire point is that parameters now have more refined types, which
+    // can lead to locals reading them being refinable as well.
+    runner.add("local-subtyping");
+    runner.addDefaultFunctionOptimizationPasses();
+    runner.setIsNested(true);
+    runner.runOnFunction(func);
+  }
+
+  // Maps [func name, param types] to the name of a new function whose params
+  // have those types.
+  //
+  // Note that this can contain funcParamMap[{A, types}] = A, that is, an
+  // entry that maps a function name to itself. That indicates we found no
+  // benefit from refining with those particular types, and saves us from
+  // computing it again later on.
+  std::unordered_map<std::pair<Name, Type>, Name> funcParamMap;
+};
+
+} // anonymous namespace
+
+Pass* createMonomorphizePass() { return new Monomorphize(true); }
+
+Pass* createMonomorphizeAlwaysPass() { return new Monomorphize(false); }
+
+} // namespace wasm
diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp
index 7190201a1..b3bf80f8f 100644
--- a/src/passes/pass.cpp
+++ b/src/passes/pass.cpp
@@ -272,6 +272,12 @@ void PassRegistry::registerPasses() {
   registerPass("mod-asyncify-never-unwind",
                "apply the assumption that asyncify never unwinds",
                createModAsyncifyNeverUnwindPass);
+  registerPass("monomorphize",
+               "creates specialized versions of functions",
+               createMonomorphizePass);
+  registerPass("monomorphize-always",
+               "creates specialized versions of functions (even if unhelpful)",
+               createMonomorphizeAlwaysPass);
   registerPass("multi-memory-lowering",
                "combines multiple memories into a single memory",
                createMultiMemoryLoweringPass);
diff --git a/src/passes/passes.h b/src/passes/passes.h
index 12e8b77c7..ba60ade6f 100644
--- a/src/passes/passes.h
+++ b/src/passes/passes.h
@@ -84,6 +84,8 @@ Pass* createMinifyImportsPass();
 Pass* createMinifyImportsAndExportsPass();
 Pass* createMinifyImportsAndExportsAndModulesPass();
 Pass* createMetricsPass();
+Pass* createMonomorphizePass();
+Pass* createMonomorphizeAlwaysPass();
 Pass* createMultiMemoryLoweringPass();
 Pass* createNameListPass();
 Pass* createNameTypesPass();
diff --git a/test/lit/help/wasm-opt.test b/test/lit/help/wasm-opt.test
index 281411cca..4fd50abcc 100644
--- a/test/lit/help/wasm-opt.test
+++ b/test/lit/help/wasm-opt.test
@@ -273,6 +273,12 @@
 ;; CHECK-NEXT:   --mod-asyncify-never-unwind                   apply the assumption that
 ;; CHECK-NEXT:                                                 asyncify never unwinds
 ;; CHECK-NEXT:
+;; CHECK-NEXT:   --monomorphize                                creates specialized versions of
+;; CHECK-NEXT:                                                 functions
+;; CHECK-NEXT:
+;; CHECK-NEXT:   --monomorphize-always                         creates specialized versions of
+;; CHECK-NEXT:                                                 functions (even if unhelpful)
+;; CHECK-NEXT:
 ;; CHECK-NEXT:   --multi-memory-lowering                       combines multiple memories into
 ;; CHECK-NEXT:                                                 a single memory
 ;; CHECK-NEXT:
diff --git a/test/lit/help/wasm2js.test b/test/lit/help/wasm2js.test
index f190c8fed..7a782c872 100644
--- a/test/lit/help/wasm2js.test
+++ b/test/lit/help/wasm2js.test
@@ -232,6 +232,12 @@
 ;; CHECK-NEXT:   --mod-asyncify-never-unwind                   apply the assumption that
 ;; CHECK-NEXT:                                                 asyncify never unwinds
 ;; CHECK-NEXT:
+;; CHECK-NEXT:   --monomorphize                                creates specialized versions of
+;; CHECK-NEXT:                                                 functions
+;; CHECK-NEXT:
+;; CHECK-NEXT:   --monomorphize-always                         creates specialized versions of
+;; CHECK-NEXT:                                                 functions (even if unhelpful)
+;; CHECK-NEXT:
 ;; CHECK-NEXT:   --multi-memory-lowering                       combines multiple memories into
 ;; CHECK-NEXT:                                                 a single memory
 ;; CHECK-NEXT:
diff --git a/test/lit/passes/monomorphize.wast b/test/lit/passes/monomorphize.wast
new file mode 100644
index 000000000..1cd219d08
--- /dev/null
+++ b/test/lit/passes/monomorphize.wast
@@ -0,0 +1,590 @@
+;; NOTE: Assertions have been generated by update_lit_checks.py --all-items and should not be edited.
+
+;; Test in both "always" mode, which always monomorphizes, and in "careful"
+;; mode which does it only when it appears to actually help.
+
+;; RUN: foreach %s %t wasm-opt --nominal --monomorphize-always -all -S -o - | filecheck %s --check-prefix ALWAYS
+;; RUN: foreach %s %t wasm-opt --nominal --monomorphize        -all -S -o - | filecheck %s --check-prefix CAREFUL
+
+(module
+  ;; ALWAYS:      (type $A (struct_subtype  data))
+  ;; CAREFUL:      (type $A (struct_subtype  data))
+  (type $A (struct_subtype data))
+  ;; ALWAYS:      (type $B (struct_subtype  $A))
+  ;; CAREFUL:      (type $B (struct_subtype  $A))
+  (type $B (struct_subtype $A))
+
+  ;; ALWAYS:      (type $ref|$A|_=>_none (func_subtype (param (ref $A)) func))
+
+  ;; ALWAYS:      (type $none_=>_none (func_subtype func))
+
+  ;; ALWAYS:      (type $ref|$B|_=>_none (func_subtype (param (ref $B)) func))
+
+  ;; ALWAYS:      (import "a" "b" (func $import (param (ref $A))))
+  ;; CAREFUL:      (type $ref|$A|_=>_none (func_subtype (param (ref $A)) func))
+
+  ;; CAREFUL:      (type $none_=>_none (func_subtype func))
+
+  ;; CAREFUL:      (import "a" "b" (func $import (param (ref $A))))
+  (import "a" "b" (func $import (param (ref $A))))
+
+  ;; ALWAYS:      (func $calls (type $none_=>_none)
+  ;; ALWAYS-NEXT:  (call $refinable
+  ;; ALWAYS-NEXT:   (struct.new_default $A)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT:  (call $refinable
+  ;; ALWAYS-NEXT:   (struct.new_default $A)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT:  (call $refinable_0
+  ;; ALWAYS-NEXT:   (struct.new_default $B)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT:  (call $refinable_0
+  ;; ALWAYS-NEXT:   (struct.new_default $B)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT: )
+  ;; CAREFUL:      (func $calls (type $none_=>_none)
+  ;; CAREFUL-NEXT:  (call $refinable
+  ;; CAREFUL-NEXT:   (struct.new_default $A)
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT:  (call $refinable
+  ;; CAREFUL-NEXT:   (struct.new_default $A)
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT:  (call $refinable
+  ;; CAREFUL-NEXT:   (struct.new_default $B)
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT:  (call $refinable
+  ;; CAREFUL-NEXT:   (struct.new_default $B)
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT: )
+  (func $calls
+    ;; Two calls with $A, two with $B. The calls with $B should both go to the
+    ;; same new function which has a refined parameter of $B.
+    ;;
+    ;; However, in CAREFUL mode we won't do that, as there is no helpful
+    ;; improvement in the target functions even with the refined types.
+    (call $refinable
+      (struct.new $A)
+    )
+    (call $refinable
+      (struct.new $A)
+    )
+    (call $refinable
+      (struct.new $B)
+    )
+    (call $refinable
+      (struct.new $B)
+    )
+  )
+
+  ;; ALWAYS:      (func $call-import (type $none_=>_none)
+  ;; ALWAYS-NEXT:  (call $import
+  ;; ALWAYS-NEXT:   (struct.new_default $B)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT: )
+  ;; CAREFUL:      (func $call-import (type $none_=>_none)
+  ;; CAREFUL-NEXT:  (call $import
+  ;; CAREFUL-NEXT:   (struct.new_default $B)
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT: )
+  (func $call-import
+    ;; Calls to imports are left as they are.
+    (call $import
+      (struct.new $B)
+    )
+  )
+
+  ;; ALWAYS:      (func $refinable (type $ref|$A|_=>_none) (param $ref (ref $A))
+  ;; ALWAYS-NEXT:  (drop
+  ;; ALWAYS-NEXT:   (local.get $ref)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT: )
+  ;; CAREFUL:      (func $refinable (type $ref|$A|_=>_none) (param $0 (ref $A))
+  ;; CAREFUL-NEXT:  (nop)
+  ;; CAREFUL-NEXT: )
+  (func $refinable (param $ref (ref $A))
+    ;; Helper function for the above. Use the parameter to see we update types
+    ;; etc when we make a refined version of the function (if we didn't,
+    ;; validation would error).
+    ;;
+    ;; In CAREFUL mode we optimize to check if refined types help, which has the
+    ;; side effect of optimizing the body of this function into a nop.
+    (drop
+      (local.get $ref)
+    )
+  )
+)
+
+
+;; ALWAYS:      (func $refinable_0 (type $ref|$B|_=>_none) (param $ref (ref $B))
+;; ALWAYS-NEXT:  (drop
+;; ALWAYS-NEXT:   (local.get $ref)
+;; ALWAYS-NEXT:  )
+;; ALWAYS-NEXT: )
+(module
+  ;; As above, but now the refinable function uses the local in a way that
+  ;; requires a fixup.
+
+  ;; ALWAYS:      (type $A (struct_subtype  data))
+  ;; CAREFUL:      (type $none_=>_none (func_subtype func))
+
+  ;; CAREFUL:      (type $A (struct_subtype  data))
+  (type $A (struct_subtype data))
+  ;; ALWAYS:      (type $B (struct_subtype  $A))
+  ;; CAREFUL:      (type $B (struct_subtype  $A))
+  (type $B (struct_subtype $A))
+
+
+
+  ;; ALWAYS:      (type $none_=>_none (func_subtype func))
+
+  ;; ALWAYS:      (type $ref|$A|_=>_none (func_subtype (param (ref $A)) func))
+
+  ;; ALWAYS:      (type $ref|$B|_=>_none (func_subtype (param (ref $B)) func))
+
+  ;; ALWAYS:      (func $calls (type $none_=>_none)
+  ;; ALWAYS-NEXT:  (call $refinable_0
+  ;; ALWAYS-NEXT:   (struct.new_default $B)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT: )
+  ;; CAREFUL:      (type $ref|$A|_=>_none (func_subtype (param (ref $A)) func))
+
+  ;; CAREFUL:      (func $calls (type $none_=>_none)
+  ;; CAREFUL-NEXT:  (call $refinable
+  ;; CAREFUL-NEXT:   (struct.new_default $B)
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT: )
+  (func $calls
+    (call $refinable
+      (struct.new $B)
+    )
+  )
+
+  ;; ALWAYS:      (func $refinable (type $ref|$A|_=>_none) (param $ref (ref $A))
+  ;; ALWAYS-NEXT:  (local $unref (ref $A))
+  ;; ALWAYS-NEXT:  (local.set $unref
+  ;; ALWAYS-NEXT:   (local.get $ref)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT:  (local.set $ref
+  ;; ALWAYS-NEXT:   (local.get $unref)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT: )
+  ;; CAREFUL:      (func $refinable (type $ref|$A|_=>_none) (param $0 (ref $A))
+  ;; CAREFUL-NEXT:  (nop)
+  ;; CAREFUL-NEXT: )
+  (func $refinable (param $ref (ref $A))
+    (local $unref (ref $A))
+    (local.set $unref
+      (local.get $ref)
+    )
+    ;; If we refine $ref then this set will be invalid - we'd be setting a less-
+    ;; refined type into a local/param that is more refined. We should fix this
+    ;; up by using a temp local.
+    (local.set $ref
+      (local.get $unref)
+    )
+  )
+)
+
+
+;; ALWAYS:      (func $refinable_0 (type $ref|$B|_=>_none) (param $ref (ref $B))
+;; ALWAYS-NEXT:  (local $unref (ref $A))
+;; ALWAYS-NEXT:  (local $2 (ref $A))
+;; ALWAYS-NEXT:  (local.set $2
+;; ALWAYS-NEXT:   (local.get $ref)
+;; ALWAYS-NEXT:  )
+;; ALWAYS-NEXT:  (block
+;; ALWAYS-NEXT:   (local.set $unref
+;; ALWAYS-NEXT:    (local.get $2)
+;; ALWAYS-NEXT:   )
+;; ALWAYS-NEXT:   (local.set $2
+;; ALWAYS-NEXT:    (local.get $unref)
+;; ALWAYS-NEXT:   )
+;; ALWAYS-NEXT:  )
+;; ALWAYS-NEXT: )
+(module
+  ;; Multiple refinings of the same function, and of different functions.
+
+  ;; ALWAYS:      (type $A (struct_subtype  data))
+  ;; CAREFUL:      (type $none_=>_none (func_subtype func))
+
+  ;; CAREFUL:      (type $A (struct_subtype  data))
+  (type $A (struct_subtype data))
+  ;; ALWAYS:      (type $B (struct_subtype  $A))
+  ;; CAREFUL:      (type $B (struct_subtype  $A))
+  (type $B (struct_subtype $A))
+
+  ;; ALWAYS:      (type $none_=>_none (func_subtype func))
+
+  ;; ALWAYS:      (type $C (struct_subtype  $B))
+  ;; CAREFUL:      (type $ref|$A|_=>_none (func_subtype (param (ref $A)) func))
+
+  ;; CAREFUL:      (type $C (struct_subtype  $B))
+  (type $C (struct_subtype $B))
+
+  ;; ALWAYS:      (type $ref|$A|_=>_none (func_subtype (param (ref $A)) func))
+
+  ;; ALWAYS:      (type $ref|$B|_=>_none (func_subtype (param (ref $B)) func))
+
+  ;; ALWAYS:      (type $ref|$C|_=>_none (func_subtype (param (ref $C)) func))
+
+  ;; ALWAYS:      (func $calls1 (type $none_=>_none)
+  ;; ALWAYS-NEXT:  (call $refinable1
+  ;; ALWAYS-NEXT:   (struct.new_default $A)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT:  (call $refinable1_0
+  ;; ALWAYS-NEXT:   (struct.new_default $B)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT: )
+  ;; CAREFUL:      (func $calls1 (type $none_=>_none)
+  ;; CAREFUL-NEXT:  (call $refinable1
+  ;; CAREFUL-NEXT:   (struct.new_default $A)
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT:  (call $refinable1
+  ;; CAREFUL-NEXT:   (struct.new_default $B)
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT: )
+  (func $calls1
+    (call $refinable1
+      (struct.new $A)
+    )
+    (call $refinable1
+      (struct.new $B)
+    )
+  )
+
+  ;; ALWAYS:      (func $calls2 (type $none_=>_none)
+  ;; ALWAYS-NEXT:  (call $refinable1_1
+  ;; ALWAYS-NEXT:   (struct.new_default $C)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT:  (call $refinable2_0
+  ;; ALWAYS-NEXT:   (struct.new_default $B)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT: )
+  ;; CAREFUL:      (func $calls2 (type $none_=>_none)
+  ;; CAREFUL-NEXT:  (call $refinable1
+  ;; CAREFUL-NEXT:   (struct.new_default $C)
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT:  (call $refinable2
+  ;; CAREFUL-NEXT:   (struct.new_default $B)
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT: )
+  (func $calls2
+    (call $refinable1
+      (struct.new $C)
+    )
+    (call $refinable2
+      (struct.new $B)
+    )
+  )
+
+  ;; ALWAYS:      (func $refinable1 (type $ref|$A|_=>_none) (param $ref (ref $A))
+  ;; ALWAYS-NEXT:  (drop
+  ;; ALWAYS-NEXT:   (local.get $ref)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT: )
+  ;; CAREFUL:      (func $refinable1 (type $ref|$A|_=>_none) (param $0 (ref $A))
+  ;; CAREFUL-NEXT:  (nop)
+  ;; CAREFUL-NEXT: )
+  (func $refinable1 (param $ref (ref $A))
+    (drop
+      (local.get $ref)
+    )
+  )
+
+  ;; ALWAYS:      (func $refinable2 (type $ref|$A|_=>_none) (param $ref (ref $A))
+  ;; ALWAYS-NEXT:  (drop
+  ;; ALWAYS-NEXT:   (local.get $ref)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT: )
+  ;; CAREFUL:      (func $refinable2 (type $ref|$A|_=>_none) (param $0 (ref $A))
+  ;; CAREFUL-NEXT:  (nop)
+  ;; CAREFUL-NEXT: )
+  (func $refinable2 (param $ref (ref $A))
+    (drop
+      (local.get $ref)
+    )
+  )
+)
+
+;; ALWAYS:      (func $refinable1_0 (type $ref|$B|_=>_none) (param $ref (ref $B))
+;; ALWAYS-NEXT:  (drop
+;; ALWAYS-NEXT:   (local.get $ref)
+;; ALWAYS-NEXT:  )
+;; ALWAYS-NEXT: )
+
+;; ALWAYS:      (func $refinable1_1 (type $ref|$C|_=>_none) (param $ref (ref $C))
+;; ALWAYS-NEXT:  (drop
+;; ALWAYS-NEXT:   (local.get $ref)
+;; ALWAYS-NEXT:  )
+;; ALWAYS-NEXT: )
+
+;; ALWAYS:      (func $refinable2_0 (type $ref|$B|_=>_none) (param $ref (ref $B))
+;; ALWAYS-NEXT:  (drop
+;; ALWAYS-NEXT:   (local.get $ref)
+;; ALWAYS-NEXT:  )
+;; ALWAYS-NEXT: )
+(module
+  ;; A case where even CAREFUL mode will monomorphize, as it helps the target
+  ;; function get optimized better.
+
+  ;; ALWAYS:      (type $A (struct_subtype  data))
+  ;; CAREFUL:      (type $A (struct_subtype  data))
+  (type $A (struct_subtype data))
+
+  ;; ALWAYS:      (type $B (struct_subtype  $A))
+  ;; CAREFUL:      (type $B (struct_subtype  $A))
+  (type $B (struct_subtype $A))
+
+  ;; ALWAYS:      (type $ref|$B|_=>_none (func_subtype (param (ref $B)) func))
+
+  ;; ALWAYS:      (type $none_=>_none (func_subtype func))
+
+  ;; ALWAYS:      (type $ref|$A|_=>_none (func_subtype (param (ref $A)) func))
+
+  ;; ALWAYS:      (import "a" "b" (func $import (param (ref $B))))
+
+  ;; ALWAYS:      (global $global (mut i32) (i32.const 1))
+  ;; CAREFUL:      (type $ref|$B|_=>_none (func_subtype (param (ref $B)) func))
+
+  ;; CAREFUL:      (type $none_=>_none (func_subtype func))
+
+  ;; CAREFUL:      (type $ref|$A|_=>_none (func_subtype (param (ref $A)) func))
+
+  ;; CAREFUL:      (import "a" "b" (func $import (param (ref $B))))
+
+  ;; CAREFUL:      (global $global (mut i32) (i32.const 1))
+  (global $global (mut i32) (i32.const 1))
+
+  (import "a" "b" (func $import (param (ref $B))))
+
+  ;; ALWAYS:      (func $calls (type $none_=>_none)
+  ;; ALWAYS-NEXT:  (call $refinable
+  ;; ALWAYS-NEXT:   (struct.new_default $A)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT:  (call $refinable
+  ;; ALWAYS-NEXT:   (struct.new_default $A)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT:  (call $refinable_0
+  ;; ALWAYS-NEXT:   (struct.new_default $B)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT:  (call $refinable_0
+  ;; ALWAYS-NEXT:   (struct.new_default $B)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT: )
+  ;; CAREFUL:      (func $calls (type $none_=>_none)
+  ;; CAREFUL-NEXT:  (call $refinable
+  ;; CAREFUL-NEXT:   (struct.new_default $A)
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT:  (call $refinable
+  ;; CAREFUL-NEXT:   (struct.new_default $A)
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT:  (call $refinable_0
+  ;; CAREFUL-NEXT:   (struct.new_default $B)
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT:  (call $refinable_0
+  ;; CAREFUL-NEXT:   (struct.new_default $B)
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT: )
+  (func $calls
+    ;; The calls sending $B will switch to calling a refined version, as the
+    ;; refined version is better, even in CAREFUL mode.
+    (call $refinable
+      (struct.new $A)
+    )
+    (call $refinable
+      (struct.new $A)
+    )
+    (call $refinable
+      (struct.new $B)
+    )
+    (call $refinable
+      (struct.new $B)
+    )
+  )
+
+  ;; ALWAYS:      (func $refinable (type $ref|$A|_=>_none) (param $ref (ref $A))
+  ;; ALWAYS-NEXT:  (local $x (ref $A))
+  ;; ALWAYS-NEXT:  (call $import
+  ;; ALWAYS-NEXT:   (ref.cast_static $B
+  ;; ALWAYS-NEXT:    (local.get $ref)
+  ;; ALWAYS-NEXT:   )
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT:  (local.set $x
+  ;; ALWAYS-NEXT:   (select (result (ref $A))
+  ;; ALWAYS-NEXT:    (local.get $ref)
+  ;; ALWAYS-NEXT:    (struct.new_default $B)
+  ;; ALWAYS-NEXT:    (global.get $global)
+  ;; ALWAYS-NEXT:   )
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT:  (call $import
+  ;; ALWAYS-NEXT:   (ref.cast_static $B
+  ;; ALWAYS-NEXT:    (local.get $x)
+  ;; ALWAYS-NEXT:   )
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT:  (call $import
+  ;; ALWAYS-NEXT:   (ref.cast_static $B
+  ;; ALWAYS-NEXT:    (local.get $x)
+  ;; ALWAYS-NEXT:   )
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT:  (call $import
+  ;; ALWAYS-NEXT:   (ref.cast_static $B
+  ;; ALWAYS-NEXT:    (local.get $ref)
+  ;; ALWAYS-NEXT:   )
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT: )
+  ;; CAREFUL:      (func $refinable (type $ref|$A|_=>_none) (param $0 (ref $A))
+  ;; CAREFUL-NEXT:  (local $1 (ref $A))
+  ;; CAREFUL-NEXT:  (call $import
+  ;; CAREFUL-NEXT:   (ref.cast_static $B
+  ;; CAREFUL-NEXT:    (local.get $0)
+  ;; CAREFUL-NEXT:   )
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT:  (call $import
+  ;; CAREFUL-NEXT:   (ref.cast_static $B
+  ;; CAREFUL-NEXT:    (local.tee $1
+  ;; CAREFUL-NEXT:     (select (result (ref $A))
+  ;; CAREFUL-NEXT:      (local.get $0)
+  ;; CAREFUL-NEXT:      (struct.new_default $B)
+  ;; CAREFUL-NEXT:      (global.get $global)
+  ;; CAREFUL-NEXT:     )
+  ;; CAREFUL-NEXT:    )
+  ;; CAREFUL-NEXT:   )
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT:  (call $import
+  ;; CAREFUL-NEXT:   (ref.cast_static $B
+  ;; CAREFUL-NEXT:    (local.get $1)
+  ;; CAREFUL-NEXT:   )
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT:  (call $import
+  ;; CAREFUL-NEXT:   (ref.cast_static $B
+  ;; CAREFUL-NEXT:    (local.get $0)
+  ;; CAREFUL-NEXT:   )
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT: )
+  (func $refinable (param $ref (ref $A))
+    (local $x (ref $A))
+    ;; In the refined copy of this function the param is (ref $B), so this
+    ;; cast to $B is redundant and optimizations can remove it.
+    ;;
+    ;; (That is the case in CAREFUL mode, which optimizes the copy; in ALWAYS
+    ;; mode the cast remains, since there we monomorphize without bothering to
+    ;; optimize and see if there is any benefit.)
+    (call $import
+      (ref.cast_static $B
+        (local.get $ref)
+      )
+    )
+    ;; Also copy the param into a local. In CAREFUL mode that local should be
+    ;; refined to (ref $B) in the refined function; in ALWAYS mode it stays $A.
+    (local.set $x
+      ;; Use a select (not a plain copy) so opts don't just merge $x and $ref.
+      (select (result (ref $A))
+        (local.get $ref)
+        (struct.new $B)
+        (global.get $global)
+      )
+    )
+    (call $import
+      (ref.cast_static $B
+        (local.get $x)
+      )
+    )
+    (call $import
+      (ref.cast_static $B
+        (local.get $x)
+      )
+    )
+    ;; Use $ref again, also to keep optimizations from merging $x and $ref.
+    (call $import
+      (ref.cast_static $B
+        (local.get $ref)
+      )
+    )
+  )
+)
+
+;; ALWAYS:      (func $refinable_0 (type $ref|$B|_=>_none) (param $ref (ref $B))
+;; ALWAYS-NEXT:  (local $x (ref $A))
+;; ALWAYS-NEXT:  (call $import
+;; ALWAYS-NEXT:   (ref.cast_static $B
+;; ALWAYS-NEXT:    (local.get $ref)
+;; ALWAYS-NEXT:   )
+;; ALWAYS-NEXT:  )
+;; ALWAYS-NEXT:  (local.set $x
+;; ALWAYS-NEXT:   (select (result (ref $B))
+;; ALWAYS-NEXT:    (local.get $ref)
+;; ALWAYS-NEXT:    (struct.new_default $B)
+;; ALWAYS-NEXT:    (global.get $global)
+;; ALWAYS-NEXT:   )
+;; ALWAYS-NEXT:  )
+;; ALWAYS-NEXT:  (call $import
+;; ALWAYS-NEXT:   (ref.cast_static $B
+;; ALWAYS-NEXT:    (local.get $x)
+;; ALWAYS-NEXT:   )
+;; ALWAYS-NEXT:  )
+;; ALWAYS-NEXT:  (call $import
+;; ALWAYS-NEXT:   (ref.cast_static $B
+;; ALWAYS-NEXT:    (local.get $x)
+;; ALWAYS-NEXT:   )
+;; ALWAYS-NEXT:  )
+;; ALWAYS-NEXT:  (call $import
+;; ALWAYS-NEXT:   (ref.cast_static $B
+;; ALWAYS-NEXT:    (local.get $ref)
+;; ALWAYS-NEXT:   )
+;; ALWAYS-NEXT:  )
+;; ALWAYS-NEXT: )
+
+;; CAREFUL:      (func $refinable_0 (type $ref|$B|_=>_none) (param $0 (ref $B))
+;; CAREFUL-NEXT:  (local $1 (ref $B))
+;; CAREFUL-NEXT:  (call $import
+;; CAREFUL-NEXT:   (local.get $0)
+;; CAREFUL-NEXT:  )
+;; CAREFUL-NEXT:  (call $import
+;; CAREFUL-NEXT:   (local.tee $1
+;; CAREFUL-NEXT:    (select (result (ref $B))
+;; CAREFUL-NEXT:     (local.get $0)
+;; CAREFUL-NEXT:     (struct.new_default $B)
+;; CAREFUL-NEXT:     (global.get $global)
+;; CAREFUL-NEXT:    )
+;; CAREFUL-NEXT:   )
+;; CAREFUL-NEXT:  )
+;; CAREFUL-NEXT:  (call $import
+;; CAREFUL-NEXT:   (local.get $1)
+;; CAREFUL-NEXT:  )
+;; CAREFUL-NEXT:  (call $import
+;; CAREFUL-NEXT:   (local.get $0)
+;; CAREFUL-NEXT:  )
+;; CAREFUL-NEXT: )
+(module
+  ;; Test that a recursive call is left alone rather than monomorphized.
+
+  ;; ALWAYS:      (type $ref|$A|_=>_none (func_subtype (param (ref $A)) func))
+
+  ;; ALWAYS:      (type $A (struct_subtype  data))
+  ;; CAREFUL:      (type $ref|$A|_=>_none (func_subtype (param (ref $A)) func))
+
+  ;; CAREFUL:      (type $A (struct_subtype  data))
+  (type $A (struct_subtype data))
+  ;; ALWAYS:      (type $B (struct_subtype  $A))
+  ;; CAREFUL:      (type $B (struct_subtype  $A))
+  (type $B (struct_subtype $A))
+
+
+  ;; ALWAYS:      (func $calls (type $ref|$A|_=>_none) (param $ref (ref $A))
+  ;; ALWAYS-NEXT:  (call $calls
+  ;; ALWAYS-NEXT:   (struct.new_default $B)
+  ;; ALWAYS-NEXT:  )
+  ;; ALWAYS-NEXT: )
+  ;; CAREFUL:      (func $calls (type $ref|$A|_=>_none) (param $ref (ref $A))
+  ;; CAREFUL-NEXT:  (call $calls
+  ;; CAREFUL-NEXT:   (struct.new_default $B)
+  ;; CAREFUL-NEXT:  )
+  ;; CAREFUL-NEXT: )
+  (func $calls (param $ref (ref $A))
+    ;; This recursive call should be left unchanged, even though it sends a
+    ;; more refined type ($B rather than $A) and so could in theory be
+    ;; monomorphized.
+    (call $calls
+      (struct.new $B)
+    )
+  )
+)