src/passes/Monomorphize.cpp
/*
 * Copyright 2022 WebAssembly Community Group participants
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

//
// When we see a call foo(arg1, arg2) and at least one of the arguments has a
// more refined type than is declared in the function being called, create a
// copy of the function with the refined type. That copy can then potentially be
// optimized in useful ways later.
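//
// For example (pseudocode WAT, with hypothetical types where $B is a subtype
// of $A):
//
//   (func $foo (param (ref null $A)) ..)
//
//   (call $foo (struct.new $B ..)) ;; the argument has the refined type
//                                  ;; (ref $B)
//
// Here we can copy $foo into a new function whose parameter has type (ref $B)
// and retarget the call to that copy.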
//
// In effect, inlining also monomorphizes; this pass handles the cases where
// inlining cannot be done.
//
// To see when monomorphizing makes sense, this pass optimizes the target
// function both with and without the more refined types. If the refined types
// help then the version with them might, for example, manage to remove a
// cast. Note that while doing so we keep the optimization results of the
// version without - there is no reason to discard them since we've gone to
// the trouble anyhow. So this pass may have the side effect of performing
// minor optimizations on functions. There is also a variant of the pass that
// always monomorphizes, even when it does not seem helpful, which is useful
// for testing, and possibly in cases where we need more than just local
// optimizations to see the benefit - for example, perhaps GUFA ends up more
// powerful later on.
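//
// As a sketch of the kind of benefit measured here (pseudocode), the refined
// copy might contain
//
//   (func $foo_refined (param $x (ref $B))
//     (ref.cast (ref $B) (local.get $x)) ;; now statically known to succeed
//     ..)
//
// and the optimizer can then remove that cast, making the refined copy
// measurably cheaper than the original.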
//
// TODO: When we optimize we could run multiple cycles: A calls B calls C might
//       end up with the refined+optimized B now having refined types in its
//       call to C, which it did not have before. This is in fact the expected
//       pattern of incremental monomorphization. Doing it in the pass could be
//       more efficient as later cycles can focus only on what was just
//       optimized and changed. Also, operating on functions just modified would
//       help the case of A calls B and we end up optimizing A after we consider
//       A->B, and the optimized version sends more refined types to B, which
//       could unlock more potential.
// TODO: We could sort the functions so that we start from root functions and
//       end on leaves. That would make it more likely for a single iteration to
//       do more work, as if A->B->C then we'd do A->B and optimize B and only
//       then look at B->C.
// TODO: Also run the result-refining part of SignatureRefining, as if we
//       refine the result then callers of the function may benefit, even if
//       there is no benefit in the function itself.
// TODO: If this is too slow, we could "group" things, for example we could
//       compute the LUB of a bunch of calls to a target and then investigate
//       that one case and use it in all those callers.
// TODO: Not just direct calls? But updating vtables is complex.
// TODO: Not just types? We could monomorphize using Literal values. E.g. for
//       function references, if we monomorphized we'd end up specializing qsort
//       for the particular functions it is given.
//

#include "ir/cost.h"
#include "ir/find_all.h"
#include "ir/module-utils.h"
#include "ir/names.h"
#include "ir/type-updating.h"
#include "ir/utils.h"
#include "pass.h"
#include "wasm-type.h"
#include "wasm.h"

namespace wasm {

namespace {

struct Monomorphize : public Pass {
  // If set, we run some opts to see if monomorphization helps, and skip it if
  // not.
  bool onlyWhenHelpful;

  Monomorphize(bool onlyWhenHelpful) : onlyWhenHelpful(onlyWhenHelpful) {}

  void run(Module* module) override {
    if (!module->features.hasGC()) {
      return;
    }

    // TODO: parallelize, see comments below

    // Note the list of all functions before we begin. We'll be adding new
    // ones as we go, and do not want to operate on those.
    std::vector<Name> funcNames;
    ModuleUtils::iterDefinedFunctions(
      *module, [&](Function* func) { funcNames.push_back(func->name); });

    // Find the calls in each function and optimize where we can, changing them
    // to call more refined targets.
    for (auto name : funcNames) {
      auto* func = module->getFunction(name);
      for (auto* call : FindAll<Call>(func->body).list) {
        if (call->type == Type::unreachable) {
          // Ignore unreachable code.
          // TODO: return_call?
          continue;
        }

        if (call->target == name) {
          // Avoid recursion, which adds some complexity (as we'd be modifying
          // ourselves if we apply optimizations).
          continue;
        }

        call->target = getRefinedTarget(call, module);
      }
    }
  }

  // Given a call, see if it is worth creating a copy of the called function
  // whose parameter types exactly fit the (possibly more refined) types of
  // the arguments being passed. Returns the name the call should target,
  // which is either such a refined copy or the original target.
  Name getRefinedTarget(Call* call, Module* module) {
    auto target = call->target;
    auto* func = module->getFunction(target);
    if (func->imported()) {
      // Nothing to do since the target is outside of the module.
      return target;
    }
    auto params = func->getParams();
    bool hasRefinedParam = false;
    for (Index i = 0; i < call->operands.size(); i++) {
      if (call->operands[i]->type != params[i]) {
        hasRefinedParam = true;
        break;
      }
    }
    if (!hasRefinedParam) {
      // Nothing to do: no argument has a more refined type than the function
      // already declares.
      return target;
    }

    std::vector<Type> refinedTypes;
    for (auto* operand : call->operands) {
      refinedTypes.push_back(operand->type);
    }
    auto refinedParams = Type(refinedTypes);
    auto iter = funcParamMap.find({target, refinedParams});
    if (iter != funcParamMap.end()) {
      return iter->second;
    }

    // This is the first time we see this situation. Let's see if it is worth
    // monomorphizing.

    // Create a new function with refined parameters as a copy of the original.
    auto refinedTarget = Names::getValidFunctionName(*module, target);
    auto* refinedFunc = ModuleUtils::copyFunction(func, *module, refinedTarget);
    TypeUpdating::updateParamTypes(refinedFunc, refinedTypes, *module);
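    // The copy's params and locals now use the refined types; also update the
    // function's declared signature to match.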
    refinedFunc->type = HeapType(Signature(refinedParams, func->getResults()));

    // Assume we'll choose to use the refined target, but if we are being
    // careful then we might change our mind.
    auto chosenTarget = refinedTarget;
    if (onlyWhenHelpful) {
      // Optimize both functions using minimal opts, hopefully enough to see if
      // there is a benefit to the refined types (such as the new types allowing
      // a cast to be removed).
      // TODO: At the moment this can happen many times per function, as it is
      //       done once per function and per set of types sent to it. Perhaps
      //       have some total limit to avoid slow runtimes.
      // TODO: We can end up optimizing |func| more than once. It may be
      //       different each time if the previous optimization helped, but
      //       often it will be identical. We could save the original version
      //       and use that as the starting point here (and cache the optimized
      //       version), but then we'd be throwing away optimization results. Or
      //       we could see if later optimizations do not further decrease the
      //       cost, and if so, use a cached value for the cost on such
      //       "already maximally optimized" functions. The former approach is
      //       more amenable to parallelization, as it avoids path dependence -
      //       the other approaches are deterministic but they depend on the
      //       order in which we see things. But it does require saving a copy
      //       of the function, which uses memory, which is avoided if we just
      //       keep optimizing from the current contents as we go. It's not
      //       obvious which approach is best here.
      doMinimalOpts(func);
      doMinimalOpts(refinedFunc);

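      // Compare static cost estimates of the optimized bodies; keep the
      // refined copy only if it ends up strictly cheaper.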
      auto costBefore = CostAnalyzer(func->body).cost;
      auto costAfter = CostAnalyzer(refinedFunc->body).cost;
      if (costAfter >= costBefore) {
        // We failed to improve. Remove the new function and return the old
        // target.
        module->removeFunction(refinedTarget);
        chosenTarget = target;
      }
    }

    // Mark the chosen target in the map, so we don't do this work again: every
    // pair of target and refinedParams is only considered once.
    funcParamMap[{target, refinedParams}] = chosenTarget;

    return chosenTarget;
  }

  // Run minimal function-level optimizations on a function. This optimizes at
  // -O1, which is very fast and runs in roughly linear time, so it should be
  // reasonable to run as part of this pass: -O1 is several times faster than a
  // full -O2 in particular, and so if we run this as part of -O2 we should not
  // be making it much slower overall.
  // TODO: Perhaps we don't need all of -O1, and can focus on specific things we
  //       expect to help. That would be faster, but we'd always run the risk of
  //       missing things, especially as new passes are added later and we don't
  //       think to add them here.
  //       Alternatively, perhaps we should have a mode that does use -O1 or
  //       even -O2 or above, as in theory any optimization could end up
  //       mattering a lot here.
  void doMinimalOpts(Function* func) {
    PassRunner runner(getPassRunner());
    runner.options.optimizeLevel = 1;
    // Local subtyping is not run in -O1, but we really do want it here since
    // the entire point is that parameters now have more refined types, which
    // can lead to locals reading them being refinable as well.
    runner.add("local-subtyping");
    runner.addDefaultFunctionOptimizationPasses();
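    // Mark the runner as nested, since we are running it from within another
    // pass rather than at the top level.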
    runner.setIsNested(true);
    runner.runOnFunction(func);
  }

  // Maps [func name, param types] to the name of a new function whose params
  // have those types.
  //
  // Note that this can contain an entry {A, types} => A, that is, one that
  // maps a function name to itself. That indicates we found no benefit from
  // refining with those particular types, and saves us from computing it
  // again later on.
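  //
  // (Using std::pair as the key relies on a hash for pairs, which the
  // standard library does not provide; a suitable std::hash specialization is
  // assumed to come from this project's support headers.)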
  std::unordered_map<std::pair<Name, Type>, Name> funcParamMap;
};

} // anonymous namespace

Pass* createMonomorphizePass() { return new Monomorphize(true); }

Pass* createMonomorphizeAlwaysPass() { return new Monomorphize(false); }

} // namespace wasm