1 files changed, 538 insertions, 0 deletions
diff --git a/src/ir/possible-contents.h b/src/ir/possible-contents.h
new file mode 100644
index 000000000..38bc263e9
--- /dev/null
+++ b/src/ir/possible-contents.h
@@ -0,0 +1,538 @@
+/*
+ * Copyright 2022 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef wasm_ir_possible_contents_h
+#define wasm_ir_possible_contents_h
+
+#include <variant>
+
+#include "ir/possible-constant.h"
+#include "ir/subtypes.h"
+#include "support/small_vector.h"
+#include "wasm-builder.h"
+#include "wasm.h"
+
+namespace wasm {
+
+//
+// PossibleContents represents the possible contents at a particular location
+// (such as in a local or in a function parameter). This is a little similar to
+// PossibleConstantValues, but considers more types of contents than constant
+// values - in particular, it can track types to some extent.
+//
+// The specific contents this can vary over are:
+//
+//  * None:            No possible value.
+//
+//  * Literal:         One possible constant value like an i32 of 42.
+//
+//  * Global:          The name of a global whose value is here. We do not know
+//                     the actual value at compile time, but we know it is equal
+//                     to that global. Typically we can only infer this for
+//                     immutable globals.
+//
+//  * ExactType:       Any possible value of a specific exact type - *not*
+//                     including subtypes. For example, (struct.new $Foo) has
+//                     ExactType contents of type $Foo.
+//                     If the type here is nullable then null is also allowed.
+//                     TODO: Add ConeType, which would include subtypes.
+//                     TODO: Add ExactTypePlusContents or such, which would be
+//                           used on e.g. a struct.new with an immutable field
+//                           to which we assign a constant: not only do we know
+//                           the exact type, but also certain field's values.
+//
+//  * Many:            Anything else. Many things are possible here, and we do
+//                     not track what they might be, so we must assume the worst
+//                     in the calling code.
+//
+class PossibleContents {
+  struct None : public std::monostate {};
+
+  struct GlobalInfo {
+    Name name;
+    // The type of the global in the module. We stash this here so that we do
+    // not need to pass around a module all the time.
+    // TODO: could we save size in this variant if we did pass around the
+    //       module?
+    Type type;
+    bool operator==(const GlobalInfo& other) const {
+      return name == other.name && type == other.type;
+    }
+  };
+
+  using ExactType = Type;
+
+  struct Many : public std::monostate {};
+
+  // TODO: This is similar to the variant in PossibleConstantValues, and perhaps
+  //       we could share code, but extending a variant using template magic may
+  //       not be worthwhile. Another option might be to make PCV inherit from
+  //       this and disallow ExactType etc., but PCV might get slower.
+  using Variant = std::variant<None, Literal, GlobalInfo, ExactType, Many>;
+  Variant value;
+
+public:
+  PossibleContents() : value(None()) {}
+  PossibleContents(const PossibleContents& other) = default;
+
+  template<typename T> explicit PossibleContents(T val) : value(val) {}
+
+  // Most users will use one of the following static functions to construct a
+  // new instance:
+
+  static PossibleContents none() { return PossibleContents{None()}; }
+  static PossibleContents literal(Literal c) { return PossibleContents{c}; }
+  static PossibleContents global(Name name, Type type) {
+    return PossibleContents{GlobalInfo{name, type}};
+  }
+  static PossibleContents exactType(Type type) {
+    return PossibleContents{ExactType(type)};
+  }
+  static PossibleContents many() { return PossibleContents{Many()}; }
+
+  PossibleContents& operator=(const PossibleContents& other) = default;
+
+  bool operator==(const PossibleContents& other) const {
+    return value == other.value;
+  }
+
+  bool operator!=(const PossibleContents& other) const {
+    return !(*this == other);
+  }
+
+  // Combine the information in a given PossibleContents to this one. The
+  // contents here will then include whatever content was possible in |other|.
+  void combine(const PossibleContents& other);
+
+  bool isNone() const { return std::get_if<None>(&value); }
+  bool isLiteral() const { return std::get_if<Literal>(&value); }
+  bool isGlobal() const { return std::get_if<GlobalInfo>(&value); }
+  bool isExactType() const { return std::get_if<Type>(&value); }
+  bool isMany() const { return std::get_if<Many>(&value); }
+
+  Literal getLiteral() const {
+    assert(isLiteral());
+    return std::get<Literal>(value);
+  }
+
+  Name getGlobal() const {
+    assert(isGlobal());
+    return std::get<GlobalInfo>(value).name;
+  }
+
+  bool isNull() const { return isLiteral() && getLiteral().isNull(); }
+
+  // Return the relevant type here. Note that the *meaning* of the type varies
+  // by the contents: type $foo of a global means that type or any subtype, as a
+  // subtype might be written to it, while type $foo of a Literal or an
+  // ExactType means that type and nothing else; see hasExactType().
+  //
+  // If no type is possible, return unreachable; if many types are, return none.
+  Type getType() const {
+    if (auto* literal = std::get_if<Literal>(&value)) {
+      return literal->type;
+    } else if (auto* global = std::get_if<GlobalInfo>(&value)) {
+      return global->type;
+    } else if (auto* type = std::get_if<Type>(&value)) {
+      return *type;
+    } else if (std::get_if<None>(&value)) {
+      return Type::unreachable;
+    } else if (std::get_if<Many>(&value)) {
+      return Type::none;
+    } else {
+      WASM_UNREACHABLE("bad value");
+    }
+  }
+
+  // Returns whether the type we can report here is exact, that is, nothing of a
+  // strict subtype might show up - the contents here have an exact type.
+  //
+  // This is different from isExactType() which checks if all we know about the
+  // contents here is their exact type. Specifically, we may know both an exact
+  // type and also more than just that, which is the case with a Literal.
+  //
+  // This returns false for None and Many, for whom it is not well-defined.
+  bool hasExactType() const { return isExactType() || isLiteral(); }
+
+  // Whether we can make an Expression* for this containing the proper contents.
+  // We can do that for a Literal (emitting a Const or RefFunc etc.) or a
+  // Global (emitting a GlobalGet), but not for anything else yet.
+  bool canMakeExpression() const { return isLiteral() || isGlobal(); }
+
+  Expression* makeExpression(Module& wasm) {
+    assert(canMakeExpression());
+    Builder builder(wasm);
+    if (isLiteral()) {
+      return builder.makeConstantExpression(getLiteral());
+    } else {
+      auto name = getGlobal();
+      return builder.makeGlobalGet(name, wasm.getGlobal(name)->type);
+    }
+  }
+
+  size_t hash() const {
+    // Encode this using three bits for the variant type, then the rest of the
+    // contents.
+    if (isNone()) {
+      return 0;
+    } else if (isLiteral()) {
+      return size_t(1) | (std::hash<Literal>()(getLiteral()) << 3);
+    } else if (isGlobal()) {
+      return size_t(2) | (std::hash<Name>()(getGlobal()) << 3);
+    } else if (isExactType()) {
+      return size_t(3) | (std::hash<Type>()(getType()) << 3);
+    } else if (isMany()) {
+      return 4;
+    } else {
+      WASM_UNREACHABLE("bad variant");
+    }
+  }
+
+  void dump(std::ostream& o, Module* wasm = nullptr) const {
+    o << '[';
+    if (isNone()) {
+      o << "None";
+    } else if (isLiteral()) {
+      o << "Literal " << getLiteral();
+      auto t = getType();
+      if (t.isRef()) {
+        auto h = t.getHeapType();
+        o << " HT: " << h;
+      }
+    } else if (isGlobal()) {
+      o << "GlobalInfo $" << getGlobal();
+    } else if (isExactType()) {
+      o << "ExactType " << getType();
+      auto t = getType();
+      if (t.isRef()) {
+        auto h = t.getHeapType();
+        o << " HT: " << h;
+        if (wasm && wasm->typeNames.count(h)) {
+          o << " $" << wasm->typeNames[h].name;
+        }
+        if (t.isNullable()) {
+          o << " null";
+        }
+      }
+    } else if (isMany()) {
+      o << "Many";
+    } else {
+      WASM_UNREACHABLE("bad variant");
+    }
+    o << ']';
+  }
+};
+
+// The various *Location structs (ExpressionLocation, ResultLocation, etc.)
+// describe particular locations where content can appear.
+
+// The location of a specific IR expression.
+struct ExpressionLocation {
+  Expression* expr;
+  // If this expression contains a tuple then each index in the tuple will have
+  // its own location with a corresponding tupleIndex. If this is not a tuple
+  // then we only use tupleIndex 0.
+  Index tupleIndex;
+  bool operator==(const ExpressionLocation& other) const {
+    return expr == other.expr && tupleIndex == other.tupleIndex;
+  }
+};
+
+// The location of one of the results of a function.
+struct ResultLocation {
+  Function* func;
+  Index index;
+  bool operator==(const ResultLocation& other) const {
+    return func == other.func && index == other.index;
+  }
+};
+
+// The location of one of the locals in a function (either a param or a var).
+// TODO: would separating params from vars help? (SSA might be enough)
+struct LocalLocation {
+  Function* func;
+  // The index of the local.
+  Index index;
+  // As in ExpressionLocation, the index inside the tuple, or 0 if not a tuple.
+  Index tupleIndex;
+  bool operator==(const LocalLocation& other) const {
+    return func == other.func && index == other.index &&
+           tupleIndex == other.tupleIndex;
+  }
+};
+
+// The location of a break target in a function, identified by its name.
+struct BreakTargetLocation {
+  Function* func;
+  Name target;
+  // As in ExpressionLocation, the index inside the tuple, or 0 if not a tuple.
+  // That is, if the branch target has a tuple type, then each branch to that
+  // location sends a tuple, and we'll have a separate BreakTargetLocation for
+  // each, indexed by the index in the tuple that the branch sends.
+  Index tupleIndex;
+  bool operator==(const BreakTargetLocation& other) const {
+    return func == other.func && target == other.target &&
+           tupleIndex == other.tupleIndex;
+  }
+};
+
+// The location of a global in the module.
+struct GlobalLocation {
+  Name name;
+  bool operator==(const GlobalLocation& other) const {
+    return name == other.name;
+  }
+};
+
+// The location of one of the parameters in a function signature.
+struct SignatureParamLocation {
+  HeapType type;
+  Index index;
+  bool operator==(const SignatureParamLocation& other) const {
+    return type == other.type && index == other.index;
+  }
+};
+
+// The location of one of the results in a function signature.
+struct SignatureResultLocation {
+  HeapType type;
+  Index index;
+  bool operator==(const SignatureResultLocation& other) const {
+    return type == other.type && index == other.index;
+  }
+};
+
+// The location of contents in a struct or array (i.e., things that can fit in a
+// dataref). Note that this is specific to this type - it does not include data
+// about subtypes or supertypes.
+struct DataLocation {
+  HeapType type;
+  // The index of the field in a struct, or 0 for an array (where we do not
+  // attempt to differentiate by index).
+  Index index;
+  bool operator==(const DataLocation& other) const {
+    return type == other.type && index == other.index;
+  }
+};
+
+// The location of anything written to a particular tag.
+struct TagLocation {
+  Name tag;
+  // If the tag has more than one element, we'll have a separate TagLocation for
+  // each, with corresponding indexes. If the tag has just one element we'll
+  // only have one TagLocation with index 0.
+  Index tupleIndex;
+  bool operator==(const TagLocation& other) const {
+    return tag == other.tag && tupleIndex == other.tupleIndex;
+  }
+};
+
+// A null value. This is used as the location of the default value of a var in a
+// function, a null written to a struct field in struct.new_with_default, etc.
+struct NullLocation {
+  Type type;
+  bool operator==(const NullLocation& other) const {
+    return type == other.type;
+  }
+};
+
+// A special type of location that does not refer to something concrete in the
+// wasm, but is used to optimize the graph. A "cone read" is a struct.get or
+// array.get of a type that is not exact, so it can read the "cone" of all the
+// subtypes. In general a read of a cone type (as opposed to an exact type) will
+// require N incoming links, from each of the N subtypes - and we need that
+// for each struct.get of a cone. If there are M such gets then we have N * M
+// edges for this. Instead, we make a single canonical "cone read" location, and
+// add a single link to it from each get, which is only N + M (plus the cost
+// of adding "latency" in requiring an additional step along the way for the
+// data to flow along).
+struct ConeReadLocation {
+  HeapType type;
+  // The index of the field in a struct, or 0 for an array (where we do not
+  // attempt to differentiate by index).
+  Index index;
+  bool operator==(const ConeReadLocation& other) const {
+    return type == other.type && index == other.index;
+  }
+};
+
+// A location is a variant over all the possible flavors of locations that we
+// have.
+using Location = std::variant<ExpressionLocation,
+                              ResultLocation,
+                              LocalLocation,
+                              BreakTargetLocation,
+                              GlobalLocation,
+                              SignatureParamLocation,
+                              SignatureResultLocation,
+                              DataLocation,
+                              TagLocation,
+                              NullLocation,
+                              ConeReadLocation>;
+
+} // namespace wasm
+
+namespace std {
+
+std::ostream& operator<<(std::ostream& stream,
+                         const wasm::PossibleContents& contents);
+
+template<> struct hash<wasm::PossibleContents> {
+  size_t operator()(const wasm::PossibleContents& contents) const {
+    return contents.hash();
+  }
+};
+
+// Define hashes of all the *Location flavors so that Location itself is
+// hashable and we can use it in unordered maps and sets.
+
+template<> struct hash<wasm::ExpressionLocation> {
+  size_t operator()(const wasm::ExpressionLocation& loc) const {
+    return std::hash<std::pair<size_t, wasm::Index>>{}(
+      {size_t(loc.expr), loc.tupleIndex});
+  }
+};
+
+template<> struct hash<wasm::ResultLocation> {
+  size_t operator()(const wasm::ResultLocation& loc) const {
+    return std::hash<std::pair<size_t, wasm::Index>>{}(
+      {size_t(loc.func), loc.index});
+  }
+};
+
+template<> struct hash<wasm::LocalLocation> {
+  size_t operator()(const wasm::LocalLocation& loc) const {
+    return std::hash<std::pair<size_t, std::pair<wasm::Index, wasm::Index>>>{}(
+      {size_t(loc.func), {loc.index, loc.tupleIndex}});
+  }
+};
+
+template<> struct hash<wasm::BreakTargetLocation> {
+  size_t operator()(const wasm::BreakTargetLocation& loc) const {
+    return std::hash<std::pair<size_t, std::pair<wasm::Name, wasm::Index>>>{}(
+      {size_t(loc.func), {loc.target, loc.tupleIndex}});
+  }
+};
+
+template<> struct hash<wasm::GlobalLocation> {
+  size_t operator()(const wasm::GlobalLocation& loc) const {
+    return std::hash<wasm::Name>{}(loc.name);
+  }
+};
+
+template<> struct hash<wasm::SignatureParamLocation> {
+  size_t operator()(const wasm::SignatureParamLocation& loc) const {
+    return std::hash<std::pair<wasm::HeapType, wasm::Index>>{}(
+      {loc.type, loc.index});
+  }
+};
+
+template<> struct hash<wasm::SignatureResultLocation> {
+  size_t operator()(const wasm::SignatureResultLocation& loc) const {
+    return std::hash<std::pair<wasm::HeapType, wasm::Index>>{}(
+      {loc.type, loc.index});
+  }
+};
+
+template<> struct hash<wasm::DataLocation> {
+  size_t operator()(const wasm::DataLocation& loc) const {
+    return std::hash<std::pair<wasm::HeapType, wasm::Index>>{}(
+      {loc.type, loc.index});
+  }
+};
+
+template<> struct hash<wasm::TagLocation> {
+  size_t operator()(const wasm::TagLocation& loc) const {
+    return std::hash<std::pair<wasm::Name, wasm::Index>>{}(
+      {loc.tag, loc.tupleIndex});
+  }
+};
+
+template<> struct hash<wasm::NullLocation> {
+  size_t operator()(const wasm::NullLocation& loc) const {
+    return std::hash<wasm::Type>{}(loc.type);
+  }
+};
+
+template<> struct hash<wasm::ConeReadLocation> {
+  size_t operator()(const wasm::ConeReadLocation& loc) const {
+    return std::hash<std::pair<wasm::HeapType, wasm::Index>>{}(
+      {loc.type, loc.index});
+  }
+};
+
+} // namespace std
+
+namespace wasm {
+
+// Analyze the entire wasm file to find which contents are possible in which
+// locations. This assumes a closed world and starts from roots - newly created
+// values - and propagates them to the locations they reach. After the
+// analysis the user of this class can ask which contents are possible at any
+// location.
+//
+// This focuses on useful information for the typical user of this API.
+// Specifically, we find out:
+//
+//  1. What locations have no content reaching them at all. That means the code
+//     is unreachable. (Other passes may handle this, but ContentOracle does it
+//     for all things, so it might catch situations other passes do not cover;
+//     and, it takes no effort to support this here).
+//  2. For all locations, we try to find when they must contain a constant value
+//     like i32(42) or ref.func(foo).
+//  3. For locations that contain references, information about the subtypes
+//     possible there. For example, if something has wasm type anyref in the IR,
+//     we might find it must contain an exact type of something specific.
+//
+// Note that there is not much use in providing type info for locations that are
+// *not* references. If a local is i32, for example, then it cannot contain any
+// subtype anyhow, since i32 is not a reference and has no subtypes. And we know
+// the type i32 from the wasm anyhow, that is, the caller will know it.
+// Therefore the only useful information we can provide on top of the info
+// already in the wasm is either that nothing can be there (1, above), or that a
+// constant must be there (2, above), and so we do not make an effort to track
+// non-reference types here. This makes the internals of ContentOracle simpler
+// and faster. A noticeable outcome of that is that querying the contents of an
+// i32 local will return Many and not ExactType{i32} (assuming we could not
+// infer either that there must be nothing there, or a constant). Again, the
+// caller is assumed to know the wasm IR type anyhow, and also other
+// optimization passes work on the types in the IR, so we do not focus on that
+// here.
+class ContentOracle {
+  Module& wasm;
+
+  void analyze();
+
+public:
+  ContentOracle(Module& wasm) : wasm(wasm) { analyze(); }
+
+  // Get the contents possible at a location.
+  PossibleContents getContents(Location location) {
+    auto iter = locationContents.find(location);
+    if (iter == locationContents.end()) {
+      // We know of no possible contents here.
+      return PossibleContents::none();
+    }
+    return iter->second;
+  }
+
+private:
+  std::unordered_map<Location, PossibleContents> locationContents;
+};
+
+} // namespace wasm
+
+#endif // wasm_ir_possible_contents_h