PossibleContents + ContentOracle (#4685)

This pulls out the core PossibleContents and ContentOracle classes from the very large #4598, making a smaller PR that can be reviewed first. This includes unit tests for the code, but comprehensive testing will only appear in the later PR, when a new pass is added that uses all this. PossibleContents tracks the possible contents at particular locations in the program. It can track constant values as well as "this must contain this exact type", which is more than wasm itself can indicate. *Location structs are provided to declare locations in the wasm, like the location of a local or of a function argument. ContentOracle analyzes the entire program, and can then map a Location to the PossibleContents there, which a later pass will use to optimize.
author: Alon Zakai <azakai@google.com> 2022-06-21 16:26:03 -0700
committer: GitHub <noreply@github.com> 2022-06-21 16:26:03 -0700
commit: 7fa4c0841c31930759fbad2efb8ada3ef0e6f3ef (patch)
tree: 2960d25f343d4435415cb97cafaaf994e8651fbb /src/ir/possible-contents.h
parent: 6f37355ae24449c178abcdb08294d0fb3ef5f50c (diff)
download: binaryen-7fa4c0841c31930759fbad2efb8ada3ef0e6f3ef.tar.gz
binaryen-7fa4c0841c31930759fbad2efb8ada3ef0e6f3ef.tar.bz2
binaryen-7fa4c0841c31930759fbad2efb8ada3ef0e6f3ef.zip
1 files changed, 538 insertions, 0 deletions
diff --git a/src/ir/possible-contents.h b/src/ir/possible-contents.h
new file mode 100644
index 000000000..38bc263e9
--- /dev/null
+++ b/src/ir/possible-contents.h
@@ -0,0 +1,538 @@
+/*
+ * Copyright 2022 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef wasm_ir_possible_contents_h
+#define wasm_ir_possible_contents_h
+
+#include <variant>
+
+#include "ir/possible-constant.h"
+#include "ir/subtypes.h"
+#include "support/small_vector.h"
+#include "wasm-builder.h"
+#include "wasm.h"
+
+namespace wasm {
+
+//
+// PossibleContents represents the possible contents at a particular location
+// (such as in a local or in a function parameter). This is a little similar to
+// PossibleConstantValues, but considers more types of contents than constant
+// values - in particular, it can track types to some extent.
+//
+// The specific contents this can vary over are:
+//
+//  * None:            No possible value.
+//
+//  * Literal:         One possible constant value like an i32 of 42.
+//
+//  * Global:          The name of a global whose value is here. We do not know
+//                     the actual value at compile time, but we know it is equal
+//                     to that global. Typically we can only infer this for
+//                     immutable globals.
+//
+//  * ExactType:       Any possible value of a specific exact type - *not*
+//                     including subtypes. For example, (struct.new $Foo) has
+//                     ExactType contents of type $Foo.
+//                     If the type here is nullable then null is also allowed.
+//                     TODO: Add ConeType, which would include subtypes.
+//                     TODO: Add ExactTypePlusContents or such, which would be
+//                           used on e.g. a struct.new with an immutable field
+//                           to which we assign a constant: not only do we know
+//                           the exact type, but also certain field's values.
+//
+//  * Many:            Anything else. Many things are possible here, and we do
+//                     not track what they might be, so we must assume the worst
+//                     in the calling code.
+//
+class PossibleContents {
+  struct None : public std::monostate {};
+
+  struct GlobalInfo {
+    Name name;
+    // The type of the global in the module. We stash this here so that we do
+    // not need to pass around a module all the time.
+    // TODO: could we save size in this variant if we did pass around the
+    //       module?
+    Type type;
+    bool operator==(const GlobalInfo& other) const {
+      return name == other.name && type == other.type;
+    }
+  };
+
+  using ExactType = Type;
+
+  struct Many : public std::monostate {};
+
+  // TODO: This is similar to the variant in PossibleConstantValues, and perhaps
+  //       we could share code, but extending a variant using template magic may
+  //       not be worthwhile. Another option might be to make PCV inherit from
+  //       this and disallow ExactType etc., but PCV might get slower.
+  using Variant = std::variant<None, Literal, GlobalInfo, ExactType, Many>;
+  Variant value;
+
+public:
+  PossibleContents() : value(None()) {}
+  PossibleContents(const PossibleContents& other) = default;
+
+  template<typename T> explicit PossibleContents(T val) : value(val) {}
+
+  // Most users will use one of the following static functions to construct a
+  // new instance:
+
+  static PossibleContents none() { return PossibleContents{None()}; }
+  static PossibleContents literal(Literal c) { return PossibleContents{c}; }
+  static PossibleContents global(Name name, Type type) {
+    return PossibleContents{GlobalInfo{name, type}};
+  }
+  static PossibleContents exactType(Type type) {
+    return PossibleContents{ExactType(type)};
+  }
+  static PossibleContents many() { return PossibleContents{Many()}; }
+
+  PossibleContents& operator=(const PossibleContents& other) = default;
+
+  bool operator==(const PossibleContents& other) const {
+    return value == other.value;
+  }
+
+  bool operator!=(const PossibleContents& other) const {
+    return !(*this == other);
+  }
+
+  // Combine the information in a given PossibleContents to this one. The
+  // contents here will then include whatever content was possible in |other|.
+  void combine(const PossibleContents& other);
+
+  bool isNone() const { return std::get_if<None>(&value); }
+  bool isLiteral() const { return std::get_if<Literal>(&value); }
+  bool isGlobal() const { return std::get_if<GlobalInfo>(&value); }
+  bool isExactType() const { return std::get_if<Type>(&value); }
+  bool isMany() const { return std::get_if<Many>(&value); }
+
+  Literal getLiteral() const {
+    assert(isLiteral());
+    return std::get<Literal>(value);
+  }
+
+  Name getGlobal() const {
+    assert(isGlobal());
+    return std::get<GlobalInfo>(value).name;
+  }
+
+  bool isNull() const { return isLiteral() && getLiteral().isNull(); }
+
+  // Return the relevant type here. Note that the *meaning* of the type varies
+  // by the contents: type $foo of a global means that type or any subtype, as a
+  // subtype might be written to it, while type $foo of a Literal or an
+  // ExactType means that type and nothing else; see hasExactType().
+  //
+  // If no type is possible, return unreachable; if many types are, return none.
+  Type getType() const {
+    if (auto* literal = std::get_if<Literal>(&value)) {
+      return literal->type;
+    } else if (auto* global = std::get_if<GlobalInfo>(&value)) {
+      return global->type;
+    } else if (auto* type = std::get_if<Type>(&value)) {
+      return *type;
+    } else if (std::get_if<None>(&value)) {
+      return Type::unreachable;
+    } else if (std::get_if<Many>(&value)) {
+      return Type::none;
+    } else {
+      WASM_UNREACHABLE("bad value");
+    }
+  }
+
+  // Returns whether the type we can report here is exact, that is, nothing of a
+  // strict subtype might show up - the contents here have an exact type.
+  //
+  // This is different from isExactType() which checks if all we know about the
+  // contents here is their exact type. Specifically, we may know both an exact
+  // type and also more than just that, which is the case with a Literal.
+  //
+  // This returns false for None and Many, for whom it is not well-defined.
+  bool hasExactType() const { return isExactType() || isLiteral(); }
+
+  // Whether we can make an Expression* for this containing the proper contents.
+  // We can do that for a Literal (emitting a Const or RefFunc etc.) or a
+  // Global (emitting a GlobalGet), but not for anything else yet.
+  bool canMakeExpression() const { return isLiteral() || isGlobal(); }
+
+  Expression* makeExpression(Module& wasm) {
+    assert(canMakeExpression());
+    Builder builder(wasm);
+    if (isLiteral()) {
+      return builder.makeConstantExpression(getLiteral());
+    } else {
+      auto name = getGlobal();
+      return builder.makeGlobalGet(name, wasm.getGlobal(name)->type);
+    }
+  }
+
+  size_t hash() const {
+    // Encode this using three bits for the variant type, then the rest of the
+    // contents.
+    if (isNone()) {
+      return 0;
+    } else if (isLiteral()) {
+      return size_t(1) | (std::hash<Literal>()(getLiteral()) << 3);
+    } else if (isGlobal()) {
+      return size_t(2) | (std::hash<Name>()(getGlobal()) << 3);
+    } else if (isExactType()) {
+      return size_t(3) | (std::hash<Type>()(getType()) << 3);
+    } else if (isMany()) {
+      return 4;
+    } else {
+      WASM_UNREACHABLE("bad variant");
+    }
+  }
+
+  void dump(std::ostream& o, Module* wasm = nullptr) const {
+    o << '[';
+    if (isNone()) {
+      o << "None";
+    } else if (isLiteral()) {
+      o << "Literal " << getLiteral();
+      auto t = getType();
+      if (t.isRef()) {
+        auto h = t.getHeapType();
+        o << " HT: " << h;
+      }
+    } else if (isGlobal()) {
+      o << "GlobalInfo $" << getGlobal();
+    } else if (isExactType()) {
+      o << "ExactType " << getType();
+      auto t = getType();
+      if (t.isRef()) {
+        auto h = t.getHeapType();
+        o << " HT: " << h;
+        if (wasm && wasm->typeNames.count(h)) {
+          o << " $" << wasm->typeNames[h].name;
+        }
+        if (t.isNullable()) {
+          o << " null";
+        }
+      }
+    } else if (isMany()) {
+      o << "Many";
+    } else {
+      WASM_UNREACHABLE("bad variant");
+    }
+    o << ']';
+  }
+};
+
+// The various *Location structs (ExpressionLocation, ResultLocation, etc.)
+// describe particular locations where content can appear.
+
+// The location of a specific IR expression.
+struct ExpressionLocation {
+  Expression* expr;
+  // If this expression contains a tuple then each index in the tuple will have
+  // its own location with a corresponding tupleIndex. If this is not a tuple
+  // then we only use tupleIndex 0.
+  Index tupleIndex;
+  bool operator==(const ExpressionLocation& other) const {
+    return expr == other.expr && tupleIndex == other.tupleIndex;
+  }
+};
+
+// The location of one of the results of a function.
+struct ResultLocation {
+  Function* func;
+  Index index;
+  bool operator==(const ResultLocation& other) const {
+    return func == other.func && index == other.index;
+  }
+};
+
+// The location of one of the locals in a function (either a param or a var).
+// TODO: would separating params from vars help? (SSA might be enough)
+struct LocalLocation {
+  Function* func;
+  // The index of the local.
+  Index index;
+  // As in ExpressionLocation, the index inside the tuple, or 0 if not a tuple.
+  Index tupleIndex;
+  bool operator==(const LocalLocation& other) const {
+    return func == other.func && index == other.index &&
+           tupleIndex == other.tupleIndex;
+  }
+};
+
+// The location of a break target in a function, identified by its name.
+struct BreakTargetLocation {
+  Function* func;
+  Name target;
+  // As in ExpressionLocation, the index inside the tuple, or 0 if not a tuple.
+  // That is, if the branch target has a tuple type, then each branch to that
+  // location sends a tuple, and we'll have a separate BreakTargetLocation for
+  // each, indexed by the index in the tuple that the branch sends.
+  Index tupleIndex;
+  bool operator==(const BreakTargetLocation& other) const {
+    return func == other.func && target == other.target &&
+           tupleIndex == other.tupleIndex;
+  }
+};
+
+// The location of a global in the module.
+struct GlobalLocation {
+  Name name;
+  bool operator==(const GlobalLocation& other) const {
+    return name == other.name;
+  }
+};
+
+// The location of one of the parameters in a function signature.
+struct SignatureParamLocation {
+  HeapType type;
+  Index index;
+  bool operator==(const SignatureParamLocation& other) const {
+    return type == other.type && index == other.index;
+  }
+};
+
+// The location of one of the results in a function signature.
+struct SignatureResultLocation {
+  HeapType type;
+  Index index;
+  bool operator==(const SignatureResultLocation& other) const {
+    return type == other.type && index == other.index;
+  }
+};
+
+// The location of contents in a struct or array (i.e., things that can fit in a
+// dataref). Note that this is specific to this type - it does not include data
+// about subtypes or supertypes.
+struct DataLocation {
+  HeapType type;
+  // The index of the field in a struct, or 0 for an array (where we do not
+  // attempt to differentiate by index).
+  Index index;
+  bool operator==(const DataLocation& other) const {
+    return type == other.type && index == other.index;
+  }
+};
+
+// The location of anything written to a particular tag.
+struct TagLocation {
+  Name tag;
+  // If the tag has more than one element, we'll have a separate TagLocation for
+  // each, with corresponding indexes. If the tag has just one element we'll
+  // only have one TagLocation with index 0.
+  Index tupleIndex;
+  bool operator==(const TagLocation& other) const {
+    return tag == other.tag && tupleIndex == other.tupleIndex;
+  }
+};
+
+// A null value. This is used as the location of the default value of a var in a
+// function, a null written to a struct field in struct.new_with_default, etc.
+struct NullLocation {
+  Type type;
+  bool operator==(const NullLocation& other) const {
+    return type == other.type;
+  }
+};
+
+// A special type of location that does not refer to something concrete in the
+// wasm, but is used to optimize the graph. A "cone read" is a struct.get or
+// array.get of a type that is not exact, so it can read the "cone" of all the
+// subtypes. In general a read of a cone type (as opposed to an exact type) will
+// require N incoming links, from each of the N subtypes - and we need that
+// for each struct.get of a cone. If there are M such gets then we have N * M
+// edges for this. Instead, we make a single canonical "cone read" location, and
+// add a single link to it from each get, which is only N + M (plus the cost
+// of adding "latency" in requiring an additional step along the way for the
+// data to flow along).
+struct ConeReadLocation {
+  HeapType type;
+  // The index of the field in a struct, or 0 for an array (where we do not
+  // attempt to differentiate by index).
+  Index index;
+  bool operator==(const ConeReadLocation& other) const {
+    return type == other.type && index == other.index;
+  }
+};
+
+// A location is a variant over all the possible flavors of locations that we
+// have.
+using Location = std::variant<ExpressionLocation,
+                              ResultLocation,
+                              LocalLocation,
+                              BreakTargetLocation,
+                              GlobalLocation,
+                              SignatureParamLocation,
+                              SignatureResultLocation,
+                              DataLocation,
+                              TagLocation,
+                              NullLocation,
+                              ConeReadLocation>;
+
+} // namespace wasm
+
+namespace std {
+
+std::ostream& operator<<(std::ostream& stream,
+                         const wasm::PossibleContents& contents);
+
+template<> struct hash<wasm::PossibleContents> {
+  size_t operator()(const wasm::PossibleContents& contents) const {
+    return contents.hash();
+  }
+};
+
+// Define hashes of all the *Location flavors so that Location itself is
+// hashable and we can use it in unordered maps and sets.
+
+template<> struct hash<wasm::ExpressionLocation> {
+  size_t operator()(const wasm::ExpressionLocation& loc) const {
+    return std::hash<std::pair<size_t, wasm::Index>>{}(
+      {size_t(loc.expr), loc.tupleIndex});
+  }
+};
+
+template<> struct hash<wasm::ResultLocation> {
+  size_t operator()(const wasm::ResultLocation& loc) const {
+    return std::hash<std::pair<size_t, wasm::Index>>{}(
+      {size_t(loc.func), loc.index});
+  }
+};
+
+template<> struct hash<wasm::LocalLocation> {
+  size_t operator()(const wasm::LocalLocation& loc) const {
+    return std::hash<std::pair<size_t, std::pair<wasm::Index, wasm::Index>>>{}(
+      {size_t(loc.func), {loc.index, loc.tupleIndex}});
+  }
+};
+
+template<> struct hash<wasm::BreakTargetLocation> {
+  size_t operator()(const wasm::BreakTargetLocation& loc) const {
+    return std::hash<std::pair<size_t, std::pair<wasm::Name, wasm::Index>>>{}(
+      {size_t(loc.func), {loc.target, loc.tupleIndex}});
+  }
+};
+
+template<> struct hash<wasm::GlobalLocation> {
+  size_t operator()(const wasm::GlobalLocation& loc) const {
+    return std::hash<wasm::Name>{}(loc.name);
+  }
+};
+
+template<> struct hash<wasm::SignatureParamLocation> {
+  size_t operator()(const wasm::SignatureParamLocation& loc) const {
+    return std::hash<std::pair<wasm::HeapType, wasm::Index>>{}(
+      {loc.type, loc.index});
+  }
+};
+
+template<> struct hash<wasm::SignatureResultLocation> {
+  size_t operator()(const wasm::SignatureResultLocation& loc) const {
+    return std::hash<std::pair<wasm::HeapType, wasm::Index>>{}(
+      {loc.type, loc.index});
+  }
+};
+
+template<> struct hash<wasm::DataLocation> {
+  size_t operator()(const wasm::DataLocation& loc) const {
+    return std::hash<std::pair<wasm::HeapType, wasm::Index>>{}(
+      {loc.type, loc.index});
+  }
+};
+
+template<> struct hash<wasm::TagLocation> {
+  size_t operator()(const wasm::TagLocation& loc) const {
+    return std::hash<std::pair<wasm::Name, wasm::Index>>{}(
+      {loc.tag, loc.tupleIndex});
+  }
+};
+
+template<> struct hash<wasm::NullLocation> {
+  size_t operator()(const wasm::NullLocation& loc) const {
+    return std::hash<wasm::Type>{}(loc.type);
+  }
+};
+
+template<> struct hash<wasm::ConeReadLocation> {
+  size_t operator()(const wasm::ConeReadLocation& loc) const {
+    return std::hash<std::pair<wasm::HeapType, wasm::Index>>{}(
+      {loc.type, loc.index});
+  }
+};
+
+} // namespace std
+
+namespace wasm {
+
+// Analyze the entire wasm file to find which contents are possible in which
+// locations. This assumes a closed world and starts from roots - newly created
+// values - and propagates them to the locations they reach. After the
+// analysis the user of this class can ask which contents are possible at any
+// location.
+//
+// This focuses on useful information for the typical user of this API.
+// Specifically, we find out:
+//
+//  1. What locations have no content reaching them at all. That means the code
+//     is unreachable. (Other passes may handle this, but ContentOracle does it
+//     for all things, so it might catch situations other passes do not cover;
+//     and, it takes no effort to support this here).
+//  2. For all locations, we try to find when they must contain a constant value
+//     like i32(42) or ref.func(foo).
+//  3. For locations that contain references, information about the subtypes
+//     possible there. For example, if something has wasm type anyref in the IR,
+//     we might find it must contain an exact type of something specific.
+//
+// Note that there is not much use in providing type info for locations that are
+// *not* references. If a local is i32, for example, then it cannot contain any
+// subtype anyhow, since i32 is not a reference and has no subtypes. And we know
+// the type i32 from the wasm anyhow, that is, the caller will know it.
+// Therefore the only useful information we can provide on top of the info
+// already in the wasm is either that nothing can be there (1, above), or that a
+// constant must be there (2, above), and so we do not make an effort to track
+// non-reference types here. This makes the internals of ContentOracle simpler
+// and faster. A noticeable outcome of that is that querying the contents of an
+// i32 local will return Many and not ExactType{i32} (assuming we could not
+// infer either that there must be nothing there, or a constant). Again, the
+// caller is assumed to know the wasm IR type anyhow, and also other
+// optimization passes work on the types in the IR, so we do not focus on that
+// here.
+class ContentOracle {
+  Module& wasm;
+
+  void analyze();
+
+public:
+  ContentOracle(Module& wasm) : wasm(wasm) { analyze(); }
+
+  // Get the contents possible at a location.
+  PossibleContents getContents(Location location) {
+    auto iter = locationContents.find(location);
+    if (iter == locationContents.end()) {
+      // We know of no possible contents here.
+      return PossibleContents::none();
+    }
+    return iter->second;
+  }
+
+private:
+  std::unordered_map<Location, PossibleContents> locationContents;
+};
+
+} // namespace wasm
+
+#endif // wasm_ir_possible_contents_h
author	Alon Zakai <azakai@google.com>	2022-06-21 16:26:03 -0700
committer	GitHub <noreply@github.com>	2022-06-21 16:26:03 -0700
commit	7fa4c0841c31930759fbad2efb8ada3ef0e6f3ef (patch)
tree	2960d25f343d4435415cb97cafaaf994e8651fbb /src/ir/possible-contents.h
parent	6f37355ae24449c178abcdb08294d0fb3ef5f50c (diff)
download	binaryen-7fa4c0841c31930759fbad2efb8ada3ef0e6f3ef.tar.gz binaryen-7fa4c0841c31930759fbad2efb8ada3ef0e6f3ef.tar.bz2 binaryen-7fa4c0841c31930759fbad2efb8ada3ef0e6f3ef.zip