[Parser][NFC] Create a public wat-lexer.h header (#4695)

wat-parser-internal.h was already quite large after implementing just the lexer, so it made sense to rename it to be lexer-specific and start a new file for the higher-level parser. Also make it a proper .cpp file and split the testable interface out into wat-lexer.h.
author: Thomas Lively <7121787+tlively@users.noreply.github.com> 2022-05-27 13:28:04 -0700
committer: GitHub <noreply@github.com> 2022-05-27 13:28:04 -0700
commit: 404bd2cc11cbc85c79b33a75142dbcf16d530a1b (patch)
tree: dde9c424d1c4c9291c06a1d029283ce28a29ce9b /src
parent: f21774f3865e506b6dea912af8f8be16fd29dacb (diff)
download: binaryen-404bd2cc11cbc85c79b33a75142dbcf16d530a1b.tar.gz
binaryen-404bd2cc11cbc85c79b33a75142dbcf16d530a1b.tar.bz2
binaryen-404bd2cc11cbc85c79b33a75142dbcf16d530a1b.zip
3 files changed, 283 insertions, 241 deletions
diff --git a/src/wasm/CMakeLists.txt b/src/wasm/CMakeLists.txt
index 20fc2e9fb..974f3cd10 100644
--- a/src/wasm/CMakeLists.txt
+++ b/src/wasm/CMakeLists.txt
@@ -12,6 +12,7 @@ set(wasm_SOURCES
   wasm-stack.cpp
   wasm-type.cpp
   wasm-validator.cpp
+  wat-lexer.cpp
   ${wasm_HEADERS}
 )
 # wasm-debug.cpp includes LLVM header using std::iterator (deprecated in C++17)
diff --git a/src/wasm/wat-parser-internal.h b/src/wasm/wat-lexer.cpp
index b2a270423..450d94a19 100644
--- a/src/wasm/wat-parser-internal.h
+++ b/src/wasm/wat-lexer.cpp
@@ -14,16 +14,6 @@
  * limitations under the License.
  */
 
-// Usage note
-// ----------
-//
-// This parser is a work in progress and this file should not yet be included
-// anywhere except for in its own tests. Once the parser is usable, we will add
-// wat-parser.h to declare the public parsing API and wat-parser.cpp to
-// implement the public parsing functions in terms of the private API in this
-// header. The private API will stay in this header rather than moving to
-// wat-parser.cpp so that we can continue to unit test it.
-
 #include <cassert>
 #include <cctype>
 #include <cmath>
@@ -32,6 +22,8 @@
 #include <sstream>
 #include <variant>
 
+#include "wat-lexer.h"
+
 using namespace std::string_view_literals;
 
 namespace wasm::WATParser {
@@ -106,8 +98,6 @@ public:
   void takeAll() { lexedSize = input.size(); }
 };
 
-enum Signedness { Unsigned, Signed };
-
 enum OverflowBehavior { DisallowOverflow, IgnoreOverflow };
 
 std::optional<int> getDigit(char c) {
@@ -786,258 +776,127 @@ std::optional<LexResult> keyword(std::string_view in) {
   return ctx.lexed();
 }
 
-// ======
-// Tokens
-// ======
-
-struct LParenTok {
-  friend std::ostream& operator<<(std::ostream& os, const LParenTok&) {
-    return os << "'('";
-  }
-
-  friend bool operator==(const LParenTok&, const LParenTok&) { return true; }
-};
-
-struct RParenTok {
-  friend std::ostream& operator<<(std::ostream& os, const RParenTok&) {
-    return os << "')'";
-  }
-
-  friend bool operator==(const RParenTok&, const RParenTok&) { return true; }
-};
-
-struct IntTok {
-  uint64_t n;
-  Signedness signedness;
-
-  friend std::ostream& operator<<(std::ostream& os, const IntTok& tok) {
-    return os << tok.n << (tok.signedness == Signed ? " signed" : " unsigned");
-  }
-
-  friend bool operator==(const IntTok& t1, const IntTok& t2) {
-    return t1.n == t2.n && t1.signedness == t2.signedness;
-  }
-};
-
-struct FloatTok {
-  // The payload if we lexed a nan with payload. We cannot store the payload
-  // directly in `d` because we do not know at this point whether we are parsing
-  // an f32 or f64 and therefore we do not know what the allowable payloads are.
-  std::optional<uint64_t> nanPayload;
-  double d;
-
-  friend std::ostream& operator<<(std::ostream& os, const FloatTok& tok) {
-    if (std::isnan(tok.d)) {
-      os << (std::signbit(tok.d) ? "+" : "-");
-      if (tok.nanPayload) {
-        return os << "nan:0x" << std::hex << *tok.nanPayload << std::dec;
-      }
-      return os << "nan";
-    }
-    return os << tok.d;
-  }
+} // anonymous namespace
 
-  friend bool operator==(const FloatTok& t1, const FloatTok& t2) {
-    return std::signbit(t1.d) == std::signbit(t2.d) &&
-           (t1.d == t2.d || (std::isnan(t1.d) && std::isnan(t2.d) &&
-                             t1.nanPayload == t2.nanPayload));
+void Lexer::skipSpace() {
+  if (auto ctx = space(next())) {
+    index += ctx->span.size();
   }
-};
+}
 
-struct IdTok {
-  friend std::ostream& operator<<(std::ostream& os, const IdTok&) {
-    return os << "id";
+void Lexer::lexToken() {
+  // TODO: Ensure we're getting the longest possible match.
+  Token tok;
+  if (auto t = lparen(next())) {
+    tok = Token{t->span, LParenTok{}};
+  } else if (auto t = rparen(next())) {
+    tok = Token{t->span, RParenTok{}};
+  } else if (auto t = ident(next())) {
+    tok = Token{t->span, IdTok{}};
+  } else if (auto t = integer(next())) {
+    tok = Token{t->span, IntTok{t->n, t->signedness}};
+  } else if (auto t = float_(next())) {
+    tok = Token{t->span, FloatTok{t->nanPayload, t->d}};
+  } else if (auto t = str(next())) {
+    tok = Token{t->span, StringTok{t->str}};
+  } else if (auto t = keyword(next())) {
+    tok = Token{t->span, KeywordTok{}};
+  } else {
+    // TODO: Do something about lexing errors.
+    curr = std::nullopt;
+    return;
   }
+  index += tok.span.size();
+  curr = {tok};
+}
 
-  friend bool operator==(const IdTok&, const IdTok&) { return true; }
-};
-
-struct StringTok {
-  std::optional<std::string> str;
-
-  friend std::ostream& operator<<(std::ostream& os, const StringTok& tok) {
-    if (tok.str) {
-      os << '"' << *tok.str << '"';
+TextPos Lexer::position(const char* c) {
+  assert(size_t(c - buffer.data()) < buffer.size());
+  TextPos pos{1, 0};
+  for (const char* p = buffer.data(); p != c; ++p) {
+    if (*p == '\n') {
+      pos.line++;
+      pos.col = 0;
     } else {
-      os << "(raw string)";
+      pos.col++;
     }
-    return os;
-  }
-
-  friend bool operator==(const StringTok& t1, const StringTok& t2) {
-    return t1.str == t2.str;
   }
-};
-
-struct KeywordTok {
-  friend std::ostream& operator<<(std::ostream& os, const KeywordTok&) {
-    return os << "keyword";
-  }
-
-  friend bool operator==(const KeywordTok&, const KeywordTok&) { return true; }
-};
-
-struct Token {
-  using Data = std::variant<LParenTok,
-                            RParenTok,
-                            IntTok,
-                            FloatTok,
-                            IdTok,
-                            StringTok,
-                            KeywordTok>;
-
-  std::string_view span;
-  Data data;
-
-  // Suppress clang-tidy false positive about unused functions.
-  [[maybe_unused]] friend std::ostream& operator<<(std::ostream& os,
-                                                   const Token& tok) {
-    std::visit([&](const auto& t) { os << t; }, tok.data);
-    return os << " \"" << tok.span << "\"";
-  }
-
-  [[maybe_unused]] friend bool operator==(const Token& t1, const Token& t2) {
-    return t1.span == t2.span &&
-           std::visit(
-             [](auto& d1, auto& d2) {
-               if constexpr (std::is_same_v<decltype(d1), decltype(d2)>) {
-                 return d1 == d2;
-               } else {
-                 return false;
-               }
-             },
-             t1.data,
-             t2.data);
-  }
-};
-
-struct TextPos {
-  size_t line;
-  size_t col;
+  return pos;
+}
 
-  bool operator==(const TextPos& other) const {
-    return line == other.line && col == other.col;
-  }
-  bool operator!=(const TextPos& other) const { return !(*this == other); }
+bool TextPos::operator==(const TextPos& other) const {
+  return line == other.line && col == other.col;
+}
 
-  // Suppress clang-tidy false positive about unused functions.
-  [[maybe_unused]] friend std::ostream& operator<<(std::ostream& os,
-                                                   const TextPos& pos) {
-    return os << pos.line << ":" << pos.col;
-  }
-};
+bool IntTok::operator==(const IntTok& other) const {
+  return n == other.n && signedness == other.signedness;
+}
 
-// Lexer's purpose is twofold. First, it wraps a buffer to provide a tokenizing
-// iterator over it. Second, it implements that iterator itself. Also provides
-// utilities for locating the text position of tokens within the buffer. Text
-// positions are computed on demand rather than eagerly because they are
-// typically only needed when there is an error to report.
-struct Lexer {
-  using iterator = Lexer;
-  using difference_type = std::ptrdiff_t;
-  using value_type = Token;
-  using pointer = const Token*;
-  using reference = const Token&;
-  using iterator_category = std::forward_iterator_tag;
-
-  std::string_view buffer;
-  size_t index = 0;
-  std::optional<Token> curr;
-
-  // The end sentinel.
-  Lexer() = default;
-
-  Lexer(std::string_view buffer) : buffer(buffer) {
-    skipSpace();
-    lexToken();
-    skipSpace();
-  }
+bool FloatTok::operator==(const FloatTok& other) const {
+  return std::signbit(d) == std::signbit(other.d) &&
+         (d == other.d || (std::isnan(d) && std::isnan(other.d) &&
+                           nanPayload == other.nanPayload));
+}
 
-  std::string_view next() const { return buffer.substr(index); }
+bool Token::operator==(const Token& other) const {
+  return span == other.span &&
+         std::visit(
+           [](auto& t1, auto& t2) {
+             if constexpr (std::is_same_v<decltype(t1), decltype(t2)>) {
+               return t1 == t2;
+             } else {
+               return false;
+             }
+           },
+           data,
+           other.data);
+}
 
-  void skipSpace() {
-    if (auto ctx = space(next())) {
-      index += ctx->span.size();
-    }
-  }
+std::ostream& operator<<(std::ostream& os, const TextPos& pos) {
+  return os << pos.line << ":" << pos.col;
+}
 
-  void lexToken() {
-    // TODO: Ensure we're getting the longest possible match.
-    Token tok;
-    if (auto t = lparen(next())) {
-      tok = Token{t->span, LParenTok{}};
-    } else if (auto t = rparen(next())) {
-      tok = Token{t->span, RParenTok{}};
-    } else if (auto t = ident(next())) {
-      tok = Token{t->span, IdTok{}};
-    } else if (auto t = integer(next())) {
-      tok = Token{t->span, IntTok{t->n, t->signedness}};
-    } else if (auto t = float_(next())) {
-      tok = Token{t->span, FloatTok{t->nanPayload, t->d}};
-    } else if (auto t = str(next())) {
-      tok = Token{t->span, StringTok{t->str}};
-    } else if (auto t = keyword(next())) {
-      tok = Token{t->span, KeywordTok{}};
-    } else {
-      // TODO: Do something about lexing errors.
-      curr = std::nullopt;
-      return;
-    }
-    index += tok.span.size();
-    curr = {tok};
-  }
+std::ostream& operator<<(std::ostream& os, const LParenTok&) {
+  return os << "'('";
+}
 
-  Lexer& operator++() {
-    // Preincrement
-    lexToken();
-    skipSpace();
-    return *this;
-  }
+std::ostream& operator<<(std::ostream& os, const RParenTok&) {
+  return os << "')'";
+}
 
-  Lexer operator++(int) {
-    // Postincrement
-    Lexer ret = *this;
-    ++(*this);
-    return ret;
-  }
+std::ostream& operator<<(std::ostream& os, const IdTok&) { return os << "id"; }
 
-  const Token& operator*() { return *curr; }
-  const Token* operator->() { return &*curr; }
+std::ostream& operator<<(std::ostream& os, const IntTok& tok) {
+  return os << tok.n << (tok.signedness == Signed ? " signed" : " unsigned");
+}
 
-  bool operator==(const Lexer& other) const {
-    // The iterator is equal to the end sentinel when there is no current token.
-    if (!curr && !other.curr) {
-      return true;
+std::ostream& operator<<(std::ostream& os, const FloatTok& tok) {
+  if (std::isnan(tok.d)) {
+    os << (std::signbit(tok.d) ? "-" : "+"); // signbit() is true for negatives
+    if (tok.nanPayload) {
+      return os << "nan:0x" << std::hex << *tok.nanPayload << std::dec;
     }
-    // Otherwise they are equivalent when they are at the same position.
-    return index == other.index;
+    return os << "nan";
   }
+  return os << tok.d;
+}
 
-  bool operator!=(const Lexer& other) const { return !(*this == other); }
-
-  Lexer begin() { return *this; }
-
-  Lexer end() { return Lexer(); }
-
-  TextPos position(const char* c) {
-    assert(size_t(c - buffer.data()) < buffer.size());
-    TextPos pos{1, 0};
-    for (const char* p = buffer.data(); p != c; ++p) {
-      if (*p == '\n') {
-        pos.line++;
-        pos.col = 0;
-      } else {
-        pos.col++;
-      }
-    }
-    return pos;
+std::ostream& operator<<(std::ostream& os, const StringTok& tok) {
+  if (tok.str) {
+    os << '"' << *tok.str << '"';
+  } else {
+    os << "(raw string)";
   }
+  return os;
+}
 
-  TextPos position(std::string_view span) { return position(span.data()); }
-
-  TextPos position(Token tok) { return position(tok.span); }
-};
+std::ostream& operator<<(std::ostream& os, const KeywordTok&) {
+  return os << "keyword";
+}
 
-} // anonymous namespace
+std::ostream& operator<<(std::ostream& os, const Token& tok) {
+  std::visit([&](const auto& t) { os << t; }, tok.data);
+  return os << " \"" << tok.span << "\"";
+}
 
 } // namespace wasm::WATParser
diff --git a/src/wat-lexer.h b/src/wat-lexer.h
new file mode 100644
index 000000000..d5c3b33ce
--- /dev/null
+++ b/src/wat-lexer.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright 2022 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstddef>
+#include <iterator>
+#include <optional>
+#include <ostream>
+#include <string_view>
+#include <variant>
+
+#ifndef wasm_wat_lexer_h
+#define wasm_wat_lexer_h
+
+namespace wasm::WATParser {
+
+struct TextPos {
+  size_t line;
+  size_t col;
+
+  bool operator==(const TextPos& other) const;
+  bool operator!=(const TextPos& other) const { return !(*this == other); }
+
+  friend std::ostream& operator<<(std::ostream& os, const TextPos& pos);
+};
+
+// ======
+// Tokens
+// ======
+
+struct LParenTok {
+  bool operator==(const LParenTok&) const { return true; }
+  friend std::ostream& operator<<(std::ostream&, const LParenTok&);
+};
+
+struct RParenTok {
+  bool operator==(const RParenTok&) const { return true; }
+  friend std::ostream& operator<<(std::ostream&, const RParenTok&);
+};
+
+struct IdTok {
+  bool operator==(const IdTok&) const { return true; }
+  friend std::ostream& operator<<(std::ostream&, const IdTok&);
+};
+
+enum Signedness { Unsigned, Signed };
+
+struct IntTok {
+  uint64_t n;
+  Signedness signedness;
+
+  bool operator==(const IntTok&) const;
+  friend std::ostream& operator<<(std::ostream&, const IntTok&);
+};
+
+struct FloatTok {
+  // The payload if we lexed a nan with payload. We cannot store the payload
+  // directly in `d` because we do not know at this point whether we are parsing
+  // an f32 or f64 and therefore we do not know what the allowable payloads are.
+  std::optional<uint64_t> nanPayload;
+  double d;
+
+  bool operator==(const FloatTok&) const;
+  friend std::ostream& operator<<(std::ostream&, const FloatTok&);
+};
+
+struct StringTok {
+  std::optional<std::string> str;
+
+  bool operator==(const StringTok& other) const { return str == other.str; }
+  friend std::ostream& operator<<(std::ostream&, const StringTok&);
+};
+
+struct KeywordTok {
+  bool operator==(const KeywordTok&) const { return true; }
+  friend std::ostream& operator<<(std::ostream&, const KeywordTok&);
+};
+
+struct Token {
+  using Data = std::variant<LParenTok,
+                            RParenTok,
+                            IdTok,
+                            IntTok,
+                            FloatTok,
+                            StringTok,
+                            KeywordTok>;
+  std::string_view span;
+  Data data;
+
+  bool operator==(const Token&) const;
+  friend std::ostream& operator<<(std::ostream& os, const Token&);
+};
+
+// =====
+// Lexer
+// =====
+
+// Lexer's purpose is twofold. First, it wraps a buffer to provide a tokenizing
+// iterator over it. Second, it implements that iterator itself. Also provides
+// utilities for locating the text position of tokens within the buffer. Text
+// positions are computed on demand rather than eagerly because they are
+// typically only needed when there is an error to report.
+struct Lexer {
+  using iterator = Lexer;
+  using difference_type = std::ptrdiff_t;
+  using value_type = Token;
+  using pointer = const Token*;
+  using reference = const Token&;
+  using iterator_category = std::forward_iterator_tag;
+
+private:
+  std::string_view buffer;
+  size_t index = 0;
+  std::optional<Token> curr;
+
+public:
+  // The end sentinel.
+  Lexer() = default;
+
+  Lexer(std::string_view buffer) : buffer(buffer) {
+    skipSpace();
+    lexToken();
+    skipSpace();
+  }
+
+  std::string_view next() const { return buffer.substr(index); }
+  Lexer& operator++() {
+    // Preincrement
+    lexToken();
+    skipSpace();
+    return *this;
+  }
+
+  Lexer operator++(int) {
+    // Postincrement
+    Lexer ret = *this;
+    ++(*this);
+    return ret;
+  }
+
+  const Token& operator*() { return *curr; }
+  const Token* operator->() { return &*curr; }
+
+  bool operator==(const Lexer& other) const {
+    // The iterator is equal to the end sentinel when there is no current token.
+    if (!curr && !other.curr) {
+      return true;
+    }
+    // Otherwise they are equivalent when they are at the same position.
+    return index == other.index;
+  }
+
+  bool operator!=(const Lexer& other) const { return !(*this == other); }
+
+  Lexer begin() { return *this; }
+
+  Lexer end() { return Lexer(); }
+
+  TextPos position(const char* c);
+  TextPos position(std::string_view span) { return position(span.data()); }
+  TextPos position(Token tok) { return position(tok.span); }
+
+private:
+  void skipSpace();
+  void lexToken();
+};
+
+} // namespace wasm::WATParser
+
+#endif // wat-lexer.h include guard
author	Thomas Lively <7121787+tlively@users.noreply.github.com>	2022-05-27 13:28:04 -0700
committer	GitHub <noreply@github.com>	2022-05-27 13:28:04 -0700
commit	404bd2cc11cbc85c79b33a75142dbcf16d530a1b (patch)
tree	dde9c424d1c4c9291c06a1d029283ce28a29ce9b /src
parent	f21774f3865e506b6dea912af8f8be16fd29dacb (diff)
download	binaryen-404bd2cc11cbc85c79b33a75142dbcf16d530a1b.tar.gz binaryen-404bd2cc11cbc85c79b33a75142dbcf16d530a1b.tar.bz2 binaryen-404bd2cc11cbc85c79b33a75142dbcf16d530a1b.zip