summaryrefslogtreecommitdiff
path: root/src/wasm/wat-parser-internal.h
diff options
context:
space:
mode:
authorThomas Lively <7121787+tlively@users.noreply.github.com>2022-05-24 17:35:04 -0700
committerGitHub <noreply@github.com>2022-05-24 17:35:04 -0700
commit3542bd9429529bfd9fbe6a85f8f97a9388ef3c2a (patch)
treeeb938ceda474b88011ade83d4880ec5f252c2ec9 /src/wasm/wat-parser-internal.h
parent4c193ecdfc873201c9feab61b9533216deb53f70 (diff)
downloadbinaryen-3542bd9429529bfd9fbe6a85f8f97a9388ef3c2a.tar.gz
binaryen-3542bd9429529bfd9fbe6a85f8f97a9388ef3c2a.tar.bz2
binaryen-3542bd9429529bfd9fbe6a85f8f97a9388ef3c2a.zip
[Parser] Start a new text format parser (#4680)
Begin implementing a new text format parser that will accept the standard text format. Start with a lexer that can iterate over tokens in an underlying text buffer. The initial supported tokens are integers, parentheses, and whitespace including comments. The implementation is in a new private internal header so it can be included into a gtest source file even though it is not meant to be a public API. Once the parser is more complete, there will be an additional public header exposing a more concise public API and the private header will be included into a source file that implements that public API. The new parser will improve on the existing text format parser not only because it will accept the full standard text format, but also because its code will be simpler and easier to maintain and because it will hopefully be faster as well. The new parser will be built out of small functions that closely mirror the grammar productions given in the spec and will heavily use C++17 features like string_view, optional, and variant to provide more self-documenting and efficient code. Future PRs will add support for lexing other kinds of tokens followed by support for parsing more complex constructs.
Diffstat (limited to 'src/wasm/wat-parser-internal.h')
-rw-r--r--src/wasm/wat-parser-internal.h557
1 files changed, 557 insertions, 0 deletions
diff --git a/src/wasm/wat-parser-internal.h b/src/wasm/wat-parser-internal.h
new file mode 100644
index 000000000..ede89de66
--- /dev/null
+++ b/src/wasm/wat-parser-internal.h
@@ -0,0 +1,557 @@
+/*
+ * Copyright 2022 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Usage note
+// ----------
+//
+// This parser is a work in progress and this file should not yet be included
+// anywhere except for in its own tests. Once the parser is usable, we will add
+// wat-parser.h to declare the public parsing API and wat-parser.cpp to
+// implement the public parsing functions in terms of the private API in this
+// header. The private API will stay in this header rather than moving to
+// wat-parser.cpp so that we can continue to unit test it.
+
#include <cassert>
#include <cctype>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <optional>
#include <string_view>
#include <variant>
+
+using namespace std::string_view_literals;
+
+namespace wasm::WATParser {
+
+namespace {
+
+// ================
+// Lexical Analysis
+// ================
+
// The result of lexing a token fragment: the span of input text it covers.
struct LexResult {
  std::string_view span;
};

// Lexing context that accumulates lexed input to produce a token fragment.
struct LexCtx {
private:
  // The input we are lexing.
  std::string_view input;

  // How much of the input we have already lexed.
  size_t lexedSize = 0;

public:
  explicit LexCtx(std::string_view in) : input(in) {}

  // Return the fragment that has been lexed so far, or nullopt if nothing has
  // been lexed yet.
  std::optional<LexResult> lexed() const {
    if (lexedSize > 0) {
      return {LexResult{input.substr(0, lexedSize)}};
    }
    return {};
  }

  // The next input that has not already been lexed.
  std::string_view next() const { return input.substr(lexedSize); }

  // The size of the unlexed input.
  size_t size() const { return input.size() - lexedSize; }

  // Whether there is no more input.
  bool empty() const { return size() == 0; }

  // Tokens must be separated by spaces or parentheses. Defined out of line
  // because it depends on lexing functions declared later.
  bool canFinish() const;

  // Whether the unlexed input starts with prefix `sv`. This is a predicate, so
  // it returns bool rather than size_t.
  bool startsWith(std::string_view sv) const {
    return next().substr(0, sv.size()) == sv;
  }

  // Consume the next `n` characters.
  void take(size_t n) { lexedSize += n; }

  // Consume an additional lexed fragment.
  void take(const LexResult& res) { lexedSize += res.span.size(); }

  // Consume the prefix and return true if possible.
  bool takePrefix(std::string_view sv) {
    if (startsWith(sv)) {
      take(sv.size());
      return true;
    }
    return false;
  }

  // Consume the rest of the input.
  void takeAll() { lexedSize = input.size(); }
};
+
// Whether an integer literal carried an explicit sign character.
enum Signedness { Unsigned, Signed };

// The result of lexing an integer token fragment.
struct LexIntResult : LexResult {
  // The value's bits; for negative literals this holds the two's complement
  // (wrapped unsigned) representation.
  uint64_t n;
  Signedness signedness;
};
+
// Lexing context that accumulates lexed input to produce an integer token
// fragment.
struct LexIntCtx : LexCtx {
  using LexCtx::take;

private:
  // Accumulated magnitude of the digits lexed so far.
  uint64_t n = 0;
  // Signed iff an explicit '+' or '-' was lexed.
  Signedness signedness = Unsigned;
  bool negative = false;
  // Set when accumulating a digit wraps the uint64_t accumulator.
  bool overflow = false;

  // Map '0'-'9' to its numeric value, or nullopt for any other character.
  std::optional<int> getDigit(char c) {
    if ('0' <= c && c <= '9') {
      return {c - '0'};
    }
    return std::nullopt;
  }

  // Map '0'-'9', 'A'-'F', or 'a'-'f' to its numeric value, or nullopt for any
  // other character.
  std::optional<int> getHexDigit(char c) {
    if ('0' <= c && c <= '9') {
      return {c - '0'};
    }
    if ('A' <= c && c <= 'F') {
      return {10 + c - 'A'};
    }
    if ('a' <= c && c <= 'f') {
      return {10 + c - 'a'};
    }
    return std::nullopt;
  }

public:
  explicit LexIntCtx(std::string_view in) : LexCtx(in) {}

  // Produce the lexed integer fragment, or nullopt if nothing was lexed or the
  // value is out of range. Note: this intentionally hides LexCtx::lexed.
  std::optional<LexIntResult> lexed() {
    // Reject values whose digit accumulation wrapped the uint64_t.
    if (overflow) {
      return {};
    }
    auto basic = LexCtx::lexed();
    if (!basic) {
      return {};
    }
    // Explicitly signed values must additionally fit in [-2^63, 2^63 - 1].
    if (signedness == Signed) {
      if (negative) {
        if (n > (1ull << 63)) {
          // TODO: Add error production for signed underflow.
          return {};
        }
      } else {
        if (n > (1ull << 63) - 1) {
          // TODO: Add error production for signed overflow.
          return {};
        }
      }
    }
    return {LexIntResult{*basic, negative ? -n : n, signedness}};
  }

  // Consume a leading '+' or '-' if present; either one marks the token as
  // explicitly signed.
  void takeSign() {
    if (takePrefix("+"sv)) {
      signedness = Signed;
    } else if (takePrefix("-"sv)) {
      signedness = Signed;
      negative = true;
    }
  }

  // Consume one decimal digit and fold it into the value, recording overflow
  // on wraparound. Returns false if the next character is not a digit.
  bool takeDigit() {
    if (!empty()) {
      if (auto d = getDigit(next()[0])) {
        take(1);
        uint64_t newN = n * 10 + *d;
        if (newN < n) {
          overflow = true;
        }
        n = newN;
        return true;
      }
    }
    return false;
  }

  // Consume one hexadecimal digit and fold it into the value, recording
  // overflow on wraparound. Returns false if the next character is not a hex
  // digit.
  bool takeHexdigit() {
    if (!empty()) {
      if (auto h = getHexDigit(next()[0])) {
        take(1);
        uint64_t newN = n * 16 + *h;
        if (newN < n) {
          overflow = true;
        }
        n = newN;
        return true;
      }
    }
    return false;
  }

  // Consume a previously lexed integer fragment, adopting its value.
  void take(const LexIntResult& res) {
    LexCtx::take(res);
    n = res.n;
  }
};
+
+std::optional<LexResult> lparen(std::string_view in) {
+ LexCtx ctx(in);
+ ctx.takePrefix("("sv);
+ return ctx.lexed();
+}
+
+std::optional<LexResult> rparen(std::string_view in) {
+ LexCtx ctx(in);
+ ctx.takePrefix(")"sv);
+ return ctx.lexed();
+}
+
+// comment ::= linecomment | blockcomment
+// linecomment ::= ';;' linechar* ('\n' | eof)
+// linechar ::= c:char (if c != '\n')
+// blockcomment ::= '(;' blockchar* ';)'
+// blockchar ::= c:char (if c != ';' and c != '(')
+// | ';' (if the next char is not ')')
+// | '(' (if the next char is not ';')
+// | blockcomment
+std::optional<LexResult> comment(std::string_view in) {
+ LexCtx ctx(in);
+ if (ctx.size() < 2) {
+ return {};
+ }
+
+ // Line comment
+ if (ctx.takePrefix(";;"sv)) {
+ if (auto size = ctx.next().find('\n'); size != ""sv.npos) {
+ ctx.take(size);
+ } else {
+ ctx.takeAll();
+ }
+ return ctx.lexed();
+ }
+
+ // Block comment (possibly nested!)
+ if (ctx.takePrefix("(;"sv)) {
+ size_t depth = 1;
+ while (depth > 0 && ctx.size() >= 2) {
+ if (ctx.takePrefix("(;"sv)) {
+ ++depth;
+ } else if (ctx.takePrefix(";)"sv)) {
+ --depth;
+ } else {
+ ctx.take(1);
+ }
+ }
+ if (depth > 0) {
+ // TODO: Add error production for non-terminated block comment.
+ return {};
+ }
+ return ctx.lexed();
+ }
+
+ return {};
+}
+
+std::optional<LexResult> spacechar(std::string_view in) {
+ LexCtx ctx(in);
+ ctx.takePrefix(" "sv) || ctx.takePrefix("\n"sv) || ctx.takePrefix("\r"sv) ||
+ ctx.takePrefix("\t"sv);
+ return ctx.lexed();
+}
+
+// space ::= (' ' | format | comment)*
+// format ::= '\t' | '\n' | '\r'
+std::optional<LexResult> space(std::string_view in) {
+ LexCtx ctx(in);
+ while (ctx.size()) {
+ if (auto lexed = spacechar(ctx.next())) {
+ ctx.take(*lexed);
+ } else if (auto lexed = comment(ctx.next())) {
+ ctx.take(*lexed);
+ } else {
+ break;
+ }
+ }
+ return ctx.lexed();
+}
+
+bool LexCtx::canFinish() const {
+ // Logically we want to check for eof, parens, and space. But we don't
+ // actually want to parse more than a couple characters of space, so check for
+ // individual space chars or comment starts instead.
+ return empty() || lparen(next()) || rparen(next()) || spacechar(next()) ||
+ startsWith(";;"sv);
+}
+
+// num ::= d:digit => d
+// | n:num '_'? d:digit => 10*n + d
+// digit ::= '0' => 0 | ... | '9' => 9
+std::optional<LexIntResult> num(std::string_view in) {
+ LexIntCtx ctx(in);
+ if (!ctx.takeDigit()) {
+ return {};
+ }
+ while (true) {
+ bool under = ctx.takePrefix("_"sv);
+ if (!ctx.takeDigit()) {
+ if (!under) {
+ return ctx.lexed();
+ }
+ return {};
+ }
+ }
+}
+
+// hexnum ::= h:hexdigit => h
+// | n:hexnum '_'? h:hexdigit => 16*n + h
+// hexdigit ::= d:digit => d
+// | 'A' => 10 | ... | 'F' => 15
+// | 'a' => 10 | ... | 'f' => 15
+std::optional<LexIntResult> hexnum(std::string_view in) {
+ LexIntCtx ctx(in);
+ if (!ctx.takeHexdigit()) {
+ return {};
+ }
+ while (true) {
+ bool under = ctx.takePrefix("_"sv);
+ if (!ctx.takeHexdigit()) {
+ if (!under) {
+ return ctx.lexed();
+ }
+ return {};
+ }
+ }
+}
+
+// uN ::= n:num => n (if n < 2^N)
+// | '0x' n:hexnum => n (if n < 2^N)
+// sN ::= s:sign n:num => [s]n (if -2^(N-1) <= [s]n < 2^(N-1))
+// | s:sign '0x' n:hexnum => [s]n (if -2^(N-1) <= [s]n < 2^(N-1))
+// sign ::= {} => + | '+' => + | '-' => -
+//
+// Note: Defer bounds and sign checking until we know what kind of integer we
+// expect.
+std::optional<LexIntResult> integer(std::string_view in) {
+ LexIntCtx ctx(in);
+ ctx.takeSign();
+ if (ctx.takePrefix("0x"sv)) {
+ if (auto lexed = hexnum(ctx.next())) {
+ ctx.take(*lexed);
+ if (ctx.canFinish()) {
+ return ctx.lexed();
+ }
+ }
+ // TODO: Add error production for unrecognized hexnum.
+ return {};
+ }
+ if (auto lexed = num(ctx.next())) {
+ ctx.take(*lexed);
+ if (ctx.canFinish()) {
+ return ctx.lexed();
+ }
+ }
+ return {};
+}
+
+// ======
+// Tokens
+// ======
+
// Token data for a left parenthesis. All instances are interchangeable.
struct LParenTok {
  friend bool operator==(const LParenTok&, const LParenTok&) { return true; }

  friend std::ostream& operator<<(std::ostream& os, const LParenTok&) {
    return os << "'('";
  }
};
+
// Token data for a right parenthesis. All instances are interchangeable.
struct RParenTok {
  friend bool operator==(const RParenTok&, const RParenTok&) { return true; }

  friend std::ostream& operator<<(std::ostream& os, const RParenTok&) {
    return os << "')'";
  }
};
+
+struct IntTok {
+ uint64_t n;
+ Signedness signedness;
+
+ friend std::ostream& operator<<(std::ostream& os, const IntTok& tok) {
+ return os << tok.n << (tok.signedness == Signed ? " signed" : " unsigned");
+ }
+
+ friend bool operator==(const IntTok& t1, const IntTok& t2) {
+ return t1.n == t2.n && t1.signedness == t2.signedness;
+ }
+};
+
+struct Token {
+ using Data = std::variant<LParenTok, RParenTok, IntTok>;
+
+ std::string_view span;
+ Data data;
+
+ // Suppress clang-tidy false positive about unused functions.
+ [[maybe_unused]] friend std::ostream& operator<<(std::ostream& os,
+ const Token& tok) {
+ std::visit([&](const auto& t) { os << t; }, tok.data);
+ return os << " \"" << tok.span << "\"";
+ }
+
+ [[maybe_unused]] friend bool operator==(const Token& t1, const Token& t2) {
+ return t1.span == t2.span &&
+ std::visit(
+ [](auto& d1, auto& d2) {
+ if constexpr (std::is_same_v<decltype(d1), decltype(d2)>) {
+ return d1 == d2;
+ } else {
+ return false;
+ }
+ },
+ t1.data,
+ t2.data);
+ }
+};
+
// A source location as a 1-based line and 0-based column.
struct TextPos {
  size_t line;
  size_t col;

  bool operator==(const TextPos& other) const {
    if (line != other.line) {
      return false;
    }
    return col == other.col;
  }

  bool operator!=(const TextPos& other) const { return !(*this == other); }

  // Suppress clang-tidy false positive about unused functions.
  [[maybe_unused]] friend std::ostream& operator<<(std::ostream& os,
                                                   const TextPos& pos) {
    return os << pos.line << ":" << pos.col;
  }
};
+
+// Lexer's purpose is twofold. First, it wraps a buffer to provide a tokenizing
+// iterator over it. Second, it implements that iterator itself. Also provides
+// utilities for locating the text position of tokens within the buffer. Text
+// positions are computed on demand rather than eagerly because they are
+// typically only needed when there is an error to report.
+struct Lexer {
+ using iterator = Lexer;
+ using difference_type = std::ptrdiff_t;
+ using value_type = Token;
+ using pointer = const Token*;
+ using reference = const Token&;
+ using iterator_category = std::forward_iterator_tag;
+
+ std::string_view buffer;
+ size_t index = 0;
+ std::optional<Token> curr;
+
+ // The end sentinel.
+ Lexer() = default;
+
+ Lexer(std::string_view buffer) : buffer(buffer) {
+ skipSpace();
+ lexToken();
+ skipSpace();
+ }
+
+ std::string_view next() const { return buffer.substr(index); }
+
+ void skipSpace() {
+ if (auto ctx = space(next())) {
+ index += ctx->span.size();
+ }
+ }
+
+ void lexToken() {
+ // TODO: Ensure we're getting the longest possible match.
+ Token tok;
+ if (auto t = lparen(next())) {
+ tok = Token{t->span, LParenTok{}};
+ } else if (auto t = rparen(next())) {
+ tok = Token{t->span, RParenTok{}};
+ } else if (auto t = integer(next())) {
+ tok = Token{t->span, IntTok{t->n, t->signedness}};
+ } else {
+ // TODO: Do something about lexing errors.
+ curr = std::nullopt;
+ return;
+ }
+ index += tok.span.size();
+ curr = {tok};
+ }
+
+ Lexer& operator++() {
+ // Preincrement
+ lexToken();
+ skipSpace();
+ return *this;
+ }
+
+ Lexer operator++(int) {
+ // Postincrement
+ Lexer ret = *this;
+ ++(*this);
+ return ret;
+ }
+
+ const Token& operator*() { return *curr; }
+ const Token* operator->() { return &*curr; }
+
+ bool operator==(const Lexer& other) const {
+ // The iterator is equal to the end sentinel when there is no current token.
+ if (!curr && !other.curr) {
+ return true;
+ }
+ // Otherwise they are equivalent when they are at the same position.
+ return index == other.index;
+ }
+
+ bool operator!=(const Lexer& other) const { return !(*this == other); }
+
+ Lexer begin() { return *this; }
+
+ Lexer end() { return Lexer(); }
+
+ TextPos position(const char* c) {
+ assert(size_t(c - buffer.data()) < buffer.size());
+ TextPos pos{1, 0};
+ for (const char* p = buffer.data(); p != c; ++p) {
+ if (*p == '\n') {
+ pos.line++;
+ pos.col = 0;
+ } else {
+ pos.col++;
+ }
+ }
+ return pos;
+ }
+
+ TextPos position(std::string_view span) { return position(span.data()); }
+
+ TextPos position(Token tok) { return position(tok.span); }
+};
+
+} // anonymous namespace
+
+} // namespace wasm::WATParser