[Parser] Simplify the lexer interface (#6319)

The lexer was previously an iterator over tokens, but that expressivity is not actually used in the parser. Instead, we have `input.h` that adapts the token iterator interface into an interface that is actually useful. As a first step toward simplifying the lexer implementation to no longer be an iterator over tokens, update its interface by moving the adaptation from input.h to the lexer itself. This requires extensive changes to the lexer unit tests, which will not have to change further when we actually simplify the lexer implementation.
author: Thomas Lively <tlively@google.com> 2024-02-20 13:08:37 -0800
committer: GitHub <noreply@github.com> 2024-02-20 13:08:37 -0800
commit: c0cdd267492956e9789148c8e478c467dd59d67b (patch)
tree: ce57c212b27345b4957c538d1c51ef2fbd3b8dac /src
parent: a9f01c0c911afabc86dfc210f3ea596f1c35de6e (diff)
download: binaryen-c0cdd267492956e9789148c8e478c467dd59d67b.tar.gz
binaryen-c0cdd267492956e9789148c8e478c467dd59d67b.tar.bz2
binaryen-c0cdd267492956e9789148c8e478c467dd59d67b.zip
3 files changed, 252 insertions, 318 deletions
diff --git a/src/parser/input-impl.h b/src/parser/input-impl.h
deleted file mode 100644
index e3cf52015..000000000
--- a/src/parser/input-impl.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright 2023 WebAssembly Community Group participants
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "input.h"
-
-#ifndef parser_input_impl_h
-#define parser_input_impl_h
-
-inline std::optional<Token> ParseInput::peek() {
-  if (!empty()) {
-    return *lexer;
-  }
-  return {};
-}
-
-inline bool ParseInput::takeLParen() {
-  auto t = peek();
-  if (!t || !t->isLParen()) {
-    return false;
-  }
-  ++lexer;
-  return true;
-}
-
-inline bool ParseInput::takeRParen() {
-  auto t = peek();
-  if (!t || !t->isRParen()) {
-    return false;
-  }
-  ++lexer;
-  return true;
-}
-
-inline bool ParseInput::takeUntilParen() {
-  while (true) {
-    auto t = peek();
-    if (!t) {
-      return false;
-    }
-    if (t->isLParen() || t->isRParen()) {
-      return true;
-    }
-    ++lexer;
-  }
-}
-
-inline std::optional<Name> ParseInput::takeID() {
-  if (auto t = peek()) {
-    if (auto id = t->getID()) {
-      ++lexer;
-      // See comment on takeName.
-      return Name(std::string(*id));
-    }
-  }
-  return {};
-}
-
-inline std::optional<std::string_view> ParseInput::takeKeyword() {
-  if (auto t = peek()) {
-    if (auto keyword = t->getKeyword()) {
-      ++lexer;
-      return *keyword;
-    }
-  }
-  return {};
-}
-
-inline bool ParseInput::takeKeyword(std::string_view expected) {
-  if (auto t = peek()) {
-    if (auto keyword = t->getKeyword()) {
-      if (*keyword == expected) {
-        ++lexer;
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-inline std::optional<uint64_t> ParseInput::takeOffset() {
-  if (auto t = peek()) {
-    if (auto keyword = t->getKeyword()) {
-      if (keyword->substr(0, 7) != "offset="sv) {
-        return {};
-      }
-      Lexer subLexer(keyword->substr(7));
-      if (subLexer == subLexer.end()) {
-        return {};
-      }
-      if (auto o = subLexer->getU<uint64_t>()) {
-        ++subLexer;
-        if (subLexer == subLexer.end()) {
-          ++lexer;
-          return o;
-        }
-      }
-    }
-  }
-  return std::nullopt;
-}
-
-inline std::optional<uint32_t> ParseInput::takeAlign() {
-  if (auto t = peek()) {
-    if (auto keyword = t->getKeyword()) {
-      if (keyword->substr(0, 6) != "align="sv) {
-        return {};
-      }
-      Lexer subLexer(keyword->substr(6));
-      if (subLexer == subLexer.end()) {
-        return {};
-      }
-      if (auto a = subLexer->getU<uint32_t>()) {
-        ++subLexer;
-        if (subLexer == subLexer.end()) {
-          ++lexer;
-          return a;
-        }
-      }
-    }
-  }
-  return {};
-}
-
-template<typename T> inline std::optional<T> ParseInput::takeU() {
-  if (auto t = peek()) {
-    if (auto n = t->getU<T>()) {
-      ++lexer;
-      return n;
-    }
-  }
-  return std::nullopt;
-}
-
-template<typename T> inline std::optional<T> ParseInput::takeI() {
-  if (auto t = peek()) {
-    if (auto n = t->getI<T>()) {
-      ++lexer;
-      return n;
-    }
-  }
-  return std::nullopt;
-}
-
-inline std::optional<uint64_t> ParseInput::takeU64() {
-  return takeU<uint64_t>();
-}
-
-inline std::optional<uint64_t> ParseInput::takeI64() {
-  return takeI<uint64_t>();
-}
-
-inline std::optional<uint32_t> ParseInput::takeU32() {
-  return takeU<uint64_t>();
-}
-
-inline std::optional<uint32_t> ParseInput::takeI32() {
-  return takeI<uint32_t>();
-}
-
-inline std::optional<uint16_t> ParseInput::takeI16() {
-  return takeI<uint16_t>();
-}
-
-inline std::optional<uint8_t> ParseInput::takeU8() { return takeU<uint8_t>(); }
-
-inline std::optional<uint8_t> ParseInput::takeI8() { return takeI<uint8_t>(); }
-
-inline std::optional<double> ParseInput::takeF64() {
-  if (auto t = peek()) {
-    if (auto d = t->getF64()) {
-      ++lexer;
-      return d;
-    }
-  }
-  return std::nullopt;
-}
-
-inline std::optional<float> ParseInput::takeF32() {
-  if (auto t = peek()) {
-    if (auto f = t->getF32()) {
-      ++lexer;
-      return f;
-    }
-  }
-  return std::nullopt;
-}
-
-inline std::optional<std::string> ParseInput::takeString() {
-  if (auto t = peek()) {
-    if (auto s = t->getString()) {
-      ++lexer;
-      return std::string(*s);
-    }
-  }
-  return {};
-}
-
-inline std::optional<Name> ParseInput::takeName() {
-  // TODO: Move this to lexer and validate UTF.
-  if (auto str = takeString()) {
-    // Copy to a std::string to make sure we have a null terminator, otherwise
-    // the `Name` constructor won't work correctly.
-    // TODO: Update `Name` to use string_view instead of char* and/or to take
-    // rvalue strings to avoid this extra copy.
-    return Name(std::string(*str));
-  }
-  return {};
-}
-
-inline bool ParseInput::takeSExprStart(std::string_view expected) {
-  auto original = lexer;
-  if (takeLParen() && takeKeyword(expected)) {
-    return true;
-  }
-  lexer = original;
-  return false;
-}
-
-inline bool ParseInput::peekSExprStart(std::string_view expected) {
-  auto original = lexer;
-  if (!takeLParen()) {
-    return false;
-  }
-  bool ret = takeKeyword(expected);
-  lexer = original;
-  return ret;
-}
-
-inline Index ParseInput::getPos() {
-  if (auto t = peek()) {
-    return lexer.getIndex() - t->span.size();
-  }
-  return lexer.getIndex();
-}
-
-inline Err ParseInput::err(Index pos, std::string reason) {
-  std::stringstream msg;
-  msg << lexer.position(pos) << ": error: " << reason;
-  return Err{msg.str()};
-}
-
-#endif // parser_input_impl_h
diff --git a/src/parser/input.h b/src/parser/input.h
index 6086ed1a5..f83f5a40a 100644
--- a/src/parser/input.h
+++ b/src/parser/input.h
@@ -41,40 +41,47 @@ struct ParseInput {
 
   bool empty() { return lexer.empty(); }
 
-  std::optional<Token> peek();
-  bool takeLParen();
-  bool takeRParen();
-  bool takeUntilParen();
-  std::optional<Name> takeID();
-  std::optional<std::string_view> takeKeyword();
-  bool takeKeyword(std::string_view expected);
-  std::optional<uint64_t> takeOffset();
-  std::optional<uint32_t> takeAlign();
-  std::optional<uint64_t> takeU64();
-  std::optional<uint64_t> takeI64();
-  std::optional<uint32_t> takeU32();
-  std::optional<uint32_t> takeI32();
-  std::optional<uint16_t> takeI16();
-  std::optional<uint8_t> takeU8();
-  std::optional<uint8_t> takeI8();
-  std::optional<double> takeF64();
-  std::optional<float> takeF32();
-  std::optional<std::string> takeString();
-  std::optional<Name> takeName();
-  bool takeSExprStart(std::string_view expected);
-  bool peekSExprStart(std::string_view expected);
+  // TODO: Remove this useless layer of abstraction between the Lexer and
+  // Parser.
+  std::optional<Token> peek() { return lexer.peek(); }
+  bool takeLParen() { return lexer.takeLParen(); }
+  bool takeRParen() { return lexer.takeRParen(); }
+  bool takeUntilParen() { return lexer.takeUntilParen(); }
+  std::optional<Name> takeID() { return lexer.takeID(); }
+  std::optional<std::string_view> takeKeyword() { return lexer.takeKeyword(); }
+  bool takeKeyword(std::string_view expected) {
+    return lexer.takeKeyword(expected);
+  }
+  std::optional<uint64_t> takeOffset() { return lexer.takeOffset(); }
+  std::optional<uint32_t> takeAlign() { return lexer.takeAlign(); }
+  std::optional<uint64_t> takeU64() { return lexer.takeU64(); }
+  std::optional<uint64_t> takeI64() { return lexer.takeI64(); }
+  std::optional<uint32_t> takeU32() { return lexer.takeU32(); }
+  std::optional<uint32_t> takeI32() { return lexer.takeI32(); }
+  std::optional<uint16_t> takeI16() { return lexer.takeI16(); }
+  std::optional<uint8_t> takeU8() { return lexer.takeU8(); }
+  std::optional<uint8_t> takeI8() { return lexer.takeI8(); }
+  std::optional<double> takeF64() { return lexer.takeF64(); }
+  std::optional<float> takeF32() { return lexer.takeF32(); }
+  std::optional<std::string> takeString() { return lexer.takeString(); }
+  std::optional<Name> takeName() { return lexer.takeName(); }
+  bool takeSExprStart(std::string_view expected) {
+    return lexer.takeSExprStart(expected);
+  }
+  bool peekSExprStart(std::string_view expected) {
+    return lexer.peekSExprStart(expected);
+  }
 
-  Index getPos();
-  [[nodiscard]] Err err(Index pos, std::string reason);
-  [[nodiscard]] Err err(std::string reason) { return err(getPos(), reason); }
+  Index getPos() { return lexer.getPos(); }
 
-private:
-  template<typename T> std::optional<T> takeU();
-  template<typename T> std::optional<T> takeS();
-  template<typename T> std::optional<T> takeI();
-};
+  [[nodiscard]] Err err(Index pos, std::string reason) {
+    std::stringstream msg;
+    msg << lexer.position(pos) << ": error: " << reason;
+    return Err{msg.str()};
+  }
 
-#include "input-impl.h"
+  [[nodiscard]] Err err(std::string reason) { return err(getPos(), reason); }
+};
 
 } // namespace wasm::WATParser
 
diff --git a/src/parser/lexer.h b/src/parser/lexer.h
index f0da151f9..8f9bd103a 100644
--- a/src/parser/lexer.h
+++ b/src/parser/lexer.h
@@ -23,6 +23,8 @@
 #include <string_view>
 #include <variant>
 
+#include "support/name.h"
+
 #ifndef parser_lexer_h
 #define parser_lexer_h
 
@@ -147,13 +149,6 @@ struct Token {
 // positions are computed on demand rather than eagerly because they are
 // typically only needed when there is an error to report.
 struct Lexer {
-  using iterator = Lexer;
-  using difference_type = std::ptrdiff_t;
-  using value_type = Token;
-  using pointer = const Token*;
-  using reference = const Token&;
-  using iterator_category = std::forward_iterator_tag;
-
 private:
   std::string_view buffer;
   size_t index = 0;
@@ -169,51 +164,238 @@ public:
 
   void setIndex(size_t i) {
     index = i;
-    skipSpace();
-    lexToken();
+    advance();
   }
 
-  std::string_view next() const { return buffer.substr(index); }
-  Lexer& operator++() {
-    // Preincrement
-    skipSpace();
-    lexToken();
-    return *this;
+  std::optional<Token> peek() const { return curr; }
+
+  bool takeLParen() {
+    auto t = peek();
+    if (!t || !t->isLParen()) {
+      return false;
+    }
+    advance();
+    return true;
   }
 
-  Lexer operator++(int) {
-    // Postincrement
-    Lexer ret = *this;
-    ++(*this);
-    return ret;
+  bool takeRParen() {
+    auto t = peek();
+    if (!t || !t->isRParen()) {
+      return false;
+    }
+    advance();
+    return true;
+  }
+
+  bool takeUntilParen() {
+    while (true) {
+      auto t = peek();
+      if (!t) {
+        return false;
+      }
+      if (t->isLParen() || t->isRParen()) {
+        return true;
+      }
+      advance();
+    }
+  }
+
+  std::optional<Name> takeID() {
+    if (auto t = peek()) {
+      if (auto id = t->getID()) {
+        advance();
+        // See comment on takeName.
+        return Name(std::string(*id));
+      }
+    }
+    return {};
+  }
+
+  std::optional<std::string_view> takeKeyword() {
+    if (auto t = peek()) {
+      if (auto keyword = t->getKeyword()) {
+        advance();
+        return *keyword;
+      }
+    }
+    return {};
+  }
+
+  bool takeKeyword(std::string_view expected) {
+    if (auto t = peek()) {
+      if (auto keyword = t->getKeyword()) {
+        if (*keyword == expected) {
+          advance();
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  std::optional<uint64_t> takeOffset() {
+    using namespace std::string_view_literals;
+    if (auto t = peek()) {
+      if (auto keyword = t->getKeyword()) {
+        if (keyword->substr(0, 7) != "offset="sv) {
+          return {};
+        }
+        Lexer subLexer(keyword->substr(7));
+        if (subLexer.empty()) {
+          return {};
+        }
+        if (auto o = subLexer.peek()->getU<uint64_t>()) {
+          subLexer.advance();
+          if (subLexer.empty()) {
+            advance();
+            return o;
+          }
+        }
+      }
+    }
+    return std::nullopt;
+  }
+
+  std::optional<uint32_t> takeAlign() {
+    using namespace std::string_view_literals;
+    if (auto t = peek()) {
+      if (auto keyword = t->getKeyword()) {
+        if (keyword->substr(0, 6) != "align="sv) {
+          return {};
+        }
+        Lexer subLexer(keyword->substr(6));
+        if (subLexer.empty()) {
+          return {};
+        }
+        if (auto a = subLexer.peek()->getU<uint32_t>()) {
+          subLexer.advance();
+          if (subLexer.empty()) {
+            advance();
+            return a;
+          }
+        }
+      }
+    }
+    return {};
+  }
+
+  template<typename T> std::optional<T> takeU() {
+    if (auto t = peek()) {
+      if (auto n = t->getU<T>()) {
+        advance();
+        return n;
+      }
+    }
+    return std::nullopt;
+  }
+
+  template<typename T> std::optional<T> takeI() {
+    if (auto t = peek()) {
+      if (auto n = t->getI<T>()) {
+        advance();
+        return n;
+      }
+    }
+    return std::nullopt;
+  }
+
+  std::optional<uint64_t> takeU64() { return takeU<uint64_t>(); }
+
+  std::optional<uint64_t> takeI64() { return takeI<uint64_t>(); }
+
+  std::optional<uint32_t> takeU32() { return takeU<uint32_t>(); }
+
+  std::optional<uint32_t> takeI32() { return takeI<uint32_t>(); }
+
+  std::optional<uint16_t> takeI16() { return takeI<uint16_t>(); }
+
+  std::optional<uint8_t> takeU8() { return takeU<uint8_t>(); }
+
+  std::optional<uint8_t> takeI8() { return takeI<uint8_t>(); }
+
+  std::optional<double> takeF64() {
+    if (auto t = peek()) {
+      if (auto d = t->getF64()) {
+        advance();
+        return d;
+      }
+    }
+    return std::nullopt;
+  }
+
+  std::optional<float> takeF32() {
+    if (auto t = peek()) {
+      if (auto f = t->getF32()) {
+        advance();
+        return f;
+      }
+    }
+    return std::nullopt;
   }
 
-  const Token& operator*() { return *curr; }
-  const Token* operator->() { return &*curr; }
+  std::optional<std::string> takeString() {
+    if (auto t = peek()) {
+      if (auto s = t->getString()) {
+        advance();
+        return std::string(*s);
+      }
+    }
+    return {};
+  }
 
-  bool operator==(const Lexer& other) const {
-    // The iterator is equal to the end sentinel when there is no current token.
-    if (!curr && !other.curr) {
+  std::optional<Name> takeName() {
+    // TODO: Move this to lexer and validate UTF.
+    if (auto str = takeString()) {
+      // Copy to a std::string to make sure we have a null terminator, otherwise
+      // the `Name` constructor won't work correctly.
+      // TODO: Update `Name` to use string_view instead of char* and/or to take
+      // rvalue strings to avoid this extra copy.
+      return Name(std::string(*str));
+    }
+    return {};
+  }
+
+  bool takeSExprStart(std::string_view expected) {
+    auto original = *this;
+    if (takeLParen() && takeKeyword(expected)) {
       return true;
     }
-    // Otherwise they are equivalent when they are at the same position.
-    return index == other.index;
+    *this = original;
+    return false;
   }
 
-  bool operator!=(const Lexer& other) const { return !(*this == other); }
+  bool peekSExprStart(std::string_view expected) {
+    auto original = *this;
+    if (!takeLParen()) {
+      return false;
+    }
+    bool ret = takeKeyword(expected);
+    *this = original;
+    return ret;
+  }
 
-  Lexer begin() { return *this; }
+  std::string_view next() const { return buffer.substr(index); }
 
-  Lexer end() const { return Lexer(); }
+  void advance() {
+    skipSpace();
+    lexToken();
+  }
 
-  bool empty() const { return *this == end(); }
+  bool empty() const { return !curr; }
 
   TextPos position(const char* c) const;
   TextPos position(size_t i) const { return position(buffer.data() + i); }
   TextPos position(std::string_view span) const {
     return position(span.data());
   }
-  TextPos position(Token tok) const { return position(tok.span); }
+  TextPos position() const { return position(getPos()); }
+
+  size_t getPos() const {
+    if (auto t = peek()) {
+      return getIndex() - t->span.size();
+    }
+    return getIndex();
+  }
 
 private:
   void skipSpace();
author	Thomas Lively <tlively@google.com>	2024-02-20 13:08:37 -0800
committer	GitHub <noreply@github.com>	2024-02-20 13:08:37 -0800
commit	c0cdd267492956e9789148c8e478c467dd59d67b (patch)
tree	ce57c212b27345b4957c538d1c51ef2fbd3b8dac /src
parent	a9f01c0c911afabc86dfc210f3ea596f1c35de6e (diff)
download	binaryen-c0cdd267492956e9789148c8e478c467dd59d67b.tar.gz binaryen-c0cdd267492956e9789148c8e478c467dd59d67b.tar.bz2 binaryen-c0cdd267492956e9789148c8e478c467dd59d67b.zip