[Parser] Do not eagerly lex parens (#6540)

The lexer currently lexes tokens eagerly and stores them in a `Token` variant before the parser actually requests them. Classifying tokens ahead of time is wasteful, however, because the next token is likely to be precisely the kind the parser requests, so the work of checking and rejecting the other possible classifications up front is not useful. To make incremental progress toward removing `Token` completely, lex parentheses on demand instead of eagerly.
author: Thomas Lively <tlively@google.com> 2024-04-25 20:48:08 -0700
committer: GitHub <noreply@github.com> 2024-04-25 20:48:08 -0700
commit: 35560732b6a2c6960a6e72ea478bc0238a967c30 (patch)
tree: 14f91fbd240f7b7d7e6b942af63b2ee3bcff2034 /src/parser
parent: c33f126046d6504064d587b8bd7c310a7fdf2087 (diff)
download: binaryen-35560732b6a2c6960a6e72ea478bc0238a967c30.tar.gz
binaryen-35560732b6a2c6960a6e72ea478bc0238a967c30.tar.bz2
binaryen-35560732b6a2c6960a6e72ea478bc0238a967c30.zip
2 files changed, 36 insertions, 65 deletions
diff --git a/src/parser/lexer.cpp b/src/parser/lexer.cpp
index ef25b6302..7c9bbb225 100644
--- a/src/parser/lexer.cpp
+++ b/src/parser/lexer.cpp
@@ -374,18 +374,6 @@ struct LexAnnotationCtx : LexCtx {
   }
 };
 
-std::optional<LexResult> lparen(std::string_view in) {
-  LexCtx ctx(in);
-  ctx.takePrefix("("sv);
-  return ctx.lexed();
-}
-
-std::optional<LexResult> rparen(std::string_view in) {
-  LexCtx ctx(in);
-  ctx.takePrefix(")"sv);
-  return ctx.lexed();
-}
-
 std::optional<LexResult> idchar(std::string_view);
 std::optional<LexResult> space(std::string_view);
 std::optional<LexResult> keyword(std::string_view);
@@ -554,8 +542,8 @@ bool LexCtx::canFinish() const {
   // Logically we want to check for eof, parens, and space. But we don't
   // actually want to parse more than a couple characters of space, so check for
   // individual space chars or comment starts instead.
-  return empty() || lparen(next()) || rparen(next()) || spacechar(next()) ||
-         startsWith(";;"sv);
+  return empty() || startsWith("("sv) || startsWith(")"sv) ||
+         spacechar(next()) || startsWith(";;"sv);
 }
 
 // num   ::= d:digit => d
@@ -1057,14 +1045,34 @@ void Lexer::skipSpace() {
   }
 }
 
+bool Lexer::takeLParen() {
+  if (curr) {
+    return false;
+  }
+  if (LexCtx(next()).startsWith("("sv)) {
+    ++index;
+    advance();
+    return true;
+  }
+  return false;
+}
+
+bool Lexer::takeRParen() {
+  if (curr) {
+    return false;
+  }
+  if (LexCtx(next()).startsWith(")"sv)) {
+    ++index;
+    advance();
+    return true;
+  }
+  return false;
+}
+
 void Lexer::lexToken() {
   // TODO: Ensure we're getting the longest possible match.
   Token tok;
-  if (auto t = lparen(next())) {
-    tok = Token{t->span, LParenTok{}};
-  } else if (auto t = rparen(next())) {
-    tok = Token{t->span, RParenTok{}};
-  } else if (auto t = ident(next())) {
+  if (auto t = ident(next())) {
     tok = Token{t->span, IdTok{t->isStr, t->str}};
   } else if (auto t = integer(next())) {
     tok = Token{t->span, IntTok{t->n, t->sign}};
@@ -1129,14 +1137,6 @@ std::ostream& operator<<(std::ostream& os, const TextPos& pos) {
   return os << pos.line << ":" << pos.col;
 }
 
-std::ostream& operator<<(std::ostream& os, const LParenTok&) {
-  return os << "'('";
-}
-
-std::ostream& operator<<(std::ostream& os, const RParenTok&) {
-  return os << "')'";
-}
-
 std::ostream& operator<<(std::ostream& os, const IdTok&) { return os << "id"; }
 
 std::ostream& operator<<(std::ostream& os, const IntTok& tok) {
diff --git a/src/parser/lexer.h b/src/parser/lexer.h
index e601091db..10ba7c25a 100644
--- a/src/parser/lexer.h
+++ b/src/parser/lexer.h
@@ -45,16 +45,6 @@ struct TextPos {
 // Tokens
 // ======
 
-struct LParenTok {
-  bool operator==(const LParenTok&) const { return true; }
-  friend std::ostream& operator<<(std::ostream&, const LParenTok&);
-};
-
-struct RParenTok {
-  bool operator==(const RParenTok&) const { return true; }
-  friend std::ostream& operator<<(std::ostream&, const RParenTok&);
-};
-
 struct IdTok {
   // Whether this ID has `$"..."` format
   bool isStr;
@@ -103,13 +93,7 @@ struct KeywordTok {
 };
 
 struct Token {
-  using Data = std::variant<LParenTok,
-                            RParenTok,
-                            IdTok,
-                            IntTok,
-                            FloatTok,
-                            StringTok,
-                            KeywordTok>;
+  using Data = std::variant<IdTok, IntTok, FloatTok, StringTok, KeywordTok>;
   std::string_view span;
   Data data;
 
@@ -117,10 +101,6 @@ struct Token {
   // Token classification
   // ====================
 
-  bool isLParen() const { return std::get_if<LParenTok>(&data); }
-
-  bool isRParen() const { return std::get_if<RParenTok>(&data); }
-
   std::optional<std::string_view> getKeyword() const {
     if (std::get_if<KeywordTok>(&data)) {
       return span;
@@ -173,34 +153,25 @@ public:
     advance();
   }
 
-  bool takeLParen() {
-    if (!curr || !curr->isLParen()) {
-      return false;
-    }
-    advance();
-    return true;
-  }
+  bool takeLParen();
 
   bool peekLParen() { return Lexer(*this).takeLParen(); }
 
-  bool takeRParen() {
-    if (!curr || !curr->isRParen()) {
-      return false;
-    }
-    advance();
-    return true;
-  }
+  bool takeRParen();
 
   bool peekRParen() { return Lexer(*this).takeRParen(); }
 
   bool takeUntilParen() {
     while (true) {
-      if (!curr) {
+      if (empty()) {
         return false;
       }
-      if (curr->isLParen() || curr->isRParen()) {
+      if (peekLParen() || peekRParen()) {
         return true;
       }
+      if (!curr) {
+        ++index;
+      }
       advance();
     }
   }
@@ -392,7 +363,7 @@ public:
     lexToken();
   }
 
-  bool empty() const { return !curr; }
+  bool empty() const { return !curr && index == buffer.size(); }
 
   TextPos position(const char* c) const;
   TextPos position(size_t i) const { return position(buffer.data() + i); }
author	Thomas Lively <tlively@google.com>	2024-04-25 20:48:08 -0700
committer	GitHub <noreply@github.com>	2024-04-25 20:48:08 -0700
commit	35560732b6a2c6960a6e72ea478bc0238a967c30 (patch)
tree	14f91fbd240f7b7d7e6b942af63b2ee3bcff2034 /src/parser
parent	c33f126046d6504064d587b8bd7c310a7fdf2087 (diff)
download	binaryen-35560732b6a2c6960a6e72ea478bc0238a967c30.tar.gz binaryen-35560732b6a2c6960a6e72ea478bc0238a967c30.tar.bz2 binaryen-35560732b6a2c6960a6e72ea478bc0238a967c30.zip