diff options
author | Thomas Lively <tlively@google.com> | 2024-04-25 20:48:08 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-25 20:48:08 -0700 |
commit | 35560732b6a2c6960a6e72ea478bc0238a967c30 (patch) | |
tree | 14f91fbd240f7b7d7e6b942af63b2ee3bcff2034 /src/parser | |
parent | c33f126046d6504064d587b8bd7c310a7fdf2087 (diff) | |
download | binaryen-35560732b6a2c6960a6e72ea478bc0238a967c30.tar.gz binaryen-35560732b6a2c6960a6e72ea478bc0238a967c30.tar.bz2 binaryen-35560732b6a2c6960a6e72ea478bc0238a967c30.zip |
[Parser] Do not eagerly lex parens (#6540)

The lexer currently lexes tokens eagerly and stores them in a `Token` variant
ahead of when they are actually requested by the parser. It is wasteful,
however, to classify tokens before they are requested by the parser because it
is likely that the next token will be precisely the kind the parser requests.
The work of checking and rejecting other possible classifications ahead of time
is not useful.

To make incremental progress toward removing `Token` completely, lex parentheses
on demand instead of eagerly.
Diffstat (limited to 'src/parser')
-rw-r--r-- | src/parser/lexer.cpp | 54 | ||||
-rw-r--r-- | src/parser/lexer.h | 47 |
2 files changed, 36 insertions, 65 deletions
diff --git a/src/parser/lexer.cpp b/src/parser/lexer.cpp index ef25b6302..7c9bbb225 100644 --- a/src/parser/lexer.cpp +++ b/src/parser/lexer.cpp @@ -374,18 +374,6 @@ struct LexAnnotationCtx : LexCtx { } }; -std::optional<LexResult> lparen(std::string_view in) { - LexCtx ctx(in); - ctx.takePrefix("("sv); - return ctx.lexed(); -} - -std::optional<LexResult> rparen(std::string_view in) { - LexCtx ctx(in); - ctx.takePrefix(")"sv); - return ctx.lexed(); -} - std::optional<LexResult> idchar(std::string_view); std::optional<LexResult> space(std::string_view); std::optional<LexResult> keyword(std::string_view); @@ -554,8 +542,8 @@ bool LexCtx::canFinish() const { // Logically we want to check for eof, parens, and space. But we don't // actually want to parse more than a couple characters of space, so check for // individual space chars or comment starts instead. - return empty() || lparen(next()) || rparen(next()) || spacechar(next()) || - startsWith(";;"sv); + return empty() || startsWith("("sv) || startsWith(")"sv) || + spacechar(next()) || startsWith(";;"sv); } // num ::= d:digit => d @@ -1057,14 +1045,34 @@ void Lexer::skipSpace() { } } +bool Lexer::takeLParen() { + if (curr) { + return false; + } + if (LexCtx(next()).startsWith("("sv)) { + ++index; + advance(); + return true; + } + return false; +} + +bool Lexer::takeRParen() { + if (curr) { + return false; + } + if (LexCtx(next()).startsWith(")"sv)) { + ++index; + advance(); + return true; + } + return false; +} + void Lexer::lexToken() { // TODO: Ensure we're getting the longest possible match. 
Token tok; - if (auto t = lparen(next())) { - tok = Token{t->span, LParenTok{}}; - } else if (auto t = rparen(next())) { - tok = Token{t->span, RParenTok{}}; - } else if (auto t = ident(next())) { + if (auto t = ident(next())) { tok = Token{t->span, IdTok{t->isStr, t->str}}; } else if (auto t = integer(next())) { tok = Token{t->span, IntTok{t->n, t->sign}}; @@ -1129,14 +1137,6 @@ std::ostream& operator<<(std::ostream& os, const TextPos& pos) { return os << pos.line << ":" << pos.col; } -std::ostream& operator<<(std::ostream& os, const LParenTok&) { - return os << "'('"; -} - -std::ostream& operator<<(std::ostream& os, const RParenTok&) { - return os << "')'"; -} - std::ostream& operator<<(std::ostream& os, const IdTok&) { return os << "id"; } std::ostream& operator<<(std::ostream& os, const IntTok& tok) { diff --git a/src/parser/lexer.h b/src/parser/lexer.h index e601091db..10ba7c25a 100644 --- a/src/parser/lexer.h +++ b/src/parser/lexer.h @@ -45,16 +45,6 @@ struct TextPos { // Tokens // ====== -struct LParenTok { - bool operator==(const LParenTok&) const { return true; } - friend std::ostream& operator<<(std::ostream&, const LParenTok&); -}; - -struct RParenTok { - bool operator==(const RParenTok&) const { return true; } - friend std::ostream& operator<<(std::ostream&, const RParenTok&); -}; - struct IdTok { // Whether this ID has `$"..."` format bool isStr; @@ -103,13 +93,7 @@ struct KeywordTok { }; struct Token { - using Data = std::variant<LParenTok, - RParenTok, - IdTok, - IntTok, - FloatTok, - StringTok, - KeywordTok>; + using Data = std::variant<IdTok, IntTok, FloatTok, StringTok, KeywordTok>; std::string_view span; Data data; @@ -117,10 +101,6 @@ struct Token { // Token classification // ==================== - bool isLParen() const { return std::get_if<LParenTok>(&data); } - - bool isRParen() const { return std::get_if<RParenTok>(&data); } - std::optional<std::string_view> getKeyword() const { if (std::get_if<KeywordTok>(&data)) { return span; @@ -173,34 
+153,25 @@ public: advance(); } - bool takeLParen() { - if (!curr || !curr->isLParen()) { - return false; - } - advance(); - return true; - } + bool takeLParen(); bool peekLParen() { return Lexer(*this).takeLParen(); } - bool takeRParen() { - if (!curr || !curr->isRParen()) { - return false; - } - advance(); - return true; - } + bool takeRParen(); bool peekRParen() { return Lexer(*this).takeRParen(); } bool takeUntilParen() { while (true) { - if (!curr) { + if (empty()) { return false; } - if (curr->isLParen() || curr->isRParen()) { + if (peekLParen() || peekRParen()) { return true; } + if (!curr) { + ++index; + } advance(); } } @@ -392,7 +363,7 @@ public: lexToken(); } - bool empty() const { return !curr; } + bool empty() const { return !curr && index == buffer.size(); } TextPos position(const char* c) const; TextPos position(size_t i) const { return position(buffer.data() + i); } |