diff options
author | Thomas Lively <7121787+tlively@users.noreply.github.com> | 2022-05-27 13:28:04 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-05-27 13:28:04 -0700 |
commit | 404bd2cc11cbc85c79b33a75142dbcf16d530a1b (patch) | |
tree | dde9c424d1c4c9291c06a1d029283ce28a29ce9b /src | |
parent | f21774f3865e506b6dea912af8f8be16fd29dacb (diff) | |
download | binaryen-404bd2cc11cbc85c79b33a75142dbcf16d530a1b.tar.gz binaryen-404bd2cc11cbc85c79b33a75142dbcf16d530a1b.tar.bz2 binaryen-404bd2cc11cbc85c79b33a75142dbcf16d530a1b.zip |
[Parser][NFC] Create a public wat-lexer.h header (#4695)
wat-parser-internal.h was already quite large after implementing just the lexer,
so it made sense to rename it to be lexer-specific and start a new file for the
higher-level parser. Also make it a proper .cpp file and split the testable
interface out into wat-lexer.h.
Diffstat (limited to 'src')
-rw-r--r-- | src/wasm/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/wasm/wat-lexer.cpp (renamed from src/wasm/wat-parser-internal.h) | 341 | ||||
-rw-r--r-- | src/wat-lexer.h | 182 |
3 files changed, 283 insertions, 241 deletions
diff --git a/src/wasm/CMakeLists.txt b/src/wasm/CMakeLists.txt index 20fc2e9fb..974f3cd10 100644 --- a/src/wasm/CMakeLists.txt +++ b/src/wasm/CMakeLists.txt @@ -12,6 +12,7 @@ set(wasm_SOURCES wasm-stack.cpp wasm-type.cpp wasm-validator.cpp + wat-lexer.cpp ${wasm_HEADERS} ) # wasm-debug.cpp includes LLVM header using std::iterator (deprecated in C++17) diff --git a/src/wasm/wat-parser-internal.h b/src/wasm/wat-lexer.cpp index b2a270423..450d94a19 100644 --- a/src/wasm/wat-parser-internal.h +++ b/src/wasm/wat-lexer.cpp @@ -14,16 +14,6 @@ * limitations under the License. */ -// Usage note -// ---------- -// -// This parser is a work in progress and this file should not yet be included -// anywhere except for in its own tests. Once the parser is usable, we will add -// wat-parser.h to declare the public parsing API and wat-parser.cpp to -// implement the public parsing functions in terms of the private API in this -// header. The private API will stay in this header rather than moving to -// wat-parser.cpp so that we can continue to unit test it. - #include <cassert> #include <cctype> #include <cmath> @@ -32,6 +22,8 @@ #include <sstream> #include <variant> +#include "wat-lexer.h" + using namespace std::string_view_literals; namespace wasm::WATParser { @@ -106,8 +98,6 @@ public: void takeAll() { lexedSize = input.size(); } }; -enum Signedness { Unsigned, Signed }; - enum OverflowBehavior { DisallowOverflow, IgnoreOverflow }; std::optional<int> getDigit(char c) { @@ -786,258 +776,127 @@ std::optional<LexResult> keyword(std::string_view in) { return ctx.lexed(); } -// ====== -// Tokens -// ====== - -struct LParenTok { - friend std::ostream& operator<<(std::ostream& os, const LParenTok&) { - return os << "'('"; - } - - friend bool operator==(const LParenTok&, const LParenTok&) { return true; } -}; - -struct RParenTok { - friend std::ostream& operator<<(std::ostream& os, const RParenTok&) { - return os << "')'"; - } - - friend bool operator==(const RParenTok&, const RParenTok&) { return true; } -}; - -struct IntTok { - uint64_t n; - Signedness signedness; - - friend std::ostream& operator<<(std::ostream& os, const IntTok& tok) { - return os << tok.n << (tok.signedness == Signed ? " signed" : " unsigned"); - } - - friend bool operator==(const IntTok& t1, const IntTok& t2) { - return t1.n == t2.n && t1.signedness == t2.signedness; - } -}; - -struct FloatTok { - // The payload if we lexed a nan with payload. We cannot store the payload - // directly in `d` because we do not know at this point whether we are parsing - // an f32 or f64 and therefore we do not know what the allowable payloads are. - std::optional<uint64_t> nanPayload; - double d; - - friend std::ostream& operator<<(std::ostream& os, const FloatTok& tok) { - if (std::isnan(tok.d)) { - os << (std::signbit(tok.d) ? "+" : "-"); - if (tok.nanPayload) { - return os << "nan:0x" << std::hex << *tok.nanPayload << std::dec; - } - return os << "nan"; - } - return os << tok.d; - } +} // anonymous namespace - friend bool operator==(const FloatTok& t1, const FloatTok& t2) { - return std::signbit(t1.d) == std::signbit(t2.d) && - (t1.d == t2.d || (std::isnan(t1.d) && std::isnan(t2.d) && - t1.nanPayload == t2.nanPayload)); +void Lexer::skipSpace() { + if (auto ctx = space(next())) { + index += ctx->span.size(); } -}; +} -struct IdTok { - friend std::ostream& operator<<(std::ostream& os, const IdTok&) { - return os << "id"; +void Lexer::lexToken() { + // TODO: Ensure we're getting the longest possible match. + Token tok; + if (auto t = lparen(next())) { + tok = Token{t->span, LParenTok{}}; + } else if (auto t = rparen(next())) { + tok = Token{t->span, RParenTok{}}; + } else if (auto t = ident(next())) { + tok = Token{t->span, IdTok{}}; + } else if (auto t = integer(next())) { + tok = Token{t->span, IntTok{t->n, t->signedness}}; + } else if (auto t = float_(next())) { + tok = Token{t->span, FloatTok{t->nanPayload, t->d}}; + } else if (auto t = str(next())) { + tok = Token{t->span, StringTok{t->str}}; + } else if (auto t = keyword(next())) { + tok = Token{t->span, KeywordTok{}}; + } else { + // TODO: Do something about lexing errors. + curr = std::nullopt; + return; } + index += tok.span.size(); + curr = {tok}; +} - friend bool operator==(const IdTok&, const IdTok&) { return true; } -}; - -struct StringTok { - std::optional<std::string> str; - - friend std::ostream& operator<<(std::ostream& os, const StringTok& tok) { - if (tok.str) { - os << '"' << *tok.str << '"'; +TextPos Lexer::position(const char* c) { + assert(size_t(c - buffer.data()) < buffer.size()); + TextPos pos{1, 0}; + for (const char* p = buffer.data(); p != c; ++p) { + if (*p == '\n') { + pos.line++; + pos.col = 0; } else { - os << "(raw string)"; + pos.col++; } - return os; - } - - friend bool operator==(const StringTok& t1, const StringTok& t2) { - return t1.str == t2.str; } -}; - -struct KeywordTok { - friend std::ostream& operator<<(std::ostream& os, const KeywordTok&) { - return os << "keyword"; - } - - friend bool operator==(const KeywordTok&, const KeywordTok&) { return true; } -}; - -struct Token { - using Data = std::variant<LParenTok, - RParenTok, - IntTok, - FloatTok, - IdTok, - StringTok, - KeywordTok>; - - std::string_view span; - Data data; - - // Suppress clang-tidy false positive about unused functions. - [[maybe_unused]] friend std::ostream& operator<<(std::ostream& os, - const Token& tok) { - std::visit([&](const auto& t) { os << t; }, tok.data); - return os << " \"" << tok.span << "\""; - } - - [[maybe_unused]] friend bool operator==(const Token& t1, const Token& t2) { - return t1.span == t2.span && - std::visit( - [](auto& d1, auto& d2) { - if constexpr (std::is_same_v<decltype(d1), decltype(d2)>) { - return d1 == d2; - } else { - return false; - } - }, - t1.data, - t2.data); - } -}; - -struct TextPos { - size_t line; - size_t col; + return pos; +} - bool operator==(const TextPos& other) const { - return line == other.line && col == other.col; - } - bool operator!=(const TextPos& other) const { return !(*this == other); } +bool TextPos::operator==(const TextPos& other) const { + return line == other.line && col == other.col; +} - // Suppress clang-tidy false positive about unused functions. - [[maybe_unused]] friend std::ostream& operator<<(std::ostream& os, - const TextPos& pos) { - return os << pos.line << ":" << pos.col; - } -}; +bool IntTok::operator==(const IntTok& other) const { + return n == other.n && signedness == other.signedness; +} -// Lexer's purpose is twofold. First, it wraps a buffer to provide a tokenizing -// iterator over it. Second, it implements that iterator itself. Also provides -// utilities for locating the text position of tokens within the buffer. Text -// positions are computed on demand rather than eagerly because they are -// typically only needed when there is an error to report. -struct Lexer { - using iterator = Lexer; - using difference_type = std::ptrdiff_t; - using value_type = Token; - using pointer = const Token*; - using reference = const Token&; - using iterator_category = std::forward_iterator_tag; - - std::string_view buffer; - size_t index = 0; - std::optional<Token> curr; - - // The end sentinel. - Lexer() = default; - - Lexer(std::string_view buffer) : buffer(buffer) { - skipSpace(); - lexToken(); - skipSpace(); - } +bool FloatTok::operator==(const FloatTok& other) const { + return std::signbit(d) == std::signbit(other.d) && + (d == other.d || (std::isnan(d) && std::isnan(other.d) && + nanPayload == other.nanPayload)); +} - std::string_view next() const { return buffer.substr(index); } +bool Token::operator==(const Token& other) const { + return span == other.span && + std::visit( + [](auto& t1, auto& t2) { + if constexpr (std::is_same_v<decltype(t1), decltype(t2)>) { + return t1 == t2; + } else { + return false; + } + }, + data, + other.data); +} - void skipSpace() { - if (auto ctx = space(next())) { - index += ctx->span.size(); - } - } +std::ostream& operator<<(std::ostream& os, const TextPos& pos) { + return os << pos.line << ":" << pos.col; +} - void lexToken() { - // TODO: Ensure we're getting the longest possible match. - Token tok; - if (auto t = lparen(next())) { - tok = Token{t->span, LParenTok{}}; - } else if (auto t = rparen(next())) { - tok = Token{t->span, RParenTok{}}; - } else if (auto t = ident(next())) { - tok = Token{t->span, IdTok{}}; - } else if (auto t = integer(next())) { - tok = Token{t->span, IntTok{t->n, t->signedness}}; - } else if (auto t = float_(next())) { - tok = Token{t->span, FloatTok{t->nanPayload, t->d}}; - } else if (auto t = str(next())) { - tok = Token{t->span, StringTok{t->str}}; - } else if (auto t = keyword(next())) { - tok = Token{t->span, KeywordTok{}}; - } else { - // TODO: Do something about lexing errors. - curr = std::nullopt; - return; - } - index += tok.span.size(); - curr = {tok}; - } +std::ostream& operator<<(std::ostream& os, const LParenTok&) { + return os << "'('"; +} - Lexer& operator++() { - // Preincrement - lexToken(); - skipSpace(); - return *this; - } +std::ostream& operator<<(std::ostream& os, const RParenTok&) { + return os << "')'"; +} - Lexer operator++(int) { - // Postincrement - Lexer ret = *this; - ++(*this); - return ret; - } +std::ostream& operator<<(std::ostream& os, const IdTok&) { return os << "id"; } - const Token& operator*() { return *curr; } - const Token* operator->() { return &*curr; } +std::ostream& operator<<(std::ostream& os, const IntTok& tok) { + return os << tok.n << (tok.signedness == Signed ? " signed" : " unsigned"); +} - bool operator==(const Lexer& other) const { - // The iterator is equal to the end sentinel when there is no current token. - if (!curr && !other.curr) { - return true; +std::ostream& operator<<(std::ostream& os, const FloatTok& tok) { + if (std::isnan(tok.d)) { + os << (std::signbit(tok.d) ? "+" : "-"); + if (tok.nanPayload) { + return os << "nan:0x" << std::hex << *tok.nanPayload << std::dec; } - // Otherwise they are equivalent when they are at the same position. - return index == other.index; + return os << "nan"; } + return os << tok.d; +} - bool operator!=(const Lexer& other) const { return !(*this == other); } - - Lexer begin() { return *this; } - - Lexer end() { return Lexer(); } - - TextPos position(const char* c) { - assert(size_t(c - buffer.data()) < buffer.size()); - TextPos pos{1, 0}; - for (const char* p = buffer.data(); p != c; ++p) { - if (*p == '\n') { - pos.line++; - pos.col = 0; - } else { - pos.col++; - } - } - return pos; +std::ostream& operator<<(std::ostream& os, const StringTok& tok) { + if (tok.str) { + os << '"' << *tok.str << '"'; + } else { + os << "(raw string)"; } + return os; +} - TextPos position(std::string_view span) { return position(span.data()); } - - TextPos position(Token tok) { return position(tok.span); } -}; +std::ostream& operator<<(std::ostream& os, const KeywordTok&) { + return os << "keyword"; +} -} // anonymous namespace +std::ostream& operator<<(std::ostream& os, const Token& tok) { + std::visit([&](const auto& t) { os << t; }, tok.data); + return os << " \"" << tok.span << "\""; +} } // namespace wasm::WATParser diff --git a/src/wat-lexer.h b/src/wat-lexer.h new file mode 100644 index 000000000..d5c3b33ce --- /dev/null +++ b/src/wat-lexer.h @@ -0,0 +1,182 @@ +/* + * Copyright 2022 WebAssembly Community Group participants + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cstddef> +#include <iterator> +#include <optional> +#include <ostream> +#include <string_view> +#include <variant> + +#ifndef wasm_wat_parser_h +#define wasm_wat_parser_h + +namespace wasm::WATParser { + +struct TextPos { + size_t line; + size_t col; + + bool operator==(const TextPos& other) const; + bool operator!=(const TextPos& other) const { return !(*this == other); } + + friend std::ostream& operator<<(std::ostream& os, const TextPos& pos); +}; + +// ====== +// Tokens +// ====== + +struct LParenTok { + bool operator==(const LParenTok&) const { return true; } + friend std::ostream& operator<<(std::ostream&, const LParenTok&); +}; + +struct RParenTok { + bool operator==(const RParenTok&) const { return true; } + friend std::ostream& operator<<(std::ostream&, const RParenTok&); +}; + +struct IdTok { + bool operator==(const IdTok&) const { return true; } + friend std::ostream& operator<<(std::ostream&, const IdTok&); +}; + +enum Signedness { Unsigned, Signed }; + +struct IntTok { + uint64_t n; + Signedness signedness; + + bool operator==(const IntTok&) const; + friend std::ostream& operator<<(std::ostream&, const IntTok&); +}; + +struct FloatTok { + // The payload if we lexed a nan with payload. We cannot store the payload + // directly in `d` because we do not know at this point whether we are parsing + // an f32 or f64 and therefore we do not know what the allowable payloads are. + std::optional<uint64_t> nanPayload; + double d; + + bool operator==(const FloatTok&) const; + friend std::ostream& operator<<(std::ostream&, const FloatTok&); +}; + +struct StringTok { + std::optional<std::string> str; + + bool operator==(const StringTok& other) const { return str == other.str; } + friend std::ostream& operator<<(std::ostream&, const StringTok&); +}; + +struct KeywordTok { + bool operator==(const KeywordTok&) const { return true; } + friend std::ostream& operator<<(std::ostream&, const KeywordTok&); +}; + +struct Token { + using Data = std::variant<LParenTok, + RParenTok, + IdTok, + IntTok, + FloatTok, + StringTok, + KeywordTok>; + std::string_view span; + Data data; + + bool operator==(const Token&) const; + friend std::ostream& operator<<(std::ostream& os, const Token&); +}; + +// ===== +// Lexer +// ===== + +// Lexer's purpose is twofold. First, it wraps a buffer to provide a tokenizing +// iterator over it. Second, it implements that iterator itself. Also provides +// utilities for locating the text position of tokens within the buffer. Text +// positions are computed on demand rather than eagerly because they are +// typically only needed when there is an error to report. +struct Lexer { + using iterator = Lexer; + using difference_type = std::ptrdiff_t; + using value_type = Token; + using pointer = const Token*; + using reference = const Token&; + using iterator_category = std::forward_iterator_tag; + +private: + std::string_view buffer; + size_t index = 0; + std::optional<Token> curr; + +public: + // The end sentinel. + Lexer() = default; + + Lexer(std::string_view buffer) : buffer(buffer) { + skipSpace(); + lexToken(); + skipSpace(); + } + + std::string_view next() const { return buffer.substr(index); } + Lexer& operator++() { + // Preincrement + lexToken(); + skipSpace(); + return *this; + } + + Lexer operator++(int) { + // Postincrement + Lexer ret = *this; + ++(*this); + return ret; + } + + const Token& operator*() { return *curr; } + const Token* operator->() { return &*curr; } + + bool operator==(const Lexer& other) const { + // The iterator is equal to the end sentinel when there is no current token. + if (!curr && !other.curr) { + return true; + } + // Otherwise they are equivalent when they are at the same position. + return index == other.index; + } + + bool operator!=(const Lexer& other) const { return !(*this == other); } + + Lexer begin() { return *this; } + + Lexer end() { return Lexer(); } + + TextPos position(const char* c); + TextPos position(std::string_view span) { return position(span.data()); } + TextPos position(Token tok) { return position(tok.span); } + +private: + void skipSpace(); + void lexToken(); +}; + +} // namespace wasm::WATParser + +#endif // wasm_wat_parser_h |