diff options
Diffstat (limited to 'src/parser/lexer.h')
-rw-r--r-- | src/parser/lexer.h | 227 |
1 files changed, 227 insertions, 0 deletions
diff --git a/src/parser/lexer.h b/src/parser/lexer.h new file mode 100644 index 000000000..67d29b002 --- /dev/null +++ b/src/parser/lexer.h @@ -0,0 +1,227 @@ +/* + * Copyright 2023 WebAssembly Community Group participants + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <iterator> +#include <optional> +#include <ostream> +#include <string_view> +#include <variant> + +#ifndef parser_lexer_h +#define parser_lexer_h + +namespace wasm::WATParser { + +struct TextPos { + size_t line; + size_t col; + + bool operator==(const TextPos& other) const; + bool operator!=(const TextPos& other) const { return !(*this == other); } + + friend std::ostream& operator<<(std::ostream& os, const TextPos& pos); +}; + +// ====== +// Tokens +// ====== + +struct LParenTok { + bool operator==(const LParenTok&) const { return true; } + friend std::ostream& operator<<(std::ostream&, const LParenTok&); +}; + +struct RParenTok { + bool operator==(const RParenTok&) const { return true; } + friend std::ostream& operator<<(std::ostream&, const RParenTok&); +}; + +struct IdTok { + bool operator==(const IdTok&) const { return true; } + friend std::ostream& operator<<(std::ostream&, const IdTok&); +}; + +enum Sign { NoSign, Pos, Neg }; + +struct IntTok { + uint64_t n; + Sign sign; + + bool operator==(const IntTok&) const; + friend std::ostream& operator<<(std::ostream&, const IntTok&); +}; + +struct FloatTok { + // The payload if we lexed a nan with payload. We cannot store the payload + // directly in `d` because we do not know at this point whether we are parsing + // an f32 or f64 and therefore we do not know what the allowable payloads are. + // No payload with NaN means to use the default payload for the expected float + // width. + std::optional<uint64_t> nanPayload; + double d; + + bool operator==(const FloatTok&) const; + friend std::ostream& operator<<(std::ostream&, const FloatTok&); +}; + +struct StringTok { + std::optional<std::string> str; + + bool operator==(const StringTok& other) const { return str == other.str; } + friend std::ostream& operator<<(std::ostream&, const StringTok&); +}; + +struct KeywordTok { + bool operator==(const KeywordTok&) const { return true; } + friend std::ostream& operator<<(std::ostream&, const KeywordTok&); +}; + +struct Token { + using Data = std::variant<LParenTok, + RParenTok, + IdTok, + IntTok, + FloatTok, + StringTok, + KeywordTok>; + std::string_view span; + Data data; + + // ==================== + // Token classification + // ==================== + + bool isLParen() const { return std::get_if<LParenTok>(&data); } + + bool isRParen() const { return std::get_if<RParenTok>(&data); } + + std::optional<std::string_view> getID() const { + if (std::get_if<IdTok>(&data)) { + // Drop leading '$'. + return span.substr(1); + } + return {}; + } + + std::optional<std::string_view> getKeyword() const { + if (std::get_if<KeywordTok>(&data)) { + return span; + } + return {}; + } + std::optional<uint64_t> getU64() const; + std::optional<int64_t> getS64() const; + std::optional<uint64_t> getI64() const; + std::optional<uint32_t> getU32() const; + std::optional<int32_t> getS32() const; + std::optional<uint32_t> getI32() const; + std::optional<double> getF64() const; + std::optional<float> getF32() const; + std::optional<std::string_view> getString() const; + + bool operator==(const Token&) const; + friend std::ostream& operator<<(std::ostream& os, const Token&); +}; + +// ===== +// Lexer +// ===== + +// Lexer's purpose is twofold. First, it wraps a buffer to provide a tokenizing +// iterator over it. Second, it implements that iterator itself. Also provides +// utilities for locating the text position of tokens within the buffer. Text +// positions are computed on demand rather than eagerly because they are +// typically only needed when there is an error to report. +struct Lexer { + using iterator = Lexer; + using difference_type = std::ptrdiff_t; + using value_type = Token; + using pointer = const Token*; + using reference = const Token&; + using iterator_category = std::forward_iterator_tag; + +private: + std::string_view buffer; + size_t index = 0; + std::optional<Token> curr; + +public: + // The end sentinel. + Lexer() = default; + + Lexer(std::string_view buffer) : buffer(buffer) { setIndex(0); } + + size_t getIndex() const { return index; } + + void setIndex(size_t i) { + index = i; + skipSpace(); + lexToken(); + } + + std::string_view next() const { return buffer.substr(index); } + Lexer& operator++() { + // Preincrement + skipSpace(); + lexToken(); + return *this; + } + + Lexer operator++(int) { + // Postincrement + Lexer ret = *this; + ++(*this); + return ret; + } + + const Token& operator*() { return *curr; } + const Token* operator->() { return &*curr; } + + bool operator==(const Lexer& other) const { + // The iterator is equal to the end sentinel when there is no current token. + if (!curr && !other.curr) { + return true; + } + // Otherwise they are equivalent when they are at the same position. + return index == other.index; + } + + bool operator!=(const Lexer& other) const { return !(*this == other); } + + Lexer begin() { return *this; } + + Lexer end() const { return Lexer(); } + + bool empty() const { return *this == end(); } + + TextPos position(const char* c) const; + TextPos position(size_t i) const { return position(buffer.data() + i); } + TextPos position(std::string_view span) const { + return position(span.data()); + } + TextPos position(Token tok) const { return position(tok.span); } + +private: + void skipSpace(); + void lexToken(); +}; + +} // namespace wasm::WATParser + +#endif // parser_lexer_h |