diff options
author | Thomas Lively <tlively@google.com> | 2024-04-25 22:56:36 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-25 22:56:36 -0700 |
commit | abd51437426c72a2d2f8195da5d5cf570941b805 (patch) | |
tree | bed515cd6cb6b582721ceef0ce1cbf7397be307c /src/parser/lexer.cpp | |
parent | df6878612a32d50408fa9dc71e84199bc823a250 (diff) | |
download | binaryen-abd51437426c72a2d2f8195da5d5cf570941b805.tar.gz binaryen-abd51437426c72a2d2f8195da5d5cf570941b805.tar.bz2 binaryen-abd51437426c72a2d2f8195da5d5cf570941b805.zip |
[Parser] Do not eagerly lex numbers (#6544)
Lex integers and floats on demand to avoid wasted work. Remove `Token`
completely now that all kinds of tokens are lexed on demand.
Diffstat (limited to 'src/parser/lexer.cpp')
-rw-r--r-- | src/parser/lexer.cpp | 321 |
1 file changed, 132 insertions, 189 deletions
diff --git a/src/parser/lexer.cpp b/src/parser/lexer.cpp index f3646c0be..87a9d12f3 100644 --- a/src/parser/lexer.cpp +++ b/src/parser/lexer.cpp @@ -123,10 +123,25 @@ std::optional<int> getHexDigit(char c) { return {}; } +enum Sign { NoSign, Pos, Neg }; + // The result of lexing an integer token fragment. struct LexIntResult : LexResult { uint64_t n; Sign sign; + + template<typename T> bool isUnsigned() { + static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>); + return sign == NoSign && n <= std::numeric_limits<T>::max(); + } + + template<typename T> bool isSigned() { + static_assert(std::is_integral_v<T> && std::is_signed_v<T>); + if (sign == Neg) { + return uint64_t(std::numeric_limits<T>::min()) <= n || n == 0; + } + return n <= uint64_t(std::numeric_limits<T>::max()); + } }; // Lexing context that accumulates lexed input to produce an integer token @@ -887,123 +902,6 @@ std::optional<LexResult> keyword(std::string_view in) { } // anonymous namespace -template<typename T> std::optional<T> Token::getU() const { - static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>); - if (auto* tok = std::get_if<IntTok>(&data)) { - if (tok->sign == NoSign && tok->n <= std::numeric_limits<T>::max()) { - return T(tok->n); - } - // TODO: Add error production for unsigned overflow. 
- } - return {}; -} - -template<typename T> std::optional<T> Token::getS() const { - static_assert(std::is_integral_v<T> && std::is_signed_v<T>); - if (auto* tok = std::get_if<IntTok>(&data)) { - if (tok->sign == Neg) { - if (uint64_t(std::numeric_limits<T>::min()) <= tok->n || tok->n == 0) { - return T(tok->n); - } - } else { - if (tok->n <= uint64_t(std::numeric_limits<T>::max())) { - return T(tok->n); - } - } - } - return {}; -} - -template<typename T> std::optional<T> Token::getI() const { - static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>); - if (auto n = getU<T>()) { - return *n; - } - if (auto n = getS<std::make_signed_t<T>>()) { - return T(*n); - } - return {}; -} - -template std::optional<uint64_t> Token::getU<uint64_t>() const; -template std::optional<int64_t> Token::getS<int64_t>() const; -template std::optional<uint64_t> Token::getI<uint64_t>() const; -template std::optional<uint32_t> Token::getU<uint32_t>() const; -template std::optional<int32_t> Token::getS<int32_t>() const; -template std::optional<uint32_t> Token::getI<uint32_t>() const; -template std::optional<uint16_t> Token::getU<uint16_t>() const; -template std::optional<int16_t> Token::getS<int16_t>() const; -template std::optional<uint16_t> Token::getI<uint16_t>() const; -template std::optional<uint8_t> Token::getU<uint8_t>() const; -template std::optional<int8_t> Token::getS<int8_t>() const; -template std::optional<uint8_t> Token::getI<uint8_t>() const; - -std::optional<double> Token::getF64() const { - constexpr int signif = 52; - constexpr uint64_t payloadMask = (1ull << signif) - 1; - constexpr uint64_t nanDefault = 1ull << (signif - 1); - if (auto* tok = std::get_if<FloatTok>(&data)) { - double d = tok->d; - if (std::isnan(d)) { - // Inject payload. - uint64_t payload = tok->nanPayload ? *tok->nanPayload : nanDefault; - if (payload == 0 || payload > payloadMask) { - // TODO: Add error production for out-of-bounds payload. 
- return {}; - } - uint64_t bits; - static_assert(sizeof(bits) == sizeof(d)); - memcpy(&bits, &d, sizeof(bits)); - bits = (bits & ~payloadMask) | payload; - memcpy(&d, &bits, sizeof(bits)); - } - return d; - } - if (auto* tok = std::get_if<IntTok>(&data)) { - if (tok->sign == Neg) { - if (tok->n == 0) { - return -0.0; - } - return double(int64_t(tok->n)); - } - return double(tok->n); - } - return {}; -} - -std::optional<float> Token::getF32() const { - constexpr int signif = 23; - constexpr uint32_t payloadMask = (1u << signif) - 1; - constexpr uint64_t nanDefault = 1ull << (signif - 1); - if (auto* tok = std::get_if<FloatTok>(&data)) { - float f = tok->d; - if (std::isnan(f)) { - // Validate and inject payload. - uint64_t payload = tok->nanPayload ? *tok->nanPayload : nanDefault; - if (payload == 0 || payload > payloadMask) { - // TODO: Add error production for out-of-bounds payload. - return {}; - } - uint32_t bits; - static_assert(sizeof(bits) == sizeof(f)); - memcpy(&bits, &f, sizeof(bits)); - bits = (bits & ~payloadMask) | payload; - memcpy(&f, &bits, sizeof(bits)); - } - return f; - } - if (auto* tok = std::get_if<IntTok>(&data)) { - if (tok->sign == Neg) { - if (tok->n == 0) { - return -0.0f; - } - return float(int64_t(tok->n)); - } - return float(tok->n); - } - return {}; -} - void Lexer::skipSpace() { while (true) { if (auto ctx = annotation(next())) { @@ -1020,9 +918,6 @@ void Lexer::skipSpace() { } bool Lexer::takeLParen() { - if (curr) { - return false; - } if (LexCtx(next()).startsWith("("sv)) { ++index; advance(); @@ -1032,9 +927,6 @@ bool Lexer::takeLParen() { } bool Lexer::takeRParen() { - if (curr) { - return false; - } if (LexCtx(next()).startsWith(")"sv)) { ++index; advance(); @@ -1044,9 +936,6 @@ bool Lexer::takeRParen() { } std::optional<std::string> Lexer::takeString() { - if (curr) { - return std::nullopt; - } if (auto result = str(next())) { index += result->span.size(); advance(); @@ -1060,9 +949,6 @@ std::optional<std::string> 
Lexer::takeString() { } std::optional<Name> Lexer::takeID() { - if (curr) { - return std::nullopt; - } if (auto result = ident(next())) { index += result->span.size(); advance(); @@ -1080,9 +966,6 @@ std::optional<Name> Lexer::takeID() { } std::optional<std::string_view> Lexer::takeKeyword() { - if (curr) { - return std::nullopt; - } if (auto result = keyword(next())) { index += result->span.size(); advance(); @@ -1130,20 +1013,124 @@ std::optional<uint32_t> Lexer::takeAlign() { return std::nullopt; } -void Lexer::lexToken() { - // TODO: Ensure we're getting the longest possible match. - Token tok; - if (auto t = integer(next())) { - tok = Token{t->span, IntTok{t->n, t->sign}}; - } else if (auto t = float_(next())) { - tok = Token{t->span, FloatTok{t->nanPayload, t->d}}; - } else { - // TODO: Do something about lexing errors. - curr = std::nullopt; - return; +template<typename T> std::optional<T> Lexer::takeU() { + static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>); + if (auto result = integer(next()); result && result->isUnsigned<T>()) { + index += result->span.size(); + advance(); + return T(result->n); + } + // TODO: Add error production for unsigned overflow. 
+ return std::nullopt; +} + +template<typename T> std::optional<T> Lexer::takeS() { + static_assert(std::is_integral_v<T> && std::is_signed_v<T>); + if (auto result = integer(next()); result && result->isSigned<T>()) { + index += result->span.size(); + advance(); + return T(result->n); + } + return std::nullopt; +} + +template<typename T> std::optional<T> Lexer::takeI() { + static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>); + if (auto result = integer(next())) { + if (result->isUnsigned<T>() || result->isSigned<std::make_signed_t<T>>()) { + index += result->span.size(); + advance(); + return T(result->n); + } } - index += tok.span.size(); - curr = {tok}; + return std::nullopt; +} + +template std::optional<uint64_t> Lexer::takeU<uint64_t>(); +template std::optional<int64_t> Lexer::takeS<int64_t>(); +template std::optional<uint64_t> Lexer::takeI<uint64_t>(); +template std::optional<uint32_t> Lexer::takeU<uint32_t>(); +template std::optional<int32_t> Lexer::takeS<int32_t>(); +template std::optional<uint32_t> Lexer::takeI<uint32_t>(); +template std::optional<uint16_t> Lexer::takeU<uint16_t>(); +template std::optional<int16_t> Lexer::takeS<int16_t>(); +template std::optional<uint16_t> Lexer::takeI<uint16_t>(); +template std::optional<uint8_t> Lexer::takeU<uint8_t>(); +template std::optional<int8_t> Lexer::takeS<int8_t>(); +template std::optional<uint8_t> Lexer::takeI<uint8_t>(); + +std::optional<double> Lexer::takeF64() { + constexpr int signif = 52; + constexpr uint64_t payloadMask = (1ull << signif) - 1; + constexpr uint64_t nanDefault = 1ull << (signif - 1); + if (auto result = float_(next())) { + double d = result->d; + if (std::isnan(d)) { + // Inject payload. + uint64_t payload = result->nanPayload ? *result->nanPayload : nanDefault; + if (payload == 0 || payload > payloadMask) { + // TODO: Add error production for out-of-bounds payload. 
+ return std::nullopt; + } + uint64_t bits; + static_assert(sizeof(bits) == sizeof(d)); + memcpy(&bits, &d, sizeof(bits)); + bits = (bits & ~payloadMask) | payload; + memcpy(&d, &bits, sizeof(bits)); + } + index += result->span.size(); + advance(); + return d; + } + if (auto result = integer(next())) { + index += result->span.size(); + advance(); + if (result->sign == Neg) { + if (result->n == 0) { + return -0.0; + } + return double(int64_t(result->n)); + } + return double(result->n); + } + return std::nullopt; +} + +std::optional<float> Lexer::takeF32() { + constexpr int signif = 23; + constexpr uint32_t payloadMask = (1u << signif) - 1; + constexpr uint64_t nanDefault = 1ull << (signif - 1); + if (auto result = float_(next())) { + float f = result->d; + if (std::isnan(f)) { + // Validate and inject payload. + uint64_t payload = result->nanPayload ? *result->nanPayload : nanDefault; + if (payload == 0 || payload > payloadMask) { + // TODO: Add error production for out-of-bounds payload. 
+ return std::nullopt; + } + uint32_t bits; + static_assert(sizeof(bits) == sizeof(f)); + memcpy(&bits, &f, sizeof(bits)); + bits = (bits & ~payloadMask) | payload; + memcpy(&f, &bits, sizeof(bits)); + } + index += result->span.size(); + advance(); + return f; + } + if (auto result = integer(next())) { + index += result->span.size(); + advance(); + if (result->sign == Neg) { + if (result->n == 0) { + return -0.0f; + } + return float(int64_t(result->n)); + } + return float(result->n); + } + return std::nullopt; } TextPos Lexer::position(const char* c) const { @@ -1164,52 +1151,8 @@ bool TextPos::operator==(const TextPos& other) const { return line == other.line && col == other.col; } -bool IntTok::operator==(const IntTok& other) const { - return n == other.n && sign == other.sign; -} - -bool FloatTok::operator==(const FloatTok& other) const { - return std::signbit(d) == std::signbit(other.d) && - (d == other.d || (std::isnan(d) && std::isnan(other.d) && - nanPayload == other.nanPayload)); -} - -bool Token::operator==(const Token& other) const { - return span == other.span && - std::visit( - [](auto& t1, auto& t2) { - if constexpr (std::is_same_v<decltype(t1), decltype(t2)>) { - return t1 == t2; - } else { - return false; - } - }, - data, - other.data); -} - std::ostream& operator<<(std::ostream& os, const TextPos& pos) { return os << pos.line << ":" << pos.col; } -std::ostream& operator<<(std::ostream& os, const IntTok& tok) { - return os << (tok.sign == Pos ? "+" : tok.sign == Neg ? "-" : "") << tok.n; -} - -std::ostream& operator<<(std::ostream& os, const FloatTok& tok) { - if (std::isnan(tok.d)) { - os << (std::signbit(tok.d) ? 
"+" : "-"); - if (tok.nanPayload) { - return os << "nan:0x" << std::hex << *tok.nanPayload << std::dec; - } - return os << "nan"; - } - return os << tok.d; -} - -std::ostream& operator<<(std::ostream& os, const Token& tok) { - std::visit([&](const auto& t) { os << t; }, tok.data); - return os << " \"" << tok.span << "\""; -} - } // namespace wasm::WATParser |