From abd51437426c72a2d2f8195da5d5cf570941b805 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Thu, 25 Apr 2024 22:56:36 -0700 Subject: [Parser] Do not eagerly lex numbers (#6544) Lex integers and floats on demand to avoid wasted work. Remove `Token` completely now that all kinds of tokens are lexed on demand. --- src/parser/lexer.cpp | 321 +++++++++++++++++++++------------------------------ 1 file changed, 132 insertions(+), 189 deletions(-) (limited to 'src/parser/lexer.cpp') diff --git a/src/parser/lexer.cpp b/src/parser/lexer.cpp index f3646c0be..87a9d12f3 100644 --- a/src/parser/lexer.cpp +++ b/src/parser/lexer.cpp @@ -123,10 +123,25 @@ std::optional getHexDigit(char c) { return {}; } +enum Sign { NoSign, Pos, Neg }; + // The result of lexing an integer token fragment. struct LexIntResult : LexResult { uint64_t n; Sign sign; + + template bool isUnsigned() { + static_assert(std::is_integral_v && std::is_unsigned_v); + return sign == NoSign && n <= std::numeric_limits::max(); + } + + template bool isSigned() { + static_assert(std::is_integral_v && std::is_signed_v); + if (sign == Neg) { + return uint64_t(std::numeric_limits::min()) <= n || n == 0; + } + return n <= uint64_t(std::numeric_limits::max()); + } }; // Lexing context that accumulates lexed input to produce an integer token @@ -887,123 +902,6 @@ std::optional keyword(std::string_view in) { } // anonymous namespace -template std::optional Token::getU() const { - static_assert(std::is_integral_v && std::is_unsigned_v); - if (auto* tok = std::get_if(&data)) { - if (tok->sign == NoSign && tok->n <= std::numeric_limits::max()) { - return T(tok->n); - } - // TODO: Add error production for unsigned overflow. - } - return {}; -} - -template std::optional Token::getS() const { - static_assert(std::is_integral_v && std::is_signed_v); - if (auto* tok = std::get_if(&data)) { - if (tok->sign == Neg) { - if (uint64_t(std::numeric_limits::min()) <= tok->n || tok->n == 0) { - return T(tok->n); - } - } else { - if (tok->n <= uint64_t(std::numeric_limits::max())) { - return T(tok->n); - } - } - } - return {}; -} - -template std::optional Token::getI() const { - static_assert(std::is_integral_v && std::is_unsigned_v); - if (auto n = getU()) { - return *n; - } - if (auto n = getS>()) { - return T(*n); - } - return {}; -} - -template std::optional Token::getU() const; -template std::optional Token::getS() const; -template std::optional Token::getI() const; -template std::optional Token::getU() const; -template std::optional Token::getS() const; -template std::optional Token::getI() const; -template std::optional Token::getU() const; -template std::optional Token::getS() const; -template std::optional Token::getI() const; -template std::optional Token::getU() const; -template std::optional Token::getS() const; -template std::optional Token::getI() const; - -std::optional Token::getF64() const { - constexpr int signif = 52; - constexpr uint64_t payloadMask = (1ull << signif) - 1; - constexpr uint64_t nanDefault = 1ull << (signif - 1); - if (auto* tok = std::get_if(&data)) { - double d = tok->d; - if (std::isnan(d)) { - // Inject payload. - uint64_t payload = tok->nanPayload ? *tok->nanPayload : nanDefault; - if (payload == 0 || payload > payloadMask) { - // TODO: Add error production for out-of-bounds payload. - return {}; - } - uint64_t bits; - static_assert(sizeof(bits) == sizeof(d)); - memcpy(&bits, &d, sizeof(bits)); - bits = (bits & ~payloadMask) | payload; - memcpy(&d, &bits, sizeof(bits)); - } - return d; - } - if (auto* tok = std::get_if(&data)) { - if (tok->sign == Neg) { - if (tok->n == 0) { - return -0.0; - } - return double(int64_t(tok->n)); - } - return double(tok->n); - } - return {}; -} - -std::optional Token::getF32() const { - constexpr int signif = 23; - constexpr uint32_t payloadMask = (1u << signif) - 1; - constexpr uint64_t nanDefault = 1ull << (signif - 1); - if (auto* tok = std::get_if(&data)) { - float f = tok->d; - if (std::isnan(f)) { - // Validate and inject payload. - uint64_t payload = tok->nanPayload ? *tok->nanPayload : nanDefault; - if (payload == 0 || payload > payloadMask) { - // TODO: Add error production for out-of-bounds payload. - return {}; - } - uint32_t bits; - static_assert(sizeof(bits) == sizeof(f)); - memcpy(&bits, &f, sizeof(bits)); - bits = (bits & ~payloadMask) | payload; - memcpy(&f, &bits, sizeof(bits)); - } - return f; - } - if (auto* tok = std::get_if(&data)) { - if (tok->sign == Neg) { - if (tok->n == 0) { - return -0.0f; - } - return float(int64_t(tok->n)); - } - return float(tok->n); - } - return {}; -} - void Lexer::skipSpace() { while (true) { if (auto ctx = annotation(next())) { @@ -1020,9 +918,6 @@ void Lexer::skipSpace() { } bool Lexer::takeLParen() { - if (curr) { - return false; - } if (LexCtx(next()).startsWith("("sv)) { ++index; advance(); @@ -1032,9 +927,6 @@ bool Lexer::takeLParen() { } bool Lexer::takeRParen() { - if (curr) { - return false; - } if (LexCtx(next()).startsWith(")"sv)) { ++index; advance(); @@ -1044,9 +936,6 @@ bool Lexer::takeRParen() { } std::optional Lexer::takeString() { - if (curr) { - return std::nullopt; - } if (auto result = str(next())) { index += result->span.size(); advance(); @@ -1060,9 +949,6 @@ std::optional Lexer::takeString() { } std::optional Lexer::takeID() { - if (curr) { - return std::nullopt; - } if (auto result = ident(next())) { index += result->span.size(); advance(); @@ -1080,9 +966,6 @@ std::optional Lexer::takeID() { } std::optional Lexer::takeKeyword() { - if (curr) { - return std::nullopt; - } if (auto result = keyword(next())) { index += result->span.size(); advance(); @@ -1130,20 +1013,124 @@ std::optional Lexer::takeAlign() { return std::nullopt; } -void Lexer::lexToken() { - // TODO: Ensure we're getting the longest possible match. - Token tok; - if (auto t = integer(next())) { - tok = Token{t->span, IntTok{t->n, t->sign}}; - } else if (auto t = float_(next())) { - tok = Token{t->span, FloatTok{t->nanPayload, t->d}}; - } else { - // TODO: Do something about lexing errors. - curr = std::nullopt; - return; +template std::optional Lexer::takeU() { + static_assert(std::is_integral_v && std::is_unsigned_v); + if (auto result = integer(next()); result && result->isUnsigned()) { + index += result->span.size(); + advance(); + return T(result->n); + } + // TODO: Add error production for unsigned overflow. + return std::nullopt; +} + +template std::optional Lexer::takeS() { + static_assert(std::is_integral_v && std::is_signed_v); + if (auto result = integer(next()); result && result->isSigned()) { + index += result->span.size(); + advance(); + return T(result->n); + } + return std::nullopt; +} + +template std::optional Lexer::takeI() { + static_assert(std::is_integral_v && std::is_unsigned_v); + if (auto result = integer(next())) { + if (result->isUnsigned() || result->isSigned>()) { + index += result->span.size(); + advance(); + return T(result->n); + } } - index += tok.span.size(); - curr = {tok}; + return std::nullopt; +} + +template std::optional Lexer::takeU(); +template std::optional Lexer::takeS(); +template std::optional Lexer::takeI(); +template std::optional Lexer::takeU(); +template std::optional Lexer::takeS(); +template std::optional Lexer::takeI(); +template std::optional Lexer::takeU(); +template std::optional Lexer::takeS(); +template std::optional Lexer::takeI(); +template std::optional Lexer::takeU(); +template std::optional Lexer::takeS(); +template std::optional Lexer::takeI(); + +std::optional Lexer::takeF64() { + constexpr int signif = 52; + constexpr uint64_t payloadMask = (1ull << signif) - 1; + constexpr uint64_t nanDefault = 1ull << (signif - 1); + if (auto result = float_(next())) { + double d = result->d; + if (std::isnan(d)) { + // Inject payload. + uint64_t payload = result->nanPayload ? *result->nanPayload : nanDefault; + if (payload == 0 || payload > payloadMask) { + // TODO: Add error production for out-of-bounds payload. + return std::nullopt; + } + uint64_t bits; + static_assert(sizeof(bits) == sizeof(d)); + memcpy(&bits, &d, sizeof(bits)); + bits = (bits & ~payloadMask) | payload; + memcpy(&d, &bits, sizeof(bits)); + } + index += result->span.size(); + advance(); + return d; + } + if (auto result = integer(next())) { + index += result->span.size(); + advance(); + if (result->sign == Neg) { + if (result->n == 0) { + return -0.0; + } + return double(int64_t(result->n)); + } + return double(result->n); + } + return std::nullopt; +} + +std::optional Lexer::takeF32() { + constexpr int signif = 23; + constexpr uint32_t payloadMask = (1u << signif) - 1; + constexpr uint64_t nanDefault = 1ull << (signif - 1); + if (auto result = float_(next())) { + float f = result->d; + if (std::isnan(f)) { + // Validate and inject payload. + uint64_t payload = result->nanPayload ? *result->nanPayload : nanDefault; + if (payload == 0 || payload > payloadMask) { + // TODO: Add error production for out-of-bounds payload. + return std::nullopt; + } + uint32_t bits; + static_assert(sizeof(bits) == sizeof(f)); + memcpy(&bits, &f, sizeof(bits)); + bits = (bits & ~payloadMask) | payload; + memcpy(&f, &bits, sizeof(bits)); + } + index += result->span.size(); + advance(); + return f; + } + if (auto result = integer(next())) { + index += result->span.size(); + advance(); + if (result->sign == Neg) { + if (result->n == 0) { + return -0.0f; + } + return float(int64_t(result->n)); + } + return float(result->n); + } + return std::nullopt; } TextPos Lexer::position(const char* c) const { @@ -1164,52 +1151,8 @@ bool TextPos::operator==(const TextPos& other) const { return line == other.line && col == other.col; } -bool IntTok::operator==(const IntTok& other) const { - return n == other.n && sign == other.sign; -} - -bool FloatTok::operator==(const FloatTok& other) const { - return std::signbit(d) == std::signbit(other.d) && - (d == other.d || (std::isnan(d) && std::isnan(other.d) && - nanPayload == other.nanPayload)); -} - -bool Token::operator==(const Token& other) const { - return span == other.span && - std::visit( - [](auto& t1, auto& t2) { - if constexpr (std::is_same_v) { - return t1 == t2; - } else { - return false; - } - }, - data, - other.data); -} - std::ostream& operator<<(std::ostream& os, const TextPos& pos) { return os << pos.line << ":" << pos.col; } -std::ostream& operator<<(std::ostream& os, const IntTok& tok) { - return os << (tok.sign == Pos ? "+" : tok.sign == Neg ? "-" : "") << tok.n; -} - -std::ostream& operator<<(std::ostream& os, const FloatTok& tok) { - if (std::isnan(tok.d)) { - os << (std::signbit(tok.d) ? "+" : "-"); - if (tok.nanPayload) { - return os << "nan:0x" << std::hex << *tok.nanPayload << std::dec; - } - return os << "nan"; - } - return os << tok.d; -} - -std::ostream& operator<<(std::ostream& os, const Token& tok) { - std::visit([&](const auto& t) { os << t; }, tok.data); - return os << " \"" << tok.span << "\""; -} - } // namespace wasm::WATParser -- cgit v1.2.3