summary | refs | log | tree | commit | diff
path: root/src/parser/lexer.cpp
diff options
context:
space:
mode:
author: Thomas Lively <tlively@google.com>, 2024-04-25 22:56:36 -0700
committer: GitHub <noreply@github.com>, 2024-04-25 22:56:36 -0700
commit: abd51437426c72a2d2f8195da5d5cf570941b805 (patch)
tree: bed515cd6cb6b582721ceef0ce1cbf7397be307c /src/parser/lexer.cpp
parent: df6878612a32d50408fa9dc71e84199bc823a250 (diff)
download: binaryen-abd51437426c72a2d2f8195da5d5cf570941b805.tar.gz
binaryen-abd51437426c72a2d2f8195da5d5cf570941b805.tar.bz2
binaryen-abd51437426c72a2d2f8195da5d5cf570941b805.zip
[Parser] Do not eagerly lex numbers (#6544)
Lex integers and floats on demand to avoid wasted work. Remove `Token` completely now that all kinds of tokens are lexed on demand.
Diffstat (limited to 'src/parser/lexer.cpp')
-rw-r--r-- src/parser/lexer.cpp 321
1 files changed, 132 insertions, 189 deletions
diff --git a/src/parser/lexer.cpp b/src/parser/lexer.cpp
index f3646c0be..87a9d12f3 100644
--- a/src/parser/lexer.cpp
+++ b/src/parser/lexer.cpp
@@ -123,10 +123,25 @@ std::optional<int> getHexDigit(char c) {
return {};
}
+enum Sign { NoSign, Pos, Neg };
+
// The result of lexing an integer token fragment.
struct LexIntResult : LexResult {
uint64_t n;
Sign sign;
+
+ template<typename T> bool isUnsigned() {
+ static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>);
+ return sign == NoSign && n <= std::numeric_limits<T>::max();
+ }
+
+ template<typename T> bool isSigned() {
+ static_assert(std::is_integral_v<T> && std::is_signed_v<T>);
+ if (sign == Neg) {
+ return uint64_t(std::numeric_limits<T>::min()) <= n || n == 0;
+ }
+ return n <= uint64_t(std::numeric_limits<T>::max());
+ }
};
// Lexing context that accumulates lexed input to produce an integer token
@@ -887,123 +902,6 @@ std::optional<LexResult> keyword(std::string_view in) {
} // anonymous namespace
-template<typename T> std::optional<T> Token::getU() const {
- static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>);
- if (auto* tok = std::get_if<IntTok>(&data)) {
- if (tok->sign == NoSign && tok->n <= std::numeric_limits<T>::max()) {
- return T(tok->n);
- }
- // TODO: Add error production for unsigned overflow.
- }
- return {};
-}
-
-template<typename T> std::optional<T> Token::getS() const {
- static_assert(std::is_integral_v<T> && std::is_signed_v<T>);
- if (auto* tok = std::get_if<IntTok>(&data)) {
- if (tok->sign == Neg) {
- if (uint64_t(std::numeric_limits<T>::min()) <= tok->n || tok->n == 0) {
- return T(tok->n);
- }
- } else {
- if (tok->n <= uint64_t(std::numeric_limits<T>::max())) {
- return T(tok->n);
- }
- }
- }
- return {};
-}
-
-template<typename T> std::optional<T> Token::getI() const {
- static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>);
- if (auto n = getU<T>()) {
- return *n;
- }
- if (auto n = getS<std::make_signed_t<T>>()) {
- return T(*n);
- }
- return {};
-}
-
-template std::optional<uint64_t> Token::getU<uint64_t>() const;
-template std::optional<int64_t> Token::getS<int64_t>() const;
-template std::optional<uint64_t> Token::getI<uint64_t>() const;
-template std::optional<uint32_t> Token::getU<uint32_t>() const;
-template std::optional<int32_t> Token::getS<int32_t>() const;
-template std::optional<uint32_t> Token::getI<uint32_t>() const;
-template std::optional<uint16_t> Token::getU<uint16_t>() const;
-template std::optional<int16_t> Token::getS<int16_t>() const;
-template std::optional<uint16_t> Token::getI<uint16_t>() const;
-template std::optional<uint8_t> Token::getU<uint8_t>() const;
-template std::optional<int8_t> Token::getS<int8_t>() const;
-template std::optional<uint8_t> Token::getI<uint8_t>() const;
-
-std::optional<double> Token::getF64() const {
- constexpr int signif = 52;
- constexpr uint64_t payloadMask = (1ull << signif) - 1;
- constexpr uint64_t nanDefault = 1ull << (signif - 1);
- if (auto* tok = std::get_if<FloatTok>(&data)) {
- double d = tok->d;
- if (std::isnan(d)) {
- // Inject payload.
- uint64_t payload = tok->nanPayload ? *tok->nanPayload : nanDefault;
- if (payload == 0 || payload > payloadMask) {
- // TODO: Add error production for out-of-bounds payload.
- return {};
- }
- uint64_t bits;
- static_assert(sizeof(bits) == sizeof(d));
- memcpy(&bits, &d, sizeof(bits));
- bits = (bits & ~payloadMask) | payload;
- memcpy(&d, &bits, sizeof(bits));
- }
- return d;
- }
- if (auto* tok = std::get_if<IntTok>(&data)) {
- if (tok->sign == Neg) {
- if (tok->n == 0) {
- return -0.0;
- }
- return double(int64_t(tok->n));
- }
- return double(tok->n);
- }
- return {};
-}
-
-std::optional<float> Token::getF32() const {
- constexpr int signif = 23;
- constexpr uint32_t payloadMask = (1u << signif) - 1;
- constexpr uint64_t nanDefault = 1ull << (signif - 1);
- if (auto* tok = std::get_if<FloatTok>(&data)) {
- float f = tok->d;
- if (std::isnan(f)) {
- // Validate and inject payload.
- uint64_t payload = tok->nanPayload ? *tok->nanPayload : nanDefault;
- if (payload == 0 || payload > payloadMask) {
- // TODO: Add error production for out-of-bounds payload.
- return {};
- }
- uint32_t bits;
- static_assert(sizeof(bits) == sizeof(f));
- memcpy(&bits, &f, sizeof(bits));
- bits = (bits & ~payloadMask) | payload;
- memcpy(&f, &bits, sizeof(bits));
- }
- return f;
- }
- if (auto* tok = std::get_if<IntTok>(&data)) {
- if (tok->sign == Neg) {
- if (tok->n == 0) {
- return -0.0f;
- }
- return float(int64_t(tok->n));
- }
- return float(tok->n);
- }
- return {};
-}
-
void Lexer::skipSpace() {
while (true) {
if (auto ctx = annotation(next())) {
@@ -1020,9 +918,6 @@ void Lexer::skipSpace() {
}
bool Lexer::takeLParen() {
- if (curr) {
- return false;
- }
if (LexCtx(next()).startsWith("("sv)) {
++index;
advance();
@@ -1032,9 +927,6 @@ bool Lexer::takeLParen() {
}
bool Lexer::takeRParen() {
- if (curr) {
- return false;
- }
if (LexCtx(next()).startsWith(")"sv)) {
++index;
advance();
@@ -1044,9 +936,6 @@ bool Lexer::takeRParen() {
}
std::optional<std::string> Lexer::takeString() {
- if (curr) {
- return std::nullopt;
- }
if (auto result = str(next())) {
index += result->span.size();
advance();
@@ -1060,9 +949,6 @@ std::optional<std::string> Lexer::takeString() {
}
std::optional<Name> Lexer::takeID() {
- if (curr) {
- return std::nullopt;
- }
if (auto result = ident(next())) {
index += result->span.size();
advance();
@@ -1080,9 +966,6 @@ std::optional<Name> Lexer::takeID() {
}
std::optional<std::string_view> Lexer::takeKeyword() {
- if (curr) {
- return std::nullopt;
- }
if (auto result = keyword(next())) {
index += result->span.size();
advance();
@@ -1130,20 +1013,124 @@ std::optional<uint32_t> Lexer::takeAlign() {
return std::nullopt;
}
-void Lexer::lexToken() {
- // TODO: Ensure we're getting the longest possible match.
- Token tok;
- if (auto t = integer(next())) {
- tok = Token{t->span, IntTok{t->n, t->sign}};
- } else if (auto t = float_(next())) {
- tok = Token{t->span, FloatTok{t->nanPayload, t->d}};
- } else {
- // TODO: Do something about lexing errors.
- curr = std::nullopt;
- return;
+template<typename T> std::optional<T> Lexer::takeU() {
+ static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>);
+ if (auto result = integer(next()); result && result->isUnsigned<T>()) {
+ index += result->span.size();
+ advance();
+ return T(result->n);
+ }
+ // TODO: Add error production for unsigned overflow.
+ return std::nullopt;
+}
+
+template<typename T> std::optional<T> Lexer::takeS() {
+ static_assert(std::is_integral_v<T> && std::is_signed_v<T>);
+ if (auto result = integer(next()); result && result->isSigned<T>()) {
+ index += result->span.size();
+ advance();
+ return T(result->n);
+ }
+ return std::nullopt;
+}
+
+template<typename T> std::optional<T> Lexer::takeI() {
+ static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>);
+ if (auto result = integer(next())) {
+ if (result->isUnsigned<T>() || result->isSigned<std::make_signed_t<T>>()) {
+ index += result->span.size();
+ advance();
+ return T(result->n);
+ }
}
- index += tok.span.size();
- curr = {tok};
+ return std::nullopt;
+}
+
+template std::optional<uint64_t> Lexer::takeU<uint64_t>();
+template std::optional<int64_t> Lexer::takeS<int64_t>();
+template std::optional<uint64_t> Lexer::takeI<uint64_t>();
+template std::optional<uint32_t> Lexer::takeU<uint32_t>();
+template std::optional<int32_t> Lexer::takeS<int32_t>();
+template std::optional<uint32_t> Lexer::takeI<uint32_t>();
+template std::optional<uint16_t> Lexer::takeU<uint16_t>();
+template std::optional<int16_t> Lexer::takeS<int16_t>();
+template std::optional<uint16_t> Lexer::takeI<uint16_t>();
+template std::optional<uint8_t> Lexer::takeU<uint8_t>();
+template std::optional<int8_t> Lexer::takeS<int8_t>();
+template std::optional<uint8_t> Lexer::takeI<uint8_t>();
+
+std::optional<double> Lexer::takeF64() {
+ constexpr int signif = 52;
+ constexpr uint64_t payloadMask = (1ull << signif) - 1;
+ constexpr uint64_t nanDefault = 1ull << (signif - 1);
+ if (auto result = float_(next())) {
+ double d = result->d;
+ if (std::isnan(d)) {
+ // Inject payload.
+ uint64_t payload = result->nanPayload ? *result->nanPayload : nanDefault;
+ if (payload == 0 || payload > payloadMask) {
+ // TODO: Add error production for out-of-bounds payload.
+ return std::nullopt;
+ }
+ uint64_t bits;
+ static_assert(sizeof(bits) == sizeof(d));
+ memcpy(&bits, &d, sizeof(bits));
+ bits = (bits & ~payloadMask) | payload;
+ memcpy(&d, &bits, sizeof(bits));
+ }
+ index += result->span.size();
+ advance();
+ return d;
+ }
+ if (auto result = integer(next())) {
+ index += result->span.size();
+ advance();
+ if (result->sign == Neg) {
+ if (result->n == 0) {
+ return -0.0;
+ }
+ return double(int64_t(result->n));
+ }
+ return double(result->n);
+ }
+ return std::nullopt;
+}
+
+std::optional<float> Lexer::takeF32() {
+ constexpr int signif = 23;
+ constexpr uint32_t payloadMask = (1u << signif) - 1;
+ constexpr uint64_t nanDefault = 1ull << (signif - 1);
+ if (auto result = float_(next())) {
+ float f = result->d;
+ if (std::isnan(f)) {
+ // Validate and inject payload.
+ uint64_t payload = result->nanPayload ? *result->nanPayload : nanDefault;
+ if (payload == 0 || payload > payloadMask) {
+ // TODO: Add error production for out-of-bounds payload.
+ return std::nullopt;
+ }
+ uint32_t bits;
+ static_assert(sizeof(bits) == sizeof(f));
+ memcpy(&bits, &f, sizeof(bits));
+ bits = (bits & ~payloadMask) | payload;
+ memcpy(&f, &bits, sizeof(bits));
+ }
+ index += result->span.size();
+ advance();
+ return f;
+ }
+ if (auto result = integer(next())) {
+ index += result->span.size();
+ advance();
+ if (result->sign == Neg) {
+ if (result->n == 0) {
+ return -0.0f;
+ }
+ return float(int64_t(result->n));
+ }
+ return float(result->n);
+ }
+ return std::nullopt;
}
TextPos Lexer::position(const char* c) const {
@@ -1164,52 +1151,8 @@ bool TextPos::operator==(const TextPos& other) const {
return line == other.line && col == other.col;
}
-bool IntTok::operator==(const IntTok& other) const {
- return n == other.n && sign == other.sign;
-}
-
-bool FloatTok::operator==(const FloatTok& other) const {
- return std::signbit(d) == std::signbit(other.d) &&
- (d == other.d || (std::isnan(d) && std::isnan(other.d) &&
- nanPayload == other.nanPayload));
-}
-
-bool Token::operator==(const Token& other) const {
- return span == other.span &&
- std::visit(
- [](auto& t1, auto& t2) {
- if constexpr (std::is_same_v<decltype(t1), decltype(t2)>) {
- return t1 == t2;
- } else {
- return false;
- }
- },
- data,
- other.data);
-}
-
std::ostream& operator<<(std::ostream& os, const TextPos& pos) {
return os << pos.line << ":" << pos.col;
}
-std::ostream& operator<<(std::ostream& os, const IntTok& tok) {
- return os << (tok.sign == Pos ? "+" : tok.sign == Neg ? "-" : "") << tok.n;
-}
-
-std::ostream& operator<<(std::ostream& os, const FloatTok& tok) {
- if (std::isnan(tok.d)) {
- os << (std::signbit(tok.d) ? "+" : "-");
- if (tok.nanPayload) {
- return os << "nan:0x" << std::hex << *tok.nanPayload << std::dec;
- }
- return os << "nan";
- }
- return os << tok.d;
-}
-
-std::ostream& operator<<(std::ostream& os, const Token& tok) {
- std::visit([&](const auto& t) { os << t; }, tok.data);
- return os << " \"" << tok.span << "\"";
-}
-
} // namespace wasm::WATParser