3 files changed, 907 insertions, 0 deletions
diff --git a/src/wasm/wat-parser-internal.h b/src/wasm/wat-parser-internal.h
new file mode 100644
index 000000000..ede89de66
--- /dev/null
+++ b/src/wasm/wat-parser-internal.h
@@ -0,0 +1,557 @@
+/*
+ * Copyright 2022 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Usage note
+// ----------
+//
+// This parser is a work in progress and this file should not yet be included
+// anywhere except for in its own tests. Once the parser is usable, we will add
+// wat-parser.h to declare the public parsing API and wat-parser.cpp to
+// implement the public parsing functions in terms of the private API in this
+// header. The private API will stay in this header rather than moving to
+// wat-parser.cpp so that we can continue to unit test it.
+
+#include <cassert>
+#include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <optional>
+#include <string_view>
+#include <type_traits>
+#include <variant>
+
+using namespace std::string_view_literals;
+
+namespace wasm::WATParser {
+
+namespace {
+
+// ================
+// Lexical Analysis
+// ================
+
+// The result of lexing a token fragment. `span` is the part of the input that
+// the fragment covers; it is a view and does not own the characters.
+struct LexResult {
+  std::string_view span;
+};
+
+// Lexing context that accumulates lexed input to produce a token fragment. It
+// views its input without owning it and tracks the length of the prefix that
+// has been consumed so far.
+struct LexCtx {
+private:
+  // The input we are lexing.
+  std::string_view input;
+
+  // How much of the input we have already lexed.
+  size_t lexedSize = 0;
+
+public:
+  explicit LexCtx(std::string_view in) : input(in) {}
+
+  // Return the fragment that has been lexed so far, or nothing if no input
+  // has been consumed.
+  std::optional<LexResult> lexed() const {
+    if (lexedSize > 0) {
+      return {LexResult{input.substr(0, lexedSize)}};
+    }
+    return {};
+  }
+
+  // The next input that has not already been lexed.
+  std::string_view next() const { return input.substr(lexedSize); }
+
+  // The size of the unlexed input.
+  size_t size() const { return input.size() - lexedSize; }
+
+  // Whether there is no more input.
+  bool empty() const { return size() == 0; }
+
+  // Tokens must be separated by spaces or parentheses.
+  bool canFinish() const;
+
+  // Whether the unlexed input starts with prefix `sv`. An empty prefix always
+  // matches.
+  bool startsWith(std::string_view sv) const {
+    return next().substr(0, sv.size()) == sv;
+  }
+
+  // Consume the next `n` characters. Callers are expected to take no more
+  // than `size()` characters.
+  void take(size_t n) { lexedSize += n; }
+
+  // Consume an additional lexed fragment, i.e. as many characters as the
+  // fragment's span is long.
+  void take(const LexResult& res) { lexedSize += res.span.size(); }
+
+  // Consume the prefix and return true if possible. Nothing is consumed when
+  // the unlexed input does not start with `sv`.
+  bool takePrefix(std::string_view sv) {
+    if (startsWith(sv)) {
+      take(sv.size());
+      return true;
+    }
+    return false;
+  }
+
+  // Consume the rest of the input.
+  void takeAll() { lexedSize = input.size(); }
+};
+
+// Whether an integer was written with an explicit sign. `Signed` records that
+// a leading '+' or '-' was present; it does not say whether the value is
+// negative.
+enum Signedness { Unsigned, Signed };
+
+// The result of lexing an integer token fragment.
+struct LexIntResult : LexResult {
+  // The lexed value. For text with a '-' sign this is the magnitude negated
+  // modulo 2^64.
+  uint64_t n;
+  // Whether the lexed text carried an explicit sign.
+  Signedness signedness;
+};
+
+// Lexing context that accumulates lexed input to produce an integer token
+// fragment. In addition to the consumed prefix it tracks the magnitude of the
+// digits seen so far, whether an explicit sign was seen, and whether the
+// magnitude has overflowed 64 bits.
+struct LexIntCtx : LexCtx {
+  using LexCtx::take;
+
+private:
+  uint64_t n = 0;
+  Signedness signedness = Unsigned;
+  bool negative = false;
+  bool overflow = false;
+
+  // The value of decimal digit `c`, or nothing if `c` is not a decimal digit.
+  std::optional<int> getDigit(char c) {
+    if ('0' <= c && c <= '9') {
+      return {c - '0'};
+    }
+    return std::nullopt;
+  }
+
+  // The value of hexadecimal digit `c` (either case), or nothing if `c` is not
+  // a hexadecimal digit.
+  std::optional<int> getHexDigit(char c) {
+    if ('0' <= c && c <= '9') {
+      return {c - '0'};
+    }
+    if ('A' <= c && c <= 'F') {
+      return {10 + c - 'A'};
+    }
+    if ('a' <= c && c <= 'f') {
+      return {10 + c - 'a'};
+    }
+    return std::nullopt;
+  }
+
+  // Append `digit` to the accumulated magnitude in base `base`, recording an
+  // overflow if the new magnitude does not fit in 64 bits. The bound is
+  // checked before multiplying because comparing the wrapped result against
+  // the old value misses wrap-arounds that land at or above the old value.
+  void pushDigit(uint64_t base, uint64_t digit) {
+    constexpr uint64_t max = std::numeric_limits<uint64_t>::max();
+    if (n > (max - digit) / base) {
+      overflow = true;
+    }
+    n = n * base + digit;
+  }
+
+public:
+  explicit LexIntCtx(std::string_view in) : LexCtx(in) {}
+
+  // Return the integer fragment lexed so far. Returns nothing if no input has
+  // been consumed, if the magnitude overflowed 64 bits, or if an explicitly
+  // signed value does not fit in a signed 64-bit integer.
+  std::optional<LexIntResult> lexed() {
+    // The magnitude did not fit in 64 bits.
+    if (overflow) {
+      return {};
+    }
+    auto basic = LexCtx::lexed();
+    if (!basic) {
+      return {};
+    }
+    // Check the range of signed numbers.
+    if (signedness == Signed) {
+      if (negative) {
+        if (n > (1ull << 63)) {
+          // TODO: Add error production for signed underflow.
+          return {};
+        }
+      } else {
+        if (n > (1ull << 63) - 1) {
+          // TODO: Add error production for signed overflow.
+          return {};
+        }
+      }
+    }
+    // Negation is performed on the unsigned magnitude, i.e. modulo 2^64.
+    return {LexIntResult{*basic, negative ? 0 - n : n, signedness}};
+  }
+
+  // Consume an optional '+' or '-'. Either one marks the integer as signed.
+  void takeSign() {
+    if (takePrefix("+"sv)) {
+      signedness = Signed;
+    } else if (takePrefix("-"sv)) {
+      signedness = Signed;
+      negative = true;
+    }
+  }
+
+  // Consume one decimal digit and fold it into the magnitude. Returns false,
+  // consuming nothing, if the next character is not a decimal digit.
+  bool takeDigit() {
+    if (!empty()) {
+      if (auto d = getDigit(next()[0])) {
+        take(1);
+        pushDigit(10, *d);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Consume one hexadecimal digit and fold it into the magnitude. Returns
+  // false, consuming nothing, if the next character is not a hex digit.
+  bool takeHexdigit() {
+    if (!empty()) {
+      if (auto h = getHexDigit(next()[0])) {
+        take(1);
+        pushDigit(16, *h);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Consume a separately lexed integer fragment and adopt its value as the
+  // magnitude. The sign state of this context is left unchanged.
+  void take(const LexIntResult& res) {
+    LexCtx::take(res);
+    n = res.n;
+  }
+};
+
+// Lex a single '(' at the start of `in`, if there is one.
+std::optional<LexResult> lparen(std::string_view in) {
+  LexCtx lexer(in);
+  if (lexer.startsWith("("sv)) {
+    lexer.take(1);
+  }
+  return lexer.lexed();
+}
+
+// Lex a single ')' at the start of `in`, if there is one.
+std::optional<LexResult> rparen(std::string_view in) {
+  LexCtx lexer(in);
+  if (lexer.startsWith(")"sv)) {
+    lexer.take(1);
+  }
+  return lexer.lexed();
+}
+
+// comment      ::= linecomment | blockcomment
+// linecomment  ::= ';;' linechar* ('\n' | eof)
+// linechar     ::= c:char                      (if c != '\n')
+// blockcomment ::= '(;' blockchar* ';)'
+// blockchar    ::= c:char                      (if c != ';' and c != '(')
+//                | ';'                         (if the next char is not ')')
+//                | '('                         (if the next char is not ';')
+//                | blockcomment
+//
+// The span lexed for a line comment stops before the terminating '\n'.
+std::optional<LexResult> comment(std::string_view in) {
+  LexCtx ctx(in);
+  // Both comment openers are two characters long.
+  if (ctx.size() < 2) {
+    return {};
+  }
+
+  if (ctx.takePrefix(";;"sv)) {
+    // Line comment: runs up to the next newline or to the end of the input.
+    const auto newline = ctx.next().find('\n');
+    if (newline == std::string_view::npos) {
+      ctx.takeAll();
+    } else {
+      ctx.take(newline);
+    }
+    return ctx.lexed();
+  }
+
+  if (!ctx.takePrefix("(;"sv)) {
+    // Not a comment at all.
+    return {};
+  }
+
+  // Block comment (possibly nested!). Track the nesting depth until the
+  // outermost comment has been closed.
+  for (size_t depth = 1; depth > 0;) {
+    if (ctx.size() < 2) {
+      // Neither an opener nor a closer can fit in the remaining input.
+      // TODO: Add error production for non-terminated block comment.
+      return {};
+    }
+    if (ctx.takePrefix("(;"sv)) {
+      ++depth;
+    } else if (ctx.takePrefix(";)"sv)) {
+      --depth;
+    } else {
+      ctx.take(1);
+    }
+  }
+  return ctx.lexed();
+}
+
+// Lex a single space or format character (' ', '\n', '\r' or '\t') at the
+// start of `in`, if there is one.
+std::optional<LexResult> spacechar(std::string_view in) {
+  LexCtx ctx(in);
+  if (!ctx.empty()) {
+    switch (ctx.next()[0]) {
+      case ' ':
+      case '\n':
+      case '\r':
+      case '\t':
+        ctx.take(1);
+        break;
+      default:
+        break;
+    }
+  }
+  return ctx.lexed();
+}
+
+// space  ::= (' ' | format | comment)*
+// format ::= '\t' | '\n' | '\r'
+//
+// Returns nothing when `in` does not start with any space.
+std::optional<LexResult> space(std::string_view in) {
+  LexCtx ctx(in);
+  while (!ctx.empty()) {
+    // Prefer a plain space character and fall back to a comment.
+    auto piece = spacechar(ctx.next());
+    if (!piece) {
+      piece = comment(ctx.next());
+    }
+    if (!piece) {
+      break;
+    }
+    ctx.take(*piece);
+  }
+  return ctx.lexed();
+}
+
+bool LexCtx::canFinish() const {
+  // Logically a token can end at eof, before a paren, or before space. Rather
+  // than lexing an arbitrarily long run of space, only look for a single space
+  // character or the start of a line comment. The start of a block comment is
+  // already covered by the check for a left paren.
+  if (empty()) {
+    return true;
+  }
+  const auto rest = next();
+  if (lparen(rest) || rparen(rest) || spacechar(rest)) {
+    return true;
+  }
+  return startsWith(";;"sv);
+}
+
+// num   ::= d:digit => d
+//         |  n:num '_'? d:digit => 10*n + d
+// digit ::= '0' => 0 | ... | '9' => 9
+std::optional<LexIntResult> num(std::string_view in) {
+  LexIntCtx ctx(in);
+  if (!ctx.takeDigit()) {
+    return {};
+  }
+  // Keep taking digits, each optionally preceded by a single underscore.
+  bool sawUnderscore = false;
+  do {
+    sawUnderscore = ctx.takePrefix("_"sv);
+  } while (ctx.takeDigit());
+  // An underscore must be followed by a digit.
+  if (sawUnderscore) {
+    return {};
+  }
+  return ctx.lexed();
+}
+
+// hexnum   ::= h:hexdigit => h
+//            | n:hexnum '_'? h:hexdigit => 16*n + h
+// hexdigit ::= d:digit => d
+//            | 'A' => 10 | ... | 'F' => 15
+//            | 'a' => 10 | ... | 'f' => 15
+std::optional<LexIntResult> hexnum(std::string_view in) {
+  LexIntCtx ctx(in);
+  if (!ctx.takeHexdigit()) {
+    return {};
+  }
+  // Keep taking hex digits, each optionally preceded by a single underscore.
+  bool sawUnderscore = false;
+  do {
+    sawUnderscore = ctx.takePrefix("_"sv);
+  } while (ctx.takeHexdigit());
+  // An underscore must be followed by a hex digit.
+  if (sawUnderscore) {
+    return {};
+  }
+  return ctx.lexed();
+}
+
+// uN ::= n:num         => n (if n < 2^N)
+//      | '0x' n:hexnum => n (if n < 2^N)
+// sN ::= s:sign n:num         => [s]n (if -2^(N-1) <= [s]n < 2^(N-1))
+//      | s:sign '0x' n:hexnum => [s]n (if -2^(N-1) <= [s]n < 2^(N-1))
+// sign ::= {} => + | '+' => + | '-' => -
+//
+// Note: Defer bounds and sign checking until we know what kind of integer we
+// expect.
+std::optional<LexIntResult> integer(std::string_view in) {
+  LexIntCtx ctx(in);
+  ctx.takeSign();
+  // A "0x" prefix commits us to lexing a hexadecimal number.
+  const bool isHex = ctx.takePrefix("0x"sv);
+  const auto magnitude = isHex ? hexnum(ctx.next()) : num(ctx.next());
+  if (!magnitude) {
+    // TODO: Add error production for unrecognized hexnum (when isHex).
+    return {};
+  }
+  ctx.take(*magnitude);
+  // The number must be followed by something that can end a token.
+  if (!ctx.canFinish()) {
+    return {};
+  }
+  return ctx.lexed();
+}
+
+// ======
+// Tokens
+// ======
+
+// Token for a left parenthesis. It carries no data, so all instances compare
+// equal.
+struct LParenTok {
+  friend bool operator==(const LParenTok&, const LParenTok&) { return true; }
+
+  friend std::ostream& operator<<(std::ostream& os, const LParenTok&) {
+    os << "'('";
+    return os;
+  }
+};
+
+// Token for a right parenthesis. It carries no data, so all instances compare
+// equal.
+struct RParenTok {
+  friend bool operator==(const RParenTok&, const RParenTok&) { return true; }
+
+  friend std::ostream& operator<<(std::ostream& os, const RParenTok&) {
+    os << "')'";
+    return os;
+  }
+};
+
+// Token for an integer: its 64-bit value together with whether it was written
+// with an explicit sign.
+struct IntTok {
+  uint64_t n;
+  Signedness signedness;
+
+  // Integer tokens are equal when both the value and the signedness match.
+  friend bool operator==(const IntTok& lhs, const IntTok& rhs) {
+    if (lhs.signedness != rhs.signedness) {
+      return false;
+    }
+    return lhs.n == rhs.n;
+  }
+
+  // Print the value followed by " signed" or " unsigned".
+  friend std::ostream& operator<<(std::ostream& os, const IntTok& tok) {
+    os << tok.n;
+    if (tok.signedness == Signed) {
+      os << " signed";
+    } else {
+      os << " unsigned";
+    }
+    return os;
+  }
+};
+
+// A lexed token: the text it spans plus the data specific to its kind.
+struct Token {
+  using Data = std::variant<LParenTok, RParenTok, IntTok>;
+
+  std::string_view span;
+  Data data;
+
+  // Print the token data followed by the quoted span. The attribute suppresses
+  // a clang-tidy false positive about unused functions.
+  [[maybe_unused]] friend std::ostream& operator<<(std::ostream& os,
+                                                   const Token& tok) {
+    std::visit([&os](const auto& payload) { os << payload; }, tok.data);
+    os << " \"" << tok.span << "\"";
+    return os;
+  }
+
+  // Tokens are equal when their spans have the same text and they hold equal
+  // data of the same kind.
+  [[maybe_unused]] friend bool operator==(const Token& lhs, const Token& rhs) {
+    if (lhs.span != rhs.span) {
+      return false;
+    }
+    // Variant equality compares the held alternative first and then the held
+    // values with their own operator==.
+    return lhs.data == rhs.data;
+  }
+};
+
+// A line and column in a text buffer.
+struct TextPos {
+  size_t line;
+  size_t col;
+
+  bool operator==(const TextPos& other) const {
+    if (line != other.line) {
+      return false;
+    }
+    return col == other.col;
+  }
+  bool operator!=(const TextPos& other) const {
+    return line != other.line || col != other.col;
+  }
+
+  // Print as "line:col". The attribute suppresses a clang-tidy false positive
+  // about unused functions.
+  [[maybe_unused]] friend std::ostream& operator<<(std::ostream& os,
+                                                   const TextPos& pos) {
+    os << pos.line;
+    os << ":";
+    os << pos.col;
+    return os;
+  }
+};
+
+// Lexer's purpose is twofold. First, it wraps a buffer to provide a tokenizing
+// iterator over it. Second, it implements that iterator itself. Also provides
+// utilities for locating the text position of tokens within the buffer. Text
+// positions are computed on demand rather than eagerly because they are
+// typically only needed when there is an error to report.
+//
+// The lexer views the buffer without owning it. An iterator with no current
+// token is at the end; this is currently also the state reached when the
+// input cannot be lexed.
+struct Lexer {
+  using iterator = Lexer;
+  using difference_type = std::ptrdiff_t;
+  using value_type = Token;
+  using pointer = const Token*;
+  using reference = const Token&;
+  using iterator_category = std::forward_iterator_tag;
+
+  // The text being lexed.
+  std::string_view buffer;
+  // The offset in `buffer` of the first character that has not been lexed.
+  size_t index = 0;
+  // The current token, or nothing if lexing has stopped.
+  std::optional<Token> curr;
+
+  // The end sentinel.
+  Lexer() = default;
+
+  // Lex the first token of `buffer`, skipping the space around it.
+  Lexer(std::string_view buffer) : buffer(buffer) {
+    skipSpace();
+    lexToken();
+    skipSpace();
+  }
+
+  // The part of the buffer that has not been lexed yet.
+  std::string_view next() const { return buffer.substr(index); }
+
+  // Advance past any whitespace and comments at the current position.
+  void skipSpace() {
+    if (auto ctx = space(next())) {
+      index += ctx->span.size();
+    }
+  }
+
+  // Lex one token at the current position into `curr` and advance past it. If
+  // no token can be lexed, `curr` is cleared and the position is unchanged.
+  void lexToken() {
+    // TODO: Ensure we're getting the longest possible match.
+    Token tok;
+    if (auto t = lparen(next())) {
+      tok = Token{t->span, LParenTok{}};
+    } else if (auto t = rparen(next())) {
+      tok = Token{t->span, RParenTok{}};
+    } else if (auto t = integer(next())) {
+      tok = Token{t->span, IntTok{t->n, t->signedness}};
+    } else {
+      // TODO: Do something about lexing errors.
+      curr = std::nullopt;
+      return;
+    }
+    index += tok.span.size();
+    curr = {tok};
+  }
+
+  Lexer& operator++() {
+    // Preincrement
+    lexToken();
+    skipSpace();
+    return *this;
+  }
+
+  Lexer operator++(int) {
+    // Postincrement
+    Lexer ret = *this;
+    ++(*this);
+    return ret;
+  }
+
+  // Access the current token. Must not be used on an iterator that has no
+  // current token.
+  const Token& operator*() const { return *curr; }
+  const Token* operator->() const { return &*curr; }
+
+  bool operator==(const Lexer& other) const {
+    // An iterator without a current token is at the end. It equals every
+    // other such iterator and never equals an iterator that still has a
+    // token, even if both happen to be at the same index.
+    if (!curr || !other.curr) {
+      return !curr && !other.curr;
+    }
+    // Otherwise they are equivalent when they are at the same position.
+    return index == other.index;
+  }
+
+  bool operator!=(const Lexer& other) const { return !(*this == other); }
+
+  // A copy of this lexer in its current state, so iteration starts from the
+  // current token rather than from the start of the buffer.
+  Lexer begin() const { return *this; }
+
+  Lexer end() const { return Lexer(); }
+
+  // The position of `c`, which must point into the buffer. Lines are counted
+  // from 1 and columns from 0.
+  TextPos position(const char* c) const {
+    assert(size_t(c - buffer.data()) < buffer.size());
+    TextPos pos{1, 0};
+    for (const char* p = buffer.data(); p != c; ++p) {
+      if (*p == '\n') {
+        pos.line++;
+        pos.col = 0;
+      } else {
+        pos.col++;
+      }
+    }
+    return pos;
+  }
+
+  // The position of the first character of `span`.
+  TextPos position(std::string_view span) const {
+    return position(span.data());
+  }
+
+  // The position of the first character of `tok`.
+  TextPos position(Token tok) const { return position(tok.span); }
+};
+
+} // anonymous namespace
+
+} // namespace wasm::WATParser
diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt
index c58827a21..6a58debbf 100644
--- a/test/gtest/CMakeLists.txt
+++ b/test/gtest/CMakeLists.txt
@@ -1,7 +1,9 @@
 include_directories(../../third_party/googletest/googletest/include)
+# wat-parser.cpp includes wat-parser-internal.h, which lives in src/wasm,
+# without a directory prefix.
+include_directories(../../src/wasm)
 
 set(unittest_SOURCES
   type-builder.cpp
+  wat-parser.cpp
 )
 
 binaryen_add_executable(binaryen-unittests "${unittest_SOURCES}")
diff --git a/test/gtest/wat-parser.cpp b/test/gtest/wat-parser.cpp
new file mode 100644
index 000000000..1eba25869
--- /dev/null
+++ b/test/gtest/wat-parser.cpp
@@ -0,0 +1,348 @@
+#include "gtest/gtest.h"
+
+#include "wat-parser-internal.h"
+
+using namespace wasm::WATParser;
+
+// Tokens separated by spaces, tabs, newlines and carriage returns are lexed
+// individually and reported at the right text positions.
+TEST(ParserTest, LexWhitespace) {
+  Lexer lexer(" 1\t2\n3\r4 \n\n\t 5 "sv);
+
+  Token wantFirst{"1"sv, IntTok{1, Unsigned}};
+  Token wantSecond{"2"sv, IntTok{2, Unsigned}};
+  Token wantThird{"3"sv, IntTok{3, Unsigned}};
+  Token wantFourth{"4"sv, IntTok{4, Unsigned}};
+  Token wantFifth{"5"sv, IntTok{5, Unsigned}};
+
+  // Collect the tokens, making sure we never read past the end.
+  auto cursor = lexer.begin();
+  ASSERT_NE(cursor, lexer.end());
+  Token first = *cursor++;
+  ASSERT_NE(cursor, lexer.end());
+  Token second = *cursor++;
+  ASSERT_NE(cursor, lexer.end());
+  Token third = *cursor++;
+  ASSERT_NE(cursor, lexer.end());
+  Token fourth = *cursor++;
+  ASSERT_NE(cursor, lexer.end());
+  Token fifth = *cursor++;
+  EXPECT_EQ(cursor, lexer.end());
+
+  EXPECT_EQ(first, wantFirst);
+  EXPECT_EQ(second, wantSecond);
+  EXPECT_EQ(third, wantThird);
+  EXPECT_EQ(fourth, wantFourth);
+  EXPECT_EQ(fifth, wantFifth);
+
+  // Only '\n' starts a new line; '\r' and '\t' each count as one column.
+  EXPECT_EQ(lexer.position(first), (TextPos{1, 1}));
+  EXPECT_EQ(lexer.position(second), (TextPos{1, 3}));
+  EXPECT_EQ(lexer.position(third), (TextPos{2, 0}));
+  EXPECT_EQ(lexer.position(fourth), (TextPos{2, 2}));
+  EXPECT_EQ(lexer.position(fifth), (TextPos{4, 2}));
+}
+
+// A line comment hides everything up to the next newline, including text
+// that would otherwise lex as tokens.
+TEST(ParserTest, LexLineComment) {
+  Token one{"1"sv, IntTok{1, Unsigned}};
+  Token six{"6"sv, IntTok{6, Unsigned}};
+
+  Lexer lexer("1;; whee! 2 3\t4\r5\n6"sv);
+
+  auto it = lexer.begin();
+  // Guard the first dereference as well so that a lexing failure is reported
+  // as a test failure rather than reading a token that does not exist.
+  ASSERT_NE(it, lexer.end());
+  Token t1 = *it++;
+  ASSERT_NE(it, lexer.end());
+  Token t2 = *it++;
+  EXPECT_EQ(it, lexer.end());
+
+  EXPECT_EQ(t1, one);
+  EXPECT_EQ(t2, six);
+
+  EXPECT_EQ(lexer.position(t1), (TextPos{1, 0}));
+  EXPECT_EQ(lexer.position(t2), (TextPos{2, 0}));
+}
+
+// A block comment may span lines and nest; everything up to the matching
+// outermost ";)" is skipped.
+TEST(ParserTest, LexBlockComment) {
+  Token one{"1"sv, IntTok{1, Unsigned}};
+  Token six{"6"sv, IntTok{6, Unsigned}};
+
+  Lexer lexer("1(; whoo! 2\n (; \n3\n ;) 4 (;) 5 ;) \n;)6"sv);
+
+  auto it = lexer.begin();
+  // Guard the first dereference as well so that a lexing failure is reported
+  // as a test failure rather than reading a token that does not exist.
+  ASSERT_NE(it, lexer.end());
+  Token t1 = *it++;
+  ASSERT_NE(it, lexer.end());
+  Token t2 = *it++;
+  EXPECT_EQ(it, lexer.end());
+
+  EXPECT_EQ(t1, one);
+  EXPECT_EQ(t2, six);
+
+  EXPECT_EQ(lexer.position(t1), (TextPos{1, 0}));
+  EXPECT_EQ(lexer.position(t2), (TextPos{5, 2}));
+}
+
+// Adjacent parentheses are lexed as separate tokens without needing any
+// space between them.
+TEST(ParserTest, LexParens) {
+  Lexer lexer("(())"sv);
+
+  Token open{"("sv, LParenTok{}};
+  Token close{")"sv, RParenTok{}};
+
+  auto cursor = lexer.begin();
+  ASSERT_NE(cursor, lexer.end());
+  Token first = *cursor++;
+  ASSERT_NE(cursor, lexer.end());
+  Token second = *cursor++;
+  ASSERT_NE(cursor, lexer.end());
+  Token third = *cursor++;
+  ASSERT_NE(cursor, lexer.end());
+  Token fourth = *cursor++;
+  EXPECT_EQ(cursor, lexer.end());
+
+  EXPECT_EQ(first, open);
+  EXPECT_EQ(second, open);
+  EXPECT_EQ(third, close);
+  EXPECT_EQ(fourth, close);
+}
+
+// Decimal integers: signs, leading zeros, underscore separators, malformed
+// input, and the limits of the 64-bit unsigned and signed ranges.
+TEST(ParserTest, LexInt) {
+  // Zero, with and without a sign.
+  {
+    Lexer lexer("0"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"0"sv, IntTok{0, Unsigned}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("+0"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"+0"sv, IntTok{0, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("-0"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"-0"sv, IntTok{0, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  // One, with and without a sign.
+  {
+    Lexer lexer("1"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"1"sv, IntTok{1, Unsigned}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("+1"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"+1"sv, IntTok{1, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("-1"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"-1"sv, IntTok{-1ull, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  // Leading zeros do not change the value.
+  {
+    Lexer lexer("0010"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"0010"sv, IntTok{10, Unsigned}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("+0010"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"+0010"sv, IntTok{10, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("-0010"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"-0010"sv, IntTok{-10ull, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  // Multiple digits.
+  {
+    Lexer lexer("9999"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"9999"sv, IntTok{9999, Unsigned}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("+9999"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"+9999"sv, IntTok{9999, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("-9999"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"-9999"sv, IntTok{-9999ull, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  // Single underscores are allowed between digits only.
+  {
+    Lexer lexer("12_34"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"12_34"sv, IntTok{1234, Unsigned}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("1_2_3_4"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"1_2_3_4"sv, IntTok{1234, Unsigned}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("_1234"sv);
+    EXPECT_EQ(lexer, lexer.end());
+  }
+  {
+    Lexer lexer("1234_"sv);
+    EXPECT_EQ(lexer, lexer.end());
+  }
+  {
+    Lexer lexer("12__34"sv);
+    EXPECT_EQ(lexer, lexer.end());
+  }
+  // Hex digits are not valid without the 0x prefix.
+  {
+    Lexer lexer("12cd56"sv);
+    EXPECT_EQ(lexer, lexer.end());
+  }
+  // Limits of the 64-bit unsigned range.
+  {
+    Lexer lexer("18446744073709551615"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"18446744073709551615"sv, IntTok{-1ull, Unsigned}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    // 64-bit unsigned overflow!
+    Lexer lexer("18446744073709551616"sv);
+    EXPECT_EQ(lexer, lexer.end());
+  }
+  // Limits of the 64-bit signed range.
+  {
+    Lexer lexer("+9223372036854775807"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"+9223372036854775807"sv, IntTok{~(1ull << 63), Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    // 64-bit signed overflow!
+    Lexer lexer("+9223372036854775808"sv);
+    EXPECT_EQ(lexer, lexer.end());
+  }
+  {
+    Lexer lexer("-9223372036854775808"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"-9223372036854775808"sv, IntTok{1ull << 63, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    // 64-bit signed underflow!
+    Lexer lexer("-9223372036854775809"sv);
+    EXPECT_EQ(lexer, lexer.end());
+  }
+}
+
+// Hexadecimal integers: signs, leading zeros, digit case, underscore
+// separators, malformed input, and the limits of the 64-bit unsigned and
+// signed ranges.
+TEST(ParserTest, LexHexInt) {
+  // Zero, with and without a sign.
+  {
+    Lexer lexer("0x0"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"0x0"sv, IntTok{0, Unsigned}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("+0x0"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"+0x0"sv, IntTok{0, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("-0x0"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"-0x0"sv, IntTok{0, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  // One, with and without a sign.
+  {
+    Lexer lexer("0x1"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"0x1"sv, IntTok{1, Unsigned}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("+0x1"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"+0x1"sv, IntTok{1, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("-0x1"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"-0x1"sv, IntTok{-1ull, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  // Leading zeros do not change the value.
+  {
+    Lexer lexer("0x0010"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"0x0010"sv, IntTok{16, Unsigned}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("+0x0010"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"+0x0010"sv, IntTok{16, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("-0x0010"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"-0x0010"sv, IntTok{-16ull, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  // Hex digits are accepted in lower, upper and mixed case.
+  {
+    Lexer lexer("0xabcdef"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"0xabcdef"sv, IntTok{0xabcdef, Unsigned}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("+0xABCDEF"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"+0xABCDEF"sv, IntTok{0xabcdef, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("-0xAbCdEf"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"-0xAbCdEf"sv, IntTok{-0xabcdefull, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  // Single underscores are allowed between digits only.
+  {
+    Lexer lexer("0x12_34"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"0x12_34"sv, IntTok{0x1234, Unsigned}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("0x1_2_3_4"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"0x1_2_3_4"sv, IntTok{0x1234, Unsigned}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    Lexer lexer("_0x1234"sv);
+    EXPECT_EQ(lexer, lexer.end());
+  }
+  {
+    Lexer lexer("0x_1234"sv);
+    EXPECT_EQ(lexer, lexer.end());
+  }
+  {
+    Lexer lexer("0x1234_"sv);
+    EXPECT_EQ(lexer, lexer.end());
+  }
+  {
+    Lexer lexer("0x12__34"sv);
+    EXPECT_EQ(lexer, lexer.end());
+  }
+  // Non-hex characters and a repeated prefix are rejected.
+  {
+    Lexer lexer("0xg"sv);
+    EXPECT_EQ(lexer, lexer.end());
+  }
+  {
+    Lexer lexer("0x120x34"sv);
+    EXPECT_EQ(lexer, lexer.end());
+  }
+  // Limits of the 64-bit unsigned range.
+  {
+    Lexer lexer("0xffffffffffffffff"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"0xffffffffffffffff"sv, IntTok{-1ull, Unsigned}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    // 64-bit unsigned overflow!
+    Lexer lexer("0x10000000000000000"sv);
+    EXPECT_EQ(lexer, lexer.end());
+  }
+  // Limits of the 64-bit signed range.
+  {
+    Lexer lexer("+0x7fffffffffffffff"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"+0x7fffffffffffffff"sv, IntTok{~(1ull << 63), Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    // 64-bit signed overflow!
+    Lexer lexer("+0x8000000000000000"sv);
+    EXPECT_EQ(lexer, lexer.end());
+  }
+  {
+    Lexer lexer("-0x8000000000000000"sv);
+    ASSERT_NE(lexer, lexer.end());
+    Token expected{"-0x8000000000000000"sv, IntTok{1ull << 63, Signed}};
+    EXPECT_EQ(*lexer, expected);
+  }
+  {
+    // 64-bit signed underflow!
+    Lexer lexer("-0x8000000000000001"sv);
+    EXPECT_EQ(lexer, lexer.end());
+  }
+}