diff options
Diffstat (limited to 'src/wast-lexer.cc')
-rw-r--r-- | src/wast-lexer.cc | 68 |
1 files changed, 35 insertions, 33 deletions
diff --git a/src/wast-lexer.cc b/src/wast-lexer.cc index 8586d925..7ae2b4cd 100644 --- a/src/wast-lexer.cc +++ b/src/wast-lexer.cc @@ -48,18 +48,20 @@ YY_USER_ACTION; \ wast_parser_error(loc, this, parser, __VA_ARGS__) -#define BEGIN(c) \ - do { \ - cond = c; \ - } while (0) -#define FILL(n) \ - do { \ - if (WABT_FAILED(Fill(loc, parser, n))) { \ - RETURN(EOF); \ - continue; \ - } \ +#define BEGIN(c) cond = (c) +#define FILL(n) \ + do { \ + if (WABT_FAILED(Fill(loc, parser, (n)))) { \ + RETURN(EOF); \ + } \ } while (0) +#define MAYBE_MALFORMED_UTF8(desc) \ + if (!(eof_ && limit_ - cursor_ <= YYMAXFILL)) { \ + ERROR("malformed utf-8%s", desc); \ + } \ + continue + #define yytext (token_) #define yyleng (cursor_ - token_) @@ -174,7 +176,8 @@ Result WastLexer::Fill(Location* loc, WastParser* parser, size_t need) { // http://re2c.org/examples/example_03.html. if (limit_ < buffer_ + buffer_size_ - YYMAXFILL) { eof_ = true; - memset(limit_, 0, YYMAXFILL); + // Fill with 0xff, since that is an invalid utf-8 byte. + memset(limit_, 0xff, YYMAXFILL); limit_ += YYMAXFILL; } return Result::Ok; @@ -203,30 +206,29 @@ int WastLexer::GetToken(Token* lval, Location* loc, WastParser* parser) { re2c:define:YYGETCONDITION:naked = 1; re2c:define:YYSETCONDITION = "BEGIN"; - space = [ \t]; digit = [0-9]; - digits = [0-9]+; hexdigit = [0-9a-fA-F]; + num = digit+; + hexnum = hexdigit+; letter = [a-zA-Z]; - symbol = [+\-*\/\\\^~=<>!?@#$%&|:`.]; - tick = "'"; - escape = [nt\\'"]; - character = [^"\\\x00-\x1f\x7f] | "\\" escape | "\\" hexdigit hexdigit; + symbol = [+\-*\/\\\^~=<>!?@#$%&|:`.']; + character = [^"\\\x00-\x1f] + | "\\" [nrt\\'"] + | "\\" hexdigit hexdigit; sign = [+-]; - num = digit+; - hexnum = "0x" hexdigit+; - nat = num | hexnum; + nat = num | "0x" hexnum; int = sign nat; - float0 = sign? num "." digit*; - float1 = sign? num ("." digit*)? [eE] sign? num; - hexfloat = sign? "0x" hexdigit+ "."? hexdigit* "p" sign? digit+; - infinity = sign? ("inf" | "infinity"); - nan = sign? "nan" | sign? "nan:0x" hexdigit+; - float = float0 | float1; + hexfloat = sign? "0x" hexnum ("." hexdigit*)? "p" sign? num; + infinity = sign? "inf"; + nan = sign? "nan" + | sign? "nan:0x" hexnum; + float = sign? num "." digit* + | sign? num ("." digit*)? [eE] sign? num; text = '"' character* '"'; - atom = (letter | digit | "_" | tick | symbol)+; - name = "$" atom; - EOF = "\x00"; + name = "$" (letter | digit | "_" | symbol)+; + + // Should be ([\x21-\x7e] \ [()"; ])+ , but re2c doesn't like this... + reserved = [\x21\x23-\x27\x2a-\x3a\x3c-\x7e]+; <i> "(" { RETURN(LPAR); } <i> ")" { RETURN(RPAR); } @@ -246,9 +248,9 @@ int WastLexer::GetToken(Token* lval, Location* loc, WastParser* parser) { static_cast<int>(yyleng), yytext); continue; } <BAD_TEXT> '"' => i { TEXT; RETURN(TEXT); } - <BAD_TEXT> EOF { ERROR("unexpected EOF"); RETURN(EOF); } <BAD_TEXT> [^] { ERROR("illegal character in string"); continue; } + <BAD_TEXT> * { MAYBE_MALFORMED_UTF8(" in string"); } <i> "i32" { TYPE(I32); RETURN(VALUE_TYPE); } <i> "i64" { TYPE(I64); RETURN(VALUE_TYPE); } <i> "f32" { TYPE(F32); RETURN(VALUE_TYPE); } @@ -469,15 +471,15 @@ int WastLexer::GetToken(Token* lval, Location* loc, WastParser* parser) { BEGIN(YYCOND_INIT); continue; } <BLOCK_COMMENT> "\n" { NEWLINE; continue; } - <BLOCK_COMMENT> EOF { ERROR("unexpected EOF"); RETURN(EOF); } <BLOCK_COMMENT> [^] { continue; } + <BLOCK_COMMENT> * { MAYBE_MALFORMED_UTF8(" in block comment"); } <i> "\n" { NEWLINE; continue; } <i> [ \t\r]+ { continue; } - <i> atom { ERROR("unexpected token \"%.*s\"", + <i> reserved { ERROR("unexpected token \"%.*s\"", static_cast<int>(yyleng), yytext); continue; } - <*> EOF { RETURN(EOF); } <*> [^] { ERROR("unexpected char"); continue; } + <*> * { MAYBE_MALFORMED_UTF8(""); } */ } } |