diff options
author | Thomas Lively <tlively@google.com> | 2024-09-16 15:42:28 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-09-16 22:42:28 +0000 |
commit | 6c07a328e5ce0e1ac187a55d07faf6be642774a5 (patch) | |
tree | 45e6adaa112431059e52877c2b07f487da6670ae /src | |
parent | ed19e3f699ddb72d59f227a9f20846c9ce79e2c6 (diff) | |
download | binaryen-6c07a328e5ce0e1ac187a55d07faf6be642774a5.tar.gz binaryen-6c07a328e5ce0e1ac187a55d07faf6be642774a5.tar.bz2 binaryen-6c07a328e5ce0e1ac187a55d07faf6be642774a5.zip |
Require string-style identifiers to be UTF-8 (#6941)
In the WebAssembly text format, strings can generally be arbitrary
bytes, but identifiers must be valid UTF-8. Check for UTF-8 validity
when parsing string-style identifiers in the lexer.
Update StringLowering to generate valid UTF-8 global names even for
strings that may not be valid UTF-8 and test that text round tripping
works correctly after StringLowering.
Fixes #6937.
Diffstat (limited to 'src')
-rw-r--r-- | src/parser/lexer.cpp | 10 | ||||
-rw-r--r-- | src/passes/StringLowering.cpp | 7 |
2 files changed, 15 insertions, 2 deletions
```diff
diff --git a/src/parser/lexer.cpp b/src/parser/lexer.cpp
index bb6428e87..44aecdc2b 100644
--- a/src/parser/lexer.cpp
+++ b/src/parser/lexer.cpp
@@ -280,6 +280,13 @@ struct LexStrResult : LexResult {
   // Allocate a string only if there are escape sequences, otherwise just use
   // the original string_view.
   std::optional<std::string> str;
+
+  std::string_view getStr() {
+    if (str) {
+      return *str;
+    }
+    return span;
+  }
 };
 
 struct LexStrCtx : LexCtx {
@@ -860,6 +867,9 @@ std::optional<LexIdResult> ident(std::string_view in) {
     return {};
   }
   if (auto s = str(ctx.next())) {
+    if (!String::isUTF8(s->getStr())) {
+      return {};
+    }
     ctx.isStr = true;
     ctx.str = s->str;
     ctx.take(*s);
diff --git a/src/passes/StringLowering.cpp b/src/passes/StringLowering.cpp
index 349ba8cd0..081db6068 100644
--- a/src/passes/StringLowering.cpp
+++ b/src/passes/StringLowering.cpp
@@ -153,9 +153,12 @@ struct StringGathering : public Pass {
       [[maybe_unused]] bool valid =
         String::convertWTF16ToWTF8(wtf8, string.str);
       assert(valid);
-      // TODO: Use wtf8.view() once we have C++20.
+      // Then escape it because identifiers must be valid UTF-8.
+      // TODO: Use wtf8.view() and escaped.view() once we have C++20.
+      std::stringstream escaped;
+      String::printEscaped(escaped, wtf8.str());
       auto name = Names::getValidGlobalName(
-        *module, std::string("string.const_") + std::string(wtf8.str()));
+        *module, std::string("string.const_") + std::string(escaped.str()));
       globalName = name;
       newNames.insert(name);
       auto* stringConst = builder.makeStringConst(string);
```