From 6c07a328e5ce0e1ac187a55d07faf6be642774a5 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Mon, 16 Sep 2024 15:42:28 -0700 Subject: Require string-style identifiers to be UTF-8 (#6941) In the WebAssembly text format, strings can generally be arbitrary bytes, but identifiers must be valid UTF-8. Check for UTF-8 validity when parsing string-style identifiers in the lexer. Update StringLowering to generate valid UTF-8 global names even for strings that may not be valid UTF-8 and test that text round tripping works correctly after StringLowering. Fixes #6937. --- src/parser/lexer.cpp | 10 ++++++++++ src/passes/StringLowering.cpp | 7 +++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/parser/lexer.cpp b/src/parser/lexer.cpp index bb6428e87..44aecdc2b 100644 --- a/src/parser/lexer.cpp +++ b/src/parser/lexer.cpp @@ -280,6 +280,13 @@ struct LexStrResult : LexResult { // Allocate a string only if there are escape sequences, otherwise just use // the original string_view. std::optional<std::string> str; + + std::string_view getStr() { + if (str) { + return *str; + } + return span; + } }; struct LexStrCtx : LexCtx { @@ -860,6 +867,9 @@ std::optional<LexIdResult> ident(std::string_view in) { return {}; } if (auto s = str(ctx.next())) { + if (!String::isUTF8(s->getStr())) { + return {}; + } ctx.isStr = true; ctx.str = s->str; ctx.take(*s); diff --git a/src/passes/StringLowering.cpp b/src/passes/StringLowering.cpp index 349ba8cd0..081db6068 100644 --- a/src/passes/StringLowering.cpp +++ b/src/passes/StringLowering.cpp @@ -153,9 +153,12 @@ struct StringGathering : public Pass { [[maybe_unused]] bool valid = String::convertWTF16ToWTF8(wtf8, string.str); assert(valid); - // TODO: Use wtf8.view() once we have C++20. + // Then escape it because identifiers must be valid UTF-8. + // TODO: Use wtf8.view() and escaped.view() once we have C++20. 
+ std::stringstream escaped; + String::printEscaped(escaped, wtf8.str()); auto name = Names::getValidGlobalName( - *module, std::string("string.const_") + std::string(wtf8.str())); + *module, std::string("string.const_") + std::string(escaped.str())); globalName = name; newNames.insert(name); auto* stringConst = builder.makeStringConst(string); -- cgit v1.2.3