summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Thomas Lively <tlively@google.com> 2024-09-16 15:42:28 -0700
committer: GitHub <noreply@github.com> 2024-09-16 22:42:28 +0000
commit: 6c07a328e5ce0e1ac187a55d07faf6be642774a5 (patch)
tree: 45e6adaa112431059e52877c2b07f487da6670ae /src
parent: ed19e3f699ddb72d59f227a9f20846c9ce79e2c6 (diff)
download: binaryen-6c07a328e5ce0e1ac187a55d07faf6be642774a5.tar.gz
binaryen-6c07a328e5ce0e1ac187a55d07faf6be642774a5.tar.bz2
binaryen-6c07a328e5ce0e1ac187a55d07faf6be642774a5.zip
Require string-style identifiers to be UTF-8 (#6941)
In the WebAssembly text format, strings can generally be arbitrary bytes, but identifiers must be valid UTF-8. Check for UTF-8 validity when parsing string-style identifiers in the lexer. Update StringLowering to generate valid UTF-8 global names even for strings that may not be valid UTF-8, and test that text round-tripping works correctly after StringLowering. Fixes #6937.
Diffstat (limited to 'src')
-rw-r--r-- src/parser/lexer.cpp 10
-rw-r--r-- src/passes/StringLowering.cpp 7
2 files changed, 15 insertions, 2 deletions
diff --git a/src/parser/lexer.cpp b/src/parser/lexer.cpp
index bb6428e87..44aecdc2b 100644
--- a/src/parser/lexer.cpp
+++ b/src/parser/lexer.cpp
@@ -280,6 +280,13 @@ struct LexStrResult : LexResult {
// Allocate a string only if there are escape sequences, otherwise just use
// the original string_view.
std::optional<std::string> str;
+
+ std::string_view getStr() {
+ if (str) {
+ return *str;
+ }
+ return span;
+ }
};
struct LexStrCtx : LexCtx {
@@ -860,6 +867,9 @@ std::optional<LexIdResult> ident(std::string_view in) {
return {};
}
if (auto s = str(ctx.next())) {
+ if (!String::isUTF8(s->getStr())) {
+ return {};
+ }
ctx.isStr = true;
ctx.str = s->str;
ctx.take(*s);
diff --git a/src/passes/StringLowering.cpp b/src/passes/StringLowering.cpp
index 349ba8cd0..081db6068 100644
--- a/src/passes/StringLowering.cpp
+++ b/src/passes/StringLowering.cpp
@@ -153,9 +153,12 @@ struct StringGathering : public Pass {
[[maybe_unused]] bool valid =
String::convertWTF16ToWTF8(wtf8, string.str);
assert(valid);
- // TODO: Use wtf8.view() once we have C++20.
+ // Then escape it because identifiers must be valid UTF-8.
+ // TODO: Use wtf8.view() and escaped.view() once we have C++20.
+ std::stringstream escaped;
+ String::printEscaped(escaped, wtf8.str());
auto name = Names::getValidGlobalName(
- *module, std::string("string.const_") + std::string(wtf8.str()));
+ *module, std::string("string.const_") + std::string(escaped.str()));
globalName = name;
newNames.insert(name);
auto* stringConst = builder.makeStringConst(string);