summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/binaryen-c.cpp8
-rw-r--r--src/literal.h2
-rw-r--r--src/parser/contexts.h9
-rw-r--r--src/parser/lexer.cpp21
-rw-r--r--src/passes/Print.cpp8
-rw-r--r--src/passes/StringLowering.cpp8
-rw-r--r--src/support/json.cpp7
-rw-r--r--src/support/string.cpp231
-rw-r--r--src/support/string.h19
-rw-r--r--src/tools/fuzzing/fuzzing.cpp10
-rw-r--r--src/wasm-builder.h13
-rw-r--r--src/wasm-interpreter.h53
-rw-r--r--src/wasm/literal.cpp23
-rw-r--r--src/wasm/wasm-binary.cpp16
-rw-r--r--src/wasm/wasm-s-parser.cpp8
15 files changed, 294 insertions, 142 deletions
diff --git a/src/binaryen-c.cpp b/src/binaryen-c.cpp
index 29e0597d7..402dce553 100644
--- a/src/binaryen-c.cpp
+++ b/src/binaryen-c.cpp
@@ -26,6 +26,7 @@
#include "pass.h"
#include "shell-interface.h"
#include "support/colors.h"
+#include "support/string.h"
#include "wasm-binary.h"
#include "wasm-builder.h"
#include "wasm-interpreter.h"
@@ -1895,8 +1896,13 @@ BinaryenExpressionRef BinaryenStringNew(BinaryenModuleRef module,
}
BinaryenExpressionRef BinaryenStringConst(BinaryenModuleRef module,
const char* name) {
+ // Re-encode from WTF-8 to WTF-16.
+ std::stringstream wtf16;
+ [[maybe_unused]] bool valid = String::convertWTF8ToWTF16(wtf16, name);
+ assert(valid);
+ // TODO: Use wtf16.view() once we have C++20.
return static_cast<Expression*>(
- Builder(*(Module*)module).makeStringConst(name));
+ Builder(*(Module*)module).makeStringConst(wtf16.str()));
}
BinaryenExpressionRef BinaryenStringMeasure(BinaryenModuleRef module,
BinaryenOp op,
diff --git a/src/literal.h b/src/literal.h
index 971cc8b3e..a4017f6ec 100644
--- a/src/literal.h
+++ b/src/literal.h
@@ -85,7 +85,7 @@ public:
assert(type.isSignature());
}
explicit Literal(std::shared_ptr<GCData> gcData, HeapType type);
- explicit Literal(std::string string);
+ explicit Literal(std::string_view string);
Literal(const Literal& other);
Literal& operator=(const Literal& other);
~Literal();
diff --git a/src/parser/contexts.h b/src/parser/contexts.h
index 8b59ab40b..0979461a0 100644
--- a/src/parser/contexts.h
+++ b/src/parser/contexts.h
@@ -22,6 +22,7 @@
#include "lexer.h"
#include "support/name.h"
#include "support/result.h"
+#include "support/string.h"
#include "wasm-builder.h"
#include "wasm-ir-builder.h"
#include "wasm.h"
@@ -2491,7 +2492,13 @@ struct ParseDefsCtx : TypeParserCtx<ParseDefsCtx> {
Result<> makeStringConst(Index pos,
const std::vector<Annotation>& annotations,
std::string_view str) {
- return withLoc(pos, irBuilder.makeStringConst(Name(str)));
+ // Re-encode from WTF-8 to WTF-16.
+ std::stringstream wtf16;
+ if (!String::convertWTF8ToWTF16(wtf16, str)) {
+ return in.err(pos, "invalid string constant");
+ }
+ // TODO: Use wtf16.view() once we have C++20.
+ return withLoc(pos, irBuilder.makeStringConst(wtf16.str()));
}
Result<> makeStringMeasure(Index pos,
diff --git a/src/parser/lexer.cpp b/src/parser/lexer.cpp
index 8c7542dd7..48da163e1 100644
--- a/src/parser/lexer.cpp
+++ b/src/parser/lexer.cpp
@@ -23,6 +23,7 @@
#include <variant>
#include "lexer.h"
+#include "support/string.h"
using namespace std::string_view_literals;
@@ -308,25 +309,7 @@ public:
if ((0xd800 <= u && u < 0xe000) || 0x110000 <= u) {
return false;
}
- if (u < 0x80) {
- // 0xxxxxxx
- *escapeBuilder << uint8_t(u);
- } else if (u < 0x800) {
- // 110xxxxx 10xxxxxx
- *escapeBuilder << uint8_t(0b11000000 | ((u >> 6) & 0b00011111));
- *escapeBuilder << uint8_t(0b10000000 | ((u >> 0) & 0b00111111));
- } else if (u < 0x10000) {
- // 1110xxxx 10xxxxxx 10xxxxxx
- *escapeBuilder << uint8_t(0b11100000 | ((u >> 12) & 0b00001111));
- *escapeBuilder << uint8_t(0b10000000 | ((u >> 6) & 0b00111111));
- *escapeBuilder << uint8_t(0b10000000 | ((u >> 0) & 0b00111111));
- } else {
- // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- *escapeBuilder << uint8_t(0b11110000 | ((u >> 18) & 0b00000111));
- *escapeBuilder << uint8_t(0b10000000 | ((u >> 12) & 0b00111111));
- *escapeBuilder << uint8_t(0b10000000 | ((u >> 6) & 0b00111111));
- *escapeBuilder << uint8_t(0b10000000 | ((u >> 0) & 0b00111111));
- }
+ String::writeWTF8CodePoint(*escapeBuilder, u);
return true;
}
};
diff --git a/src/passes/Print.cpp b/src/passes/Print.cpp
index 643f1cc3f..80047a281 100644
--- a/src/passes/Print.cpp
+++ b/src/passes/Print.cpp
@@ -2232,7 +2232,13 @@ struct PrintExpressionContents
}
void visitStringConst(StringConst* curr) {
printMedium(o, "string.const ");
- String::printEscaped(o, curr->string.str);
+ // Re-encode from WTF-16 to WTF-8.
+ std::stringstream wtf8;
+ [[maybe_unused]] bool valid =
+ String::convertWTF16ToWTF8(wtf8, curr->string.str);
+ assert(valid);
+ // TODO: Use wtf8.view() once we have C++20.
+ String::printEscaped(o, wtf8.str());
}
void visitStringMeasure(StringMeasure* curr) {
switch (curr->op) {
diff --git a/src/passes/StringLowering.cpp b/src/passes/StringLowering.cpp
index e0d3fbad0..322f0deb2 100644
--- a/src/passes/StringLowering.cpp
+++ b/src/passes/StringLowering.cpp
@@ -147,8 +147,14 @@ struct StringGathering : public Pass {
}
auto& string = strings[i];
+ // Re-encode from WTF-16 to WTF-8 to make the name easier to read.
+ std::stringstream wtf8;
+ [[maybe_unused]] bool valid =
+ String::convertWTF16ToWTF8(wtf8, string.str);
+ assert(valid);
+ // TODO: Use wtf8.view() once we have C++20.
auto name = Names::getValidGlobalName(
- *module, std::string("string.const_") + std::string(string.str));
+ *module, std::string("string.const_") + std::string(wtf8.str()));
globalName = name;
newNames.insert(name);
auto* stringConst = builder.makeStringConst(string);
diff --git a/src/support/json.cpp b/src/support/json.cpp
index ab55cc75f..dd94719d4 100644
--- a/src/support/json.cpp
+++ b/src/support/json.cpp
@@ -21,7 +21,12 @@ namespace json {
void Value::stringify(std::ostream& os, bool pretty) {
if (isString()) {
- wasm::String::printEscapedJSON(os, getCString());
+ std::stringstream wtf16;
+ [[maybe_unused]] bool valid =
+ wasm::String::convertWTF8ToWTF16(wtf16, getIString().str);
+ assert(valid);
+ // TODO: Use wtf16.view() once we have C++20.
+ wasm::String::printEscapedJSON(os, wtf16.str());
} else if (isArray()) {
os << '[';
auto first = true;
diff --git a/src/support/string.cpp b/src/support/string.cpp
index c3a9ce4e4..68249f51e 100644
--- a/src/support/string.cpp
+++ b/src/support/string.cpp
@@ -14,6 +14,7 @@
* limitations under the License.
*/
+#include <optional>
#include <ostream>
#include "support/string.h"
@@ -106,7 +107,7 @@ std::string trim(const std::string& input) {
return input.substr(0, size);
}
-std::ostream& printEscaped(std::ostream& os, const std::string_view str) {
+std::ostream& printEscaped(std::ostream& os, std::string_view str) {
os << '"';
for (unsigned char c : str) {
switch (c) {
@@ -140,67 +141,193 @@ std::ostream& printEscaped(std::ostream& os, const std::string_view str) {
return os << '"';
}
-std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) {
- os << '"';
- constexpr uint32_t replacementCharacter = 0xFFFD;
- bool lastWasLeadingSurrogate = false;
- for (size_t i = 0; i < str.size();) {
- // Decode from WTF-8 into a unicode code point.
- uint8_t leading = str[i];
- size_t trailingBytes;
- uint32_t u;
- if ((leading & 0b10000000) == 0b00000000) {
- // 0xxxxxxx
- trailingBytes = 0;
- u = leading;
- } else if ((leading & 0b11100000) == 0b11000000) {
- // 110xxxxx 10xxxxxx
- trailingBytes = 1;
- u = (leading & 0b00011111) << 6;
- } else if ((leading & 0b11110000) == 0b11100000) {
- // 1110xxxx 10xxxxxx 10xxxxxx
- trailingBytes = 2;
- u = (leading & 0b00001111) << 12;
- } else if ((leading & 0b11111000) == 0b11110000) {
- // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- trailingBytes = 3;
- u = (leading & 0b00000111) << 18;
- } else {
- std::cerr << "warning: Bad WTF-8 leading byte (" << std::hex
- << int(leading) << std::dec << "). Replacing.\n";
- trailingBytes = 0;
- u = replacementCharacter;
- }
+namespace {
- ++i;
+std::optional<uint32_t> takeWTF8CodePoint(std::string_view& str) {
+ bool valid = true;
- if (i + trailingBytes > str.size()) {
- std::cerr << "warning: Unexpected end of string. Replacing.\n";
- u = replacementCharacter;
- } else {
- for (size_t j = 0; j < trailingBytes; ++j) {
- uint8_t trailing = str[i + j];
- if ((trailing & 0b11000000) != 0b10000000) {
- std::cerr << "warning: Bad WTF-8 trailing byte (" << std::hex
- << int(trailing) << std::dec << "). Replacing.\n";
- u = replacementCharacter;
- break;
- }
- // Shift 6 bits for every remaining trailing byte after this one.
- u |= (trailing & 0b00111111) << (6 * (trailingBytes - j - 1));
+ if (str.size() == 0) {
+ return std::nullopt;
+ }
+
+ uint8_t leading = str[0];
+ size_t trailingBytes;
+ uint32_t u;
+ if ((leading & 0b10000000) == 0b00000000) {
+ // 0xxxxxxx
+ trailingBytes = 0;
+ u = leading;
+ } else if ((leading & 0b11100000) == 0b11000000) {
+ // 110xxxxx 10xxxxxx
+ trailingBytes = 1;
+ u = (leading & 0b00011111) << 6;
+ } else if ((leading & 0b11110000) == 0b11100000) {
+ // 1110xxxx 10xxxxxx 10xxxxxx
+ trailingBytes = 2;
+ u = (leading & 0b00001111) << 12;
+ } else if ((leading & 0b11111000) == 0b11110000) {
+ // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ trailingBytes = 3;
+ u = (leading & 0b00000111) << 18;
+ } else {
+ // Bad WTF-8 leading byte.
+ trailingBytes = 0;
+ valid = false;
+ }
+
+ if (str.size() <= trailingBytes) {
+ // Unexpected end of string.
+ str = str.substr(str.size());
+ return std::nullopt;
+ }
+
+ if (valid) {
+ for (size_t j = 0; j < trailingBytes; ++j) {
+ uint8_t trailing = str[1 + j];
+ if ((trailing & 0b11000000) != 0b10000000) {
+ // Bad WTF-8 trailing byte.
+ valid = false;
+ break;
}
+ // Shift 6 bits for every remaining trailing byte after this one.
+ u |= (trailing & 0b00111111) << (6 * (trailingBytes - j - 1));
}
+ }
+
+ str = str.substr(1 + trailingBytes);
+ if (!valid) {
+ return std::nullopt;
+ }
+ return u;
+}
+
+std::optional<uint16_t> takeWTF16CodeUnit(std::string_view& str) {
+ if (str.size() < 2) {
+ str = str.substr(str.size());
+ return std::nullopt;
+ }
+
+ // Use a little-endian encoding.
+ uint16_t u = uint8_t(str[0]) | (uint8_t(str[1]) << 8);
+ str = str.substr(2);
+ return u;
+}
+
+std::optional<uint32_t> takeWTF16CodePoint(std::string_view& str) {
+ auto u = takeWTF16CodeUnit(str);
+ if (!u) {
+ return std::nullopt;
+ }
+
+ if (0xD800 <= *u && *u < 0xDC00) {
+ // High surrogate; take the next low surrogate if it exists.
+ auto next = str;
+ auto low = takeWTF16CodeUnit(next);
+ if (low && 0xDC00 <= *low && *low < 0xE000) {
+ str = next;
+ uint16_t highBits = *u - 0xD800;
+ uint16_t lowBits = *low - 0xDC00;
+ return 0x10000 + ((highBits << 10) | lowBits);
+ }
+ }
+
+ return *u;
+}
+
+void writeWTF16CodeUnit(std::ostream& os, uint16_t u) {
+ // Little-endian encoding.
+ os << uint8_t(u & 0xFF);
+ os << uint8_t(u >> 8);
+}
+
+constexpr uint32_t replacementCharacter = 0xFFFD;
+
+} // anonymous namespace
+
+std::ostream& writeWTF8CodePoint(std::ostream& os, uint32_t u) {
+ assert(u < 0x110000);
+ if (u < 0x80) {
+ // 0xxxxxxx
+ os << uint8_t(u);
+ } else if (u < 0x800) {
+ // 110xxxxx 10xxxxxx
+ os << uint8_t(0b11000000 | ((u >> 6) & 0b00011111));
+ os << uint8_t(0b10000000 | ((u >> 0) & 0b00111111));
+ } else if (u < 0x10000) {
+ // 1110xxxx 10xxxxxx 10xxxxxx
+ os << uint8_t(0b11100000 | ((u >> 12) & 0b00001111));
+ os << uint8_t(0b10000000 | ((u >> 6) & 0b00111111));
+ os << uint8_t(0b10000000 | ((u >> 0) & 0b00111111));
+ } else {
+ // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ os << uint8_t(0b11110000 | ((u >> 18) & 0b00000111));
+ os << uint8_t(0b10000000 | ((u >> 12) & 0b00111111));
+ os << uint8_t(0b10000000 | ((u >> 6) & 0b00111111));
+ os << uint8_t(0b10000000 | ((u >> 0) & 0b00111111));
+ }
+ return os;
+}
+
+std::ostream& writeWTF16CodePoint(std::ostream& os, uint32_t u) {
+ assert(u < 0x110000);
+ if (u < 0x10000) {
+ writeWTF16CodeUnit(os, u);
+ } else {
+ // Encode with a surrogate pair.
+ uint16_t high = 0xD800 + ((u - 0x10000) >> 10);
+ uint16_t low = 0xDC00 + ((u - 0x10000) & 0x3FF);
+ writeWTF16CodeUnit(os, high);
+ writeWTF16CodeUnit(os, low);
+ }
+ return os;
+}
+
+bool convertWTF8ToWTF16(std::ostream& os, std::string_view str) {
+ bool valid = true;
+ bool lastWasLeadingSurrogate = false;
- i += trailingBytes;
+ while (str.size()) {
+ auto u = takeWTF8CodePoint(str);
+ if (!u) {
+ valid = false;
+ u = replacementCharacter;
+ }
- bool isLeadingSurrogate = 0xD800 <= u && u <= 0xDBFF;
- bool isTrailingSurrogate = 0xDC00 <= u && u <= 0xDFFF;
+ bool isLeadingSurrogate = 0xD800 <= *u && *u < 0xDC00;
+ bool isTrailingSurrogate = 0xDC00 <= *u && *u < 0xE000;
if (lastWasLeadingSurrogate && isTrailingSurrogate) {
- std::cerr << "warning: Invalid surrogate sequence in WTF-8.\n";
+ // Invalid surrogate sequence.
+ valid = false;
}
lastWasLeadingSurrogate = isLeadingSurrogate;
- // Encode unicode code point into JSON.
+ writeWTF16CodePoint(os, *u);
+ }
+
+ return valid;
+}
+
+bool convertWTF16ToWTF8(std::ostream& os, std::string_view str) {
+ bool valid = true;
+
+ while (str.size()) {
+ auto u = takeWTF16CodePoint(str);
+ if (!u) {
+ valid = false;
+ u = replacementCharacter;
+ }
+ writeWTF8CodePoint(os, *u);
+ }
+
+ return valid;
+}
+
+std::ostream& printEscapedJSON(std::ostream& os, std::string_view str) {
+ os << '"';
+ while (str.size()) {
+ auto u = *takeWTF16CodePoint(str);
+
+ // Use escape sequences mandated by the JSON spec.
switch (u) {
case '"':
os << "\\\"";
diff --git a/src/support/string.h b/src/support/string.h
index 6fb3f693b..be2c3c6a3 100644
--- a/src/support/string.h
+++ b/src/support/string.h
@@ -75,9 +75,24 @@ inline bool isNumber(const std::string& str) {
return !str.empty() && std::all_of(str.begin(), str.end(), ::isdigit);
}
-std::ostream& printEscaped(std::ostream& os, const std::string_view str);
+std::ostream& printEscaped(std::ostream& os, std::string_view str);
-std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str);
+// `str` must be a valid WTF-16 string.
+std::ostream& printEscapedJSON(std::ostream& os, std::string_view str);
+
+std::ostream& writeWTF8CodePoint(std::ostream& os, uint32_t u);
+
+std::ostream& writeWTF16CodePoint(std::ostream& os, uint32_t u);
+
+// Writes the WTF-16LE encoding of the given WTF-8 string to `os`, inserting
+// replacement characters as necessary when encountering invalid WTF-8. Returns
+// `true` iff the input was valid WTF-8.
+bool convertWTF8ToWTF16(std::ostream& os, std::string_view str);
+
+// Writes the WTF-8 encoding of the given WTF-16LE string to `os`, inserting a
+// replacement character at the end if the string is an odd number of bytes.
+// Returns `true` iff the input was valid WTF-16.
+bool convertWTF16ToWTF8(std::ostream& os, std::string_view str);
} // namespace wasm::String
diff --git a/src/tools/fuzzing/fuzzing.cpp b/src/tools/fuzzing/fuzzing.cpp
index 1c4ee4cc5..c62114c3f 100644
--- a/src/tools/fuzzing/fuzzing.cpp
+++ b/src/tools/fuzzing/fuzzing.cpp
@@ -20,6 +20,7 @@
#include "ir/module-utils.h"
#include "ir/subtypes.h"
#include "ir/type-updating.h"
+#include "support/string.h"
#include "tools/fuzzing/heap-types.h"
#include "tools/fuzzing/parameters.h"
@@ -2465,8 +2466,13 @@ Expression* TranslateToFuzzReader::makeBasicRef(Type type) {
}
return null;
}
- case HeapType::string:
- return builder.makeStringConst(std::to_string(upTo(1024)));
+ case HeapType::string: {
+ auto wtf8 = std::to_string(upTo(1024));
+ std::stringstream wtf16;
+ String::convertWTF8ToWTF16(wtf16, wtf8);
+ // TODO: Use wtf16.view() once we have C++20.
+ return builder.makeStringConst(wtf16.str());
+ }
case HeapType::stringview_wtf8:
return builder.makeStringAs(
StringAsWTF8, makeBasicRef(Type(HeapType::string, NonNullable)));
diff --git a/src/wasm-builder.h b/src/wasm-builder.h
index 7d4991d9a..cc90a8abe 100644
--- a/src/wasm-builder.h
+++ b/src/wasm-builder.h
@@ -1265,12 +1265,17 @@ public:
return makeRefI31(makeConst(value.geti31()));
}
if (type.isString()) {
- // TODO: more than ascii support
- std::string string;
+ // The string is already WTF-16, but we need to convert from `Literals` to
+ // actual string.
+ std::stringstream wtf16;
for (auto c : value.getGCData()->values) {
- string.push_back(c.getInteger());
+ auto u = c.getInteger();
+ assert(u < 0x10000);
+ wtf16 << uint8_t(u & 0xFF);
+ wtf16 << uint8_t(u >> 8);
}
- return makeStringConst(string);
+ // TODO: Use wtf16.view() once we have C++20.
+ return makeStringConst(wtf16.str());
}
if (type.isRef() && type.getHeapType() == HeapType::ext) {
return makeRefAs(ExternExternalize,
diff --git a/src/wasm-interpreter.h b/src/wasm-interpreter.h
index c8031f617..f34cb83be 100644
--- a/src/wasm-interpreter.h
+++ b/src/wasm-interpreter.h
@@ -1900,23 +1900,7 @@ public:
return Flow(NONCONSTANT_FLOW);
}
}
- Flow visitStringConst(StringConst* curr) {
- return Literal(curr->string.toString());
- }
-
- // Returns if there is a non-ascii character in a list of values, looking only
- // up to an index that is provided (not inclusive). If the index is not
- // provided we look in the entire list.
- bool hasNonAsciiUpTo(const Literals& values,
- std::optional<Index> maybeEnd = std::nullopt) {
- Index end = maybeEnd ? *maybeEnd : values.size();
- for (Index i = 0; i < end; ++i) {
- if (uint32_t(values[i].geti32()) > 127) {
- return true;
- }
- }
- return false;
- }
+ Flow visitStringConst(StringConst* curr) { return Literal(curr->string.str); }
Flow visitStringMeasure(StringMeasure* curr) {
// For now we only support JS-style strings.
@@ -1934,12 +1918,6 @@ public:
trap("null ref");
}
- // This is only correct if all the bytes stored in `values` correspond to
- // single unicode code points. See `visitStringWTF16Get` for details.
- if (hasNonAsciiUpTo(data->values)) {
- return Flow(NONCONSTANT_FLOW);
- }
-
return Literal(int32_t(data->values.size()));
}
Flow visitStringConcat(StringConcat* curr) {
@@ -1960,18 +1938,13 @@ public:
if (!leftData || !rightData) {
trap("null ref");
}
- // This is only correct if all the bytes in the left operand correspond
- // to single unicode code points.
- if (hasNonAsciiUpTo(leftData->values)) {
- return Flow(NONCONSTANT_FLOW);
- }
Literals contents;
contents.reserve(leftData->values.size() + rightData->values.size());
- for (Literal l : leftData->values) {
+ for (Literal& l : leftData->values) {
contents.push_back(l);
}
- for (Literal l : rightData->values) {
+ for (Literal& l : rightData->values) {
contents.push_back(l);
}
@@ -2011,11 +1984,6 @@ public:
trap("oob");
}
- // We don't handle non-ascii code points correctly yet.
- if (hasNonAsciiUpTo(refValues)) {
- return Flow(NONCONSTANT_FLOW);
- }
-
for (Index i = 0; i < refValues.size(); i++) {
ptrValues[startVal + i] = refValues[i];
}
@@ -2132,17 +2100,6 @@ public:
trap("string oob");
}
- // This naive indexing approach is only correct if the first `i` bytes
- // stored in `values` each corresponds to a single unicode code point. To
- // implement this correctly in general, we would have to reinterpret the
- // bytes as WTF-8, then count up to the `i`th code point, accounting
- // properly for code points that would be represented by surrogate pairs in
- // WTF-16. Alternatively, we could represent string contents as WTF-16 to
- // begin with.
- if (hasNonAsciiUpTo(values, i + 1)) {
- return Flow(NONCONSTANT_FLOW);
- }
-
return Literal(values[i].geti32());
}
Flow visitStringIterNext(StringIterNext* curr) {
@@ -2178,9 +2135,7 @@ public:
auto startVal = start.getSingleValue().getUnsigned();
auto endVal = end.getSingleValue().getUnsigned();
endVal = std::min<size_t>(endVal, refValues.size());
- if (hasNonAsciiUpTo(refValues, endVal)) {
- return Flow(NONCONSTANT_FLOW);
- }
+
Literals contents;
if (endVal > startVal) {
contents.reserve(endVal - startVal);
diff --git a/src/wasm/literal.cpp b/src/wasm/literal.cpp
index 7c674ffc5..afdc14c72 100644
--- a/src/wasm/literal.cpp
+++ b/src/wasm/literal.cpp
@@ -23,6 +23,7 @@
#include "ir/bits.h"
#include "pretty_printing.h"
#include "support/bits.h"
+#include "support/string.h"
#include "support/utilities.h"
namespace wasm {
@@ -77,12 +78,15 @@ Literal::Literal(std::shared_ptr<GCData> gcData, HeapType type)
(type.isBottom() && !gcData));
}
-Literal::Literal(std::string string)
+Literal::Literal(std::string_view string)
: gcData(nullptr), type(Type(HeapType::string, NonNullable)) {
// TODO: we could in theory internalize strings
+ // Extract individual WTF-16LE code units.
Literals contents;
- for (auto c : string) {
- contents.push_back(Literal(int32_t(c)));
+ assert(string.size() % 2 == 0);
+ for (size_t i = 0; i < string.size(); i += 2) {
+ int32_t u = uint8_t(string[i]) | (uint8_t(string[i + 1]) << 8);
+ contents.push_back(Literal(u));
}
gcData = std::make_shared<GCData>(HeapType::string, contents);
}
@@ -636,10 +640,19 @@ std::ostream& operator<<(std::ostream& o, Literal literal) {
o << "nullstring";
} else {
o << "string(\"";
+ // Convert WTF-16 literals to WTF-16 string.
+ std::stringstream wtf16;
for (auto c : data->values) {
- // TODO: more than ascii
- o << char(c.getInteger());
+ auto u = c.getInteger();
+ assert(u < 0x10000);
+ wtf16 << uint8_t(u & 0xFF);
+ wtf16 << uint8_t(u >> 8);
}
+ // Convert to WTF-8 for printing.
+ // TODO: Use wtf16.view() once we have C++20.
+ [[maybe_unused]] bool valid =
+ String::convertWTF16ToWTF8(o, wtf16.str());
+ assert(valid);
o << "\")";
}
break;
diff --git a/src/wasm/wasm-binary.cpp b/src/wasm/wasm-binary.cpp
index 55cdd726c..f9a643d66 100644
--- a/src/wasm/wasm-binary.cpp
+++ b/src/wasm/wasm-binary.cpp
@@ -23,6 +23,7 @@
#include "ir/type-updating.h"
#include "support/bits.h"
#include "support/debug.h"
+#include "support/string.h"
#include "wasm-binary.h"
#include "wasm-debug.h"
#include "wasm-stack.h"
@@ -527,7 +528,12 @@ void WasmBinaryWriter::writeStrings() {
// The number of strings and then their contents.
o << U32LEB(num);
for (auto& string : sorted) {
- writeInlineString(string.str);
+ // Re-encode from WTF-16 to WTF-8.
+ std::stringstream wtf8;
+ [[maybe_unused]] bool valid = String::convertWTF16ToWTF8(wtf8, string.str);
+ assert(valid);
+ // TODO: Use wtf8.view() once we have C++20.
+ writeInlineString(wtf8.str());
}
finishSection(start);
@@ -2960,7 +2966,13 @@ void WasmBinaryReader::readStrings() {
size_t num = getU32LEB();
for (size_t i = 0; i < num; i++) {
auto string = getInlineString();
- strings.push_back(string);
+ // Re-encode from WTF-8 to WTF-16.
+ std::stringstream wtf16;
+ if (!String::convertWTF8ToWTF16(wtf16, string.str)) {
+ throwError("invalid string constant");
+ }
+ // TODO: Use wtf16.view() once we have C++20.
+ strings.push_back(wtf16.str());
}
}
diff --git a/src/wasm/wasm-s-parser.cpp b/src/wasm/wasm-s-parser.cpp
index 2d4f3fffe..d6df7ab0f 100644
--- a/src/wasm/wasm-s-parser.cpp
+++ b/src/wasm/wasm-s-parser.cpp
@@ -3335,7 +3335,13 @@ Expression* SExpressionWasmBuilder::makeStringConst(Element& s) {
std::vector<char> data;
stringToBinary(*s[1], s[1]->str().str, data);
Name str = std::string_view(data.data(), data.size());
- return Builder(wasm).makeStringConst(str);
+ // Re-encode from WTF-8 to WTF-16.
+ std::stringstream wtf16;
+ if (!String::convertWTF8ToWTF16(wtf16, str.str)) {
+ throw SParseException("invalid string constant", s);
+ }
+ // TODO: Use wtf16.view() once we have C++20.
+ return Builder(wasm).makeStringConst(wtf16.str());
}
Expression* SExpressionWasmBuilder::makeStringMeasure(Element& s,