summary | refs | log | tree | commit | diff
path: root/src/support
diff options
context:
space:
mode:
author Thomas Lively <tlively@google.com> 2024-03-22 16:56:33 -0700
committer GitHub <noreply@github.com> 2024-03-22 23:56:33 +0000
commit b3fea30f84fef3ff7aa77775e00b83ba62d997cc (patch)
tree 53494a466d8e56d34d849d14927817a22f843748 /src/support
parent d3414c3deaebe7ba35731a8c20d7fa5f5a833ca3 (diff)
download binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.tar.gz
binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.tar.bz2
binaryen-b3fea30f84fef3ff7aa77775e00b83ba62d997cc.zip
[Strings] Represent string values as WTF-16 internally (#6418)
WTF-16, i.e. arbitrary sequences of 16-bit values, is the encoding of Java and JavaScript strings, and using the same encoding makes the interpretation of string operations trivial, even when accounting for non-ascii characters. Specifically, use little-endian WTF-16. Re-encode string constants from WTF-8 to WTF-16 in the parsers, then back to WTF-8 in the writers. Update the constructor for string `Literal`s to interpret the string as WTF-16 and store a sequence of WTF-16 code units, i.e. 16-bit integers. Update `Builder::makeConstantExpression` accordingly to convert from the new `Literal` string representation back to a WTF-16 string. Update the interpreter to remove the logic for detecting non-ascii characters and bailing out. The naive implementations of all the string operations are correct now that our string encoding matches the JS string encoding.
Diffstat (limited to 'src/support')
-rw-r--r-- src/support/json.cpp 7
-rw-r--r-- src/support/string.cpp 231
-rw-r--r-- src/support/string.h 19
3 files changed, 202 insertions, 55 deletions
diff --git a/src/support/json.cpp b/src/support/json.cpp
index ab55cc75f..dd94719d4 100644
--- a/src/support/json.cpp
+++ b/src/support/json.cpp
@@ -21,7 +21,12 @@ namespace json {
void Value::stringify(std::ostream& os, bool pretty) {
if (isString()) {
- wasm::String::printEscapedJSON(os, getCString());
+ std::stringstream wtf16;
+ [[maybe_unused]] bool valid =
+ wasm::String::convertWTF8ToWTF16(wtf16, getIString().str);
+ assert(valid);
+ // TODO: Use wtf16.view() once we have C++20.
+ wasm::String::printEscapedJSON(os, wtf16.str());
} else if (isArray()) {
os << '[';
auto first = true;
diff --git a/src/support/string.cpp b/src/support/string.cpp
index c3a9ce4e4..68249f51e 100644
--- a/src/support/string.cpp
+++ b/src/support/string.cpp
@@ -14,6 +14,7 @@
* limitations under the License.
*/
+#include <optional>
#include <ostream>
#include "support/string.h"
@@ -106,7 +107,7 @@ std::string trim(const std::string& input) {
return input.substr(0, size);
}
-std::ostream& printEscaped(std::ostream& os, const std::string_view str) {
+std::ostream& printEscaped(std::ostream& os, std::string_view str) {
os << '"';
for (unsigned char c : str) {
switch (c) {
@@ -140,67 +141,193 @@ std::ostream& printEscaped(std::ostream& os, const std::string_view str) {
return os << '"';
}
-std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) {
- os << '"';
- constexpr uint32_t replacementCharacter = 0xFFFD;
- bool lastWasLeadingSurrogate = false;
- for (size_t i = 0; i < str.size();) {
- // Decode from WTF-8 into a unicode code point.
- uint8_t leading = str[i];
- size_t trailingBytes;
- uint32_t u;
- if ((leading & 0b10000000) == 0b00000000) {
- // 0xxxxxxx
- trailingBytes = 0;
- u = leading;
- } else if ((leading & 0b11100000) == 0b11000000) {
- // 110xxxxx 10xxxxxx
- trailingBytes = 1;
- u = (leading & 0b00011111) << 6;
- } else if ((leading & 0b11110000) == 0b11100000) {
- // 1110xxxx 10xxxxxx 10xxxxxx
- trailingBytes = 2;
- u = (leading & 0b00001111) << 12;
- } else if ((leading & 0b11111000) == 0b11110000) {
- // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- trailingBytes = 3;
- u = (leading & 0b00000111) << 18;
- } else {
- std::cerr << "warning: Bad WTF-8 leading byte (" << std::hex
- << int(leading) << std::dec << "). Replacing.\n";
- trailingBytes = 0;
- u = replacementCharacter;
- }
+namespace {
- ++i;
+std::optional<uint32_t> takeWTF8CodePoint(std::string_view& str) {
+ bool valid = true;
- if (i + trailingBytes > str.size()) {
- std::cerr << "warning: Unexpected end of string. Replacing.\n";
- u = replacementCharacter;
- } else {
- for (size_t j = 0; j < trailingBytes; ++j) {
- uint8_t trailing = str[i + j];
- if ((trailing & 0b11000000) != 0b10000000) {
- std::cerr << "warning: Bad WTF-8 trailing byte (" << std::hex
- << int(trailing) << std::dec << "). Replacing.\n";
- u = replacementCharacter;
- break;
- }
- // Shift 6 bits for every remaining trailing byte after this one.
- u |= (trailing & 0b00111111) << (6 * (trailingBytes - j - 1));
+ if (str.size() == 0) {
+ return std::nullopt;
+ }
+
+ uint8_t leading = str[0];
+ size_t trailingBytes;
+ uint32_t u;
+ if ((leading & 0b10000000) == 0b00000000) {
+ // 0xxxxxxx
+ trailingBytes = 0;
+ u = leading;
+ } else if ((leading & 0b11100000) == 0b11000000) {
+ // 110xxxxx 10xxxxxx
+ trailingBytes = 1;
+ u = (leading & 0b00011111) << 6;
+ } else if ((leading & 0b11110000) == 0b11100000) {
+ // 1110xxxx 10xxxxxx 10xxxxxx
+ trailingBytes = 2;
+ u = (leading & 0b00001111) << 12;
+ } else if ((leading & 0b11111000) == 0b11110000) {
+ // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ trailingBytes = 3;
+ u = (leading & 0b00000111) << 18;
+ } else {
+ // Bad WTF-8 leading byte.
+ trailingBytes = 0;
+ valid = false;
+ }
+
+ if (str.size() <= trailingBytes) {
+ // Unexpected end of string.
+ str = str.substr(str.size());
+ return std::nullopt;
+ }
+
+ if (valid) {
+ for (size_t j = 0; j < trailingBytes; ++j) {
+ uint8_t trailing = str[1 + j];
+ if ((trailing & 0b11000000) != 0b10000000) {
+ // Bad WTF-8 trailing byte.
+ valid = false;
+ break;
}
+ // Shift 6 bits for every remaining trailing byte after this one.
+ u |= (trailing & 0b00111111) << (6 * (trailingBytes - j - 1));
}
+ }
+
+ str = str.substr(1 + trailingBytes);
+ if (!valid) {
+ return std::nullopt;
+ }
+ return u;
+}
+
+std::optional<uint16_t> takeWTF16CodeUnit(std::string_view& str) {
+ if (str.size() < 2) {
+ str = str.substr(str.size());
+ return std::nullopt;
+ }
+
+ // Use a little-endian encoding.
+ uint16_t u = uint8_t(str[0]) | (uint8_t(str[1]) << 8);
+ str = str.substr(2);
+ return u;
+}
+
+std::optional<uint32_t> takeWTF16CodePoint(std::string_view& str) {
+ auto u = takeWTF16CodeUnit(str);
+ if (!u) {
+ return std::nullopt;
+ }
+
+ if (0xD800 <= *u && *u < 0xDC00) {
+ // High surrogate; take the next low surrogate if it exists.
+ auto next = str;
+ auto low = takeWTF16CodeUnit(next);
+ if (low && 0xDC00 <= *low && *low < 0xE000) {
+ str = next;
+ uint16_t highBits = *u - 0xD800;
+ uint16_t lowBits = *low - 0xDC00;
+ return 0x10000 + ((highBits << 10) | lowBits);
+ }
+ }
+
+ return *u;
+}
+
+void writeWTF16CodeUnit(std::ostream& os, uint16_t u) {
+ // Little-endian encoding.
+ os << uint8_t(u & 0xFF);
+ os << uint8_t(u >> 8);
+}
+
+constexpr uint32_t replacementCharacter = 0xFFFD;
+
+} // anonymous namespace
+
+std::ostream& writeWTF8CodePoint(std::ostream& os, uint32_t u) {
+ assert(u < 0x110000);
+ if (u < 0x80) {
+ // 0xxxxxxx
+ os << uint8_t(u);
+ } else if (u < 0x800) {
+ // 110xxxxx 10xxxxxx
+ os << uint8_t(0b11000000 | ((u >> 6) & 0b00011111));
+ os << uint8_t(0b10000000 | ((u >> 0) & 0b00111111));
+ } else if (u < 0x10000) {
+ // 1110xxxx 10xxxxxx 10xxxxxx
+ os << uint8_t(0b11100000 | ((u >> 12) & 0b00001111));
+ os << uint8_t(0b10000000 | ((u >> 6) & 0b00111111));
+ os << uint8_t(0b10000000 | ((u >> 0) & 0b00111111));
+ } else {
+ // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ os << uint8_t(0b11110000 | ((u >> 18) & 0b00000111));
+ os << uint8_t(0b10000000 | ((u >> 12) & 0b00111111));
+ os << uint8_t(0b10000000 | ((u >> 6) & 0b00111111));
+ os << uint8_t(0b10000000 | ((u >> 0) & 0b00111111));
+ }
+ return os;
+}
+
+std::ostream& writeWTF16CodePoint(std::ostream& os, uint32_t u) {
+ assert(u < 0x110000);
+ if (u < 0x10000) {
+ writeWTF16CodeUnit(os, u);
+ } else {
+ // Encode with a surrogate pair.
+ uint16_t high = 0xD800 + ((u - 0x10000) >> 10);
+ uint16_t low = 0xDC00 + ((u - 0x10000) & 0x3FF);
+ writeWTF16CodeUnit(os, high);
+ writeWTF16CodeUnit(os, low);
+ }
+ return os;
+}
+
+bool convertWTF8ToWTF16(std::ostream& os, std::string_view str) {
+ bool valid = true;
+ bool lastWasLeadingSurrogate = false;
- i += trailingBytes;
+ while (str.size()) {
+ auto u = takeWTF8CodePoint(str);
+ if (!u) {
+ valid = false;
+ u = replacementCharacter;
+ }
- bool isLeadingSurrogate = 0xD800 <= u && u <= 0xDBFF;
- bool isTrailingSurrogate = 0xDC00 <= u && u <= 0xDFFF;
+ bool isLeadingSurrogate = 0xD800 <= *u && *u < 0xDC00;
+ bool isTrailingSurrogate = 0xDC00 <= *u && *u < 0xE000;
if (lastWasLeadingSurrogate && isTrailingSurrogate) {
- std::cerr << "warning: Invalid surrogate sequence in WTF-8.\n";
+ // Invalid surrogate sequence.
+ valid = false;
}
lastWasLeadingSurrogate = isLeadingSurrogate;
- // Encode unicode code point into JSON.
+ writeWTF16CodePoint(os, *u);
+ }
+
+ return valid;
+}
+
+bool convertWTF16ToWTF8(std::ostream& os, std::string_view str) {
+ bool valid = true;
+
+ while (str.size()) {
+ auto u = takeWTF16CodePoint(str);
+ if (!u) {
+ valid = false;
+ u = replacementCharacter;
+ }
+ writeWTF8CodePoint(os, *u);
+ }
+
+ return valid;
+}
+
+std::ostream& printEscapedJSON(std::ostream& os, std::string_view str) {
+ os << '"';
+ while (str.size()) {
+ auto u = *takeWTF16CodePoint(str);
+
+ // Use escape sequences mandated by the JSON spec.
switch (u) {
case '"':
os << "\\\"";
diff --git a/src/support/string.h b/src/support/string.h
index 6fb3f693b..be2c3c6a3 100644
--- a/src/support/string.h
+++ b/src/support/string.h
@@ -75,9 +75,24 @@ inline bool isNumber(const std::string& str) {
return !str.empty() && std::all_of(str.begin(), str.end(), ::isdigit);
}
-std::ostream& printEscaped(std::ostream& os, const std::string_view str);
+std::ostream& printEscaped(std::ostream& os, std::string_view str);
-std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str);
+// `str` must be a valid WTF-16 string.
+std::ostream& printEscapedJSON(std::ostream& os, std::string_view str);
+
+std::ostream& writeWTF8CodePoint(std::ostream& os, uint32_t u);
+
+std::ostream& writeWTF16CodePoint(std::ostream& os, uint32_t u);
+
+// Writes the WTF-16LE encoding of the given WTF-8 string to `os`, inserting
+// replacement characters as necessary when encountering invalid WTF-8. Returns
+// `true` iff the input was valid WTF-8.
+bool convertWTF8ToWTF16(std::ostream& os, std::string_view str);
+
+// Writes the WTF-8 encoding of the given WTF-16LE string to `os`, inserting a
+// replacement character at the end if the string is an odd number of bytes.
+// Returns `true` iff the input was valid WTF-16.
+bool convertWTF16ToWTF8(std::ostream& os, std::string_view str);
} // namespace wasm::String