/* * Copyright 2024 WebAssembly Community Group participants * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include "support/string.h" namespace wasm::String { Split::Split(const std::string& input, const NewLineOr& newLineOrDelim) { auto first = input.find("\n", 0); if (first != std::string::npos && first != input.length() - 1) { split(input, "\n"); } else { split(input, newLineOrDelim.delim); } } void Split::split(const std::string& input, const std::string& delim) { size_t lastEnd = 0; while (lastEnd < input.size()) { auto nextDelim = input.find(delim, lastEnd); if (nextDelim == std::string::npos) { nextDelim = input.size(); } (*this).push_back(input.substr(lastEnd, nextDelim - lastEnd)); lastEnd = nextDelim + delim.size(); } needToHandleBracketingOperations = delim != "\n"; } Split handleBracketingOperators(Split split) { if (!split.needToHandleBracketingOperations) { return split; } Split ret; std::string last; int nesting = 0; auto handlePart = [&](std::string part) { if (part.empty()) { return; } for (const char c : part) { if (c == '(' || c == '<' || c == '[' || c == '{') { nesting++; } else if (c == ')' || c == '>' || c == ']' || c == '}') { nesting--; } } if (last.empty()) { last = part; } else { last += ',' + part; } if (nesting == 0) { ret.push_back(last); last.clear(); } }; for (auto& part : split) { handlePart(part); } handlePart(""); if (nesting != 0) { Fatal() << "Asyncify: failed to parse lists"; } return ret; } bool wildcardMatch(const std::string& pattern, const std::string& value) { for (size_t i = 0; i < pattern.size(); i++) { if (pattern[i] == '*') { return wildcardMatch(pattern.substr(i + 1), value.substr(i)) || (value.size() > 0 && wildcardMatch(pattern.substr(i), value.substr(i + 1))); } if (i >= value.size()) { return false; } if (pattern[i] != value[i]) { return false; } } return value.size() == pattern.size(); } std::string trim(const std::string& input) { size_t size = input.size(); while (size > 0 && (isspace(input[size - 1]) || input[size - 1] == '\0')) { size--; } return input.substr(0, size); } std::ostream& printEscaped(std::ostream& os, std::string_view str) { os << '"'; for (unsigned char c : str) { switch (c) { case '\t': os << "\\t"; break; case '\n': os << "\\n"; break; case '\r': os << "\\r"; break; case '"': os << "\\\""; break; case '\'': os << "\\'"; break; case '\\': os << "\\\\"; break; default: { if (c >= 32 && c < 127) { os << c; } else { os << std::hex << '\\' << (c / 16) << (c % 16) << std::dec; } } } } return os << '"'; } namespace { std::optional takeWTF8CodePoint(std::string_view& str) { bool valid = true; if (str.size() == 0) { return std::nullopt; } uint8_t leading = str[0]; size_t trailingBytes; // Initialized only to avoid spurious compiler warnings. uint32_t u = 0; if ((leading & 0b10000000) == 0b00000000) { // 0xxxxxxx trailingBytes = 0; u = leading; } else if ((leading & 0b11100000) == 0b11000000) { // 110xxxxx 10xxxxxx trailingBytes = 1; u = (leading & 0b00011111) << 6; } else if ((leading & 0b11110000) == 0b11100000) { // 1110xxxx 10xxxxxx 10xxxxxx trailingBytes = 2; u = (leading & 0b00001111) << 12; } else if ((leading & 0b11111000) == 0b11110000) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx trailingBytes = 3; u = (leading & 0b00000111) << 18; } else { // Bad WTF-8 leading byte. trailingBytes = 0; valid = false; } if (str.size() <= trailingBytes) { // Unexpected end of string. str = str.substr(str.size()); return std::nullopt; } if (valid) { for (size_t j = 0; j < trailingBytes; ++j) { uint8_t trailing = str[1 + j]; if ((trailing & 0b11000000) != 0b10000000) { // Bad WTF-8 trailing byte. valid = false; break; } // Shift 6 bits for every remaining trailing byte after this one. u |= (trailing & 0b00111111) << (6 * (trailingBytes - j - 1)); } } str = str.substr(1 + trailingBytes); if (!valid) { return std::nullopt; } size_t expectedTrailing = u < 0x80 ? 0 : u < 0x800 ? 1 : u < 0x10000 ? 2 : u < 0x110000 ? 3 : -1; if (trailingBytes != expectedTrailing) { // Overlong encoding or overlarge code point. return std::nullopt; } return u; } std::optional takeWTF16CodeUnit(std::string_view& str) { if (str.size() < 2) { str = str.substr(str.size()); return std::nullopt; } // Use a little-endian encoding. uint16_t u = uint8_t(str[0]) | (uint8_t(str[1]) << 8); str = str.substr(2); return u; } std::optional takeWTF16CodePoint(std::string_view& str, bool allowWTF = true) { auto u = takeWTF16CodeUnit(str); if (!u) { return std::nullopt; } if (0xD800 <= *u && *u < 0xDC00) { // High surrogate; take the next low surrogate if it exists. auto next = str; auto low = takeWTF16CodeUnit(next); if (low && 0xDC00 <= *low && *low < 0xE000) { str = next; uint16_t highBits = *u - 0xD800; uint16_t lowBits = *low - 0xDC00; return 0x10000 + ((highBits << 10) | lowBits); } else if (!allowWTF) { // Unpaired high surrogate. return std::nullopt; } } else if (!allowWTF && 0xDC00 <= *u && *u < 0xE000) { // Unpaired low surrogate. return std::nullopt; } return *u; } void writeWTF16CodeUnit(std::ostream& os, uint16_t u) { // Little-endian encoding. os << uint8_t(u & 0xFF); os << uint8_t(u >> 8); } constexpr uint32_t replacementCharacter = 0xFFFD; bool doConvertWTF16ToWTF8(std::ostream& os, std::string_view str, bool allowWTF) { bool valid = true; while (str.size()) { auto u = takeWTF16CodePoint(str, allowWTF); if (!u) { valid = false; u = replacementCharacter; } writeWTF8CodePoint(os, *u); } return valid; } } // anonymous namespace std::ostream& writeWTF8CodePoint(std::ostream& os, uint32_t u) { assert(u < 0x110000); if (u < 0x80) { // 0xxxxxxx os << uint8_t(u); } else if (u < 0x800) { // 110xxxxx 10xxxxxx os << uint8_t(0b11000000 | ((u >> 6) & 0b00011111)); os << uint8_t(0b10000000 | ((u >> 0) & 0b00111111)); } else if (u < 0x10000) { // 1110xxxx 10xxxxxx 10xxxxxx os << uint8_t(0b11100000 | ((u >> 12) & 0b00001111)); os << uint8_t(0b10000000 | ((u >> 6) & 0b00111111)); os << uint8_t(0b10000000 | ((u >> 0) & 0b00111111)); } else { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx os << uint8_t(0b11110000 | ((u >> 18) & 0b00000111)); os << uint8_t(0b10000000 | ((u >> 12) & 0b00111111)); os << uint8_t(0b10000000 | ((u >> 6) & 0b00111111)); os << uint8_t(0b10000000 | ((u >> 0) & 0b00111111)); } return os; } std::ostream& writeWTF16CodePoint(std::ostream& os, uint32_t u) { assert(u < 0x110000); if (u < 0x10000) { writeWTF16CodeUnit(os, u); } else { // Encode with a surrogate pair. uint16_t high = 0xD800 + ((u - 0x10000) >> 10); uint16_t low = 0xDC00 + ((u - 0x10000) & 0x3FF); writeWTF16CodeUnit(os, high); writeWTF16CodeUnit(os, low); } return os; } #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" bool convertWTF8ToWTF16(std::ostream& os, std::string_view str) { bool valid = true; bool lastWasLeadingSurrogate = false; while (str.size()) { auto u = takeWTF8CodePoint(str); if (!u) { valid = false; u = replacementCharacter; } bool isLeadingSurrogate = 0xD800 <= *u && *u < 0xDC00; bool isTrailingSurrogate = 0xDC00 <= *u && *u < 0xE000; if (lastWasLeadingSurrogate && isTrailingSurrogate) { // Invalid surrogate sequence. valid = false; } lastWasLeadingSurrogate = isLeadingSurrogate; writeWTF16CodePoint(os, *u); } return valid; } #pragma GCC diagnostic pop bool convertWTF16ToWTF8(std::ostream& os, std::string_view str) { return doConvertWTF16ToWTF8(os, str, true); } bool convertUTF16ToUTF8(std::ostream& os, std::string_view str) { return doConvertWTF16ToWTF8(os, str, false); } std::ostream& printEscapedJSON(std::ostream& os, std::string_view str) { os << '"'; while (str.size()) { auto u = *takeWTF16CodePoint(str); // Use escape sequences mandated by the JSON spec. switch (u) { case '"': os << "\\\""; continue; case '\\': os << "\\\\"; continue; case '\b': os << "\\b"; continue; case '\f': os << "\\f"; continue; case '\n': os << "\\n"; continue; case '\r': os << "\\r"; continue; case '\t': os << "\\t"; continue; default: break; } // TODO: To minimize size, consider additionally escaping only other control // characters (u <= 0x1F) and surrogates, emitting everything else directly // assuming a UTF-8 encoding of the JSON text. We don't do this now because // Print.cpp would consider the contents unprintable, messing up our test. bool isNaivelyPrintable = 32 <= u && u < 127; if (isNaivelyPrintable) { assert(u < 0x80 && "need additional logic to emit valid UTF-8"); os << uint8_t(u); continue; } // Escape as '\uXXXX` for code points less than 0x10000 or as a // '\uXXXX\uYYYY' surrogate pair otherwise. auto printEscape = [&os](uint32_t codePoint) { assert(codePoint < 0x10000); os << std::hex << "\\u"; os << ((codePoint & 0xF000) >> 12); os << ((codePoint & 0x0F00) >> 8); os << ((codePoint & 0x00F0) >> 4); os << (codePoint & 0x000F); os << std::dec; }; if (u < 0x10000) { printEscape(u); } else { assert(u <= 0x10FFFF && "unexpectedly high code point"); printEscape(0xD800 + ((u - 0x10000) >> 10)); printEscape(0xDC00 + ((u - 0x10000) & 0x3FF)); } } return os << '"'; } bool isUTF8(std::string_view str) { while (str.size()) { auto u = takeWTF8CodePoint(str); if (!u || (0xD800 <= *u && *u < 0xE000)) { return false; } } return true; } } // namespace wasm::String