diff options
author | Thomas Lively <tlively@google.com> | 2022-10-11 11:16:14 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-10-11 16:16:14 +0000 |
commit | b83450ed1fd98cec4453024f57f892b31851ea50 (patch) | |
tree | bf0467d96c9966d0f4699ea0afcdf25905b4098c /src/emscripten-optimizer | |
parent | 6d4ac3162c290e32a98de349d49e26e904a40414 (diff) | |
download | binaryen-b83450ed1fd98cec4453024f57f892b31851ea50.tar.gz binaryen-b83450ed1fd98cec4453024f57f892b31851ea50.tar.bz2 binaryen-b83450ed1fd98cec4453024f57f892b31851ea50.zip |
Make `Name` a pointer, length pair (#5122)
With the goal of supporting null characters (i.e. zero bytes) in strings.
Rewrite the underlying interned `IString` to store a `std::string_view` rather
than a `const char*`, reduce the number of map lookups necessary to intern a
string, and present a more immutable interface.
Most importantly, replace the `c_str()` method that returned a `const char*`
with a `toString()` method that returns a `std::string`. This new method can
correctly handle strings containing null characters. A `const char*` can still
be had by calling `data()` on the `std::string_view`, although this usage should
be discouraged.
This change is NFC in spirit, although not in practice. It does not intend to
support any particular new functionality, but it is probably now possible to use
strings containing null characters in at least some cases. At least one parser
bug is also incidentally fixed. Follow-on PRs will explicitly support and test
strings containing nulls for particular use cases.
The C API still uses `const char*` to represent strings. As strings containing
nulls become better supported by the rest of Binaryen, this will no longer be
sufficient. Updating the C and JS APIs to use pointer, length pairs is left as
future work.
Diffstat (limited to 'src/emscripten-optimizer')
-rw-r--r-- | src/emscripten-optimizer/istring.h | 218 | ||||
-rw-r--r-- | src/emscripten-optimizer/optimizer-shared.cpp | 2 | ||||
-rw-r--r-- | src/emscripten-optimizer/optimizer.h | 1 | ||||
-rw-r--r-- | src/emscripten-optimizer/parser.cpp | 2 | ||||
-rw-r--r-- | src/emscripten-optimizer/parser.h | 49 | ||||
-rw-r--r-- | src/emscripten-optimizer/simple_ast.cpp | 14 | ||||
-rw-r--r-- | src/emscripten-optimizer/simple_ast.h | 19 |
7 files changed, 61 insertions, 244 deletions
diff --git a/src/emscripten-optimizer/istring.h b/src/emscripten-optimizer/istring.h deleted file mode 100644 index 69df761fd..000000000 --- a/src/emscripten-optimizer/istring.h +++ /dev/null @@ -1,218 +0,0 @@ -/* - * Copyright 2015 WebAssembly Community Group participants - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Interned String type, 100% interned on creation. Comparisons are always just -// a pointer comparison - -#ifndef wasm_istring_h -#define wasm_istring_h - -#include <set> -#include <unordered_map> -#include <unordered_set> - -#include <assert.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include "support/threads.h" -#include "support/utilities.h" - -namespace cashew { - -struct IString { - const char* str = nullptr; - - static size_t - hash_c(const char* str) { // see http://www.cse.yorku.ca/~oz/hash.html - unsigned int hash = 5381; - int c; - while ((c = *str++)) { - hash = ((hash << 5) + hash) ^ c; - } - return (size_t)hash; - } - - class CStringHash { - public: - size_t operator()(const char* str) const { return IString::hash_c(str); } - }; - class CStringEqual { - public: - bool operator()(const char* x, const char* y) const { - return strcmp(x, y) == 0; - } - }; - - IString() = default; - // if reuse=true, then input is assumed to remain alive; not copied - IString(const char* s, bool reuse = true) { - assert(s); - set(s, reuse); - } - - void set(const char* s, bool reuse = true) { - typedef std::unordered_set<const char*, CStringHash, CStringEqual> - StringSet; - // one global store of strings per thread, we must not access this - // in parallel - thread_local static StringSet strings; - - auto existing = strings.find(s); - - if (existing == strings.end()) { - // if the string isn't already known, we must use a single global - // storage location, guarded by a mutex, so each string is allocated - // exactly once - static std::mutex mutex; - std::unique_lock<std::mutex> lock(mutex); - // a single global set contains the actual strings, so we allocate each - // one exactly once. - static StringSet globalStrings; - auto globalExisting = globalStrings.find(s); - if (globalExisting == globalStrings.end()) { - if (!reuse) { - static std::vector<std::unique_ptr<std::string>> allocated; - allocated.emplace_back(wasm::make_unique<std::string>(s)); - s = allocated.back()->c_str(); // we'll never modify it, so this is ok - } - // insert into global set - globalStrings.insert(s); - } else { - s = *globalExisting; - } - // add the string to our thread-local set - strings.insert(s); - } else { - s = *existing; - } - - str = s; - } - - void set(const IString& s) { str = s.str; } - - void clear() { str = nullptr; } - - bool operator==(const IString& other) const { - // assert((str == other.str) == !strcmp(str, other.str)); - return str == other.str; // fast! - } - bool operator!=(const IString& other) const { - // assert((str == other.str) == !strcmp(str, other.str)); - return str != other.str; // fast! - } - bool operator<(const IString& other) const { - return strcmp(str ? str : "", other.str ? other.str : "") < 0; - } - - char operator[](int x) const { return str[x]; } - - bool operator!() const { // no string, or empty string - return !str || str[0] == 0; - } - - const char* c_str() const { return str; } - bool equals(const char* other) const { return !strcmp(str, other); } - - bool is() const { return str != nullptr; } - bool isNull() const { return str == nullptr; } - - const char* stripPrefix(const char* prefix) const { - const char* ptr = str; - while (true) { - if (*prefix == 0) { - return ptr; - } - if (*ptr == 0) { - return nullptr; - } - if (*ptr++ != *prefix++) { - return nullptr; - } - } - } - - bool startsWith(const char* prefix) const { - return stripPrefix(prefix) != nullptr; - } - bool startsWith(const IString& prefix) const { - return startsWith(prefix.str); - } - - size_t size() const { return str ? strlen(str) : 0; } -}; - -} // namespace cashew - -// Utilities for creating hashmaps/sets over IStrings - -namespace std { - -template<> struct hash<cashew::IString> { - size_t operator()(const cashew::IString& str) const { - return std::hash<size_t>{}(size_t(str.str)); - } -}; - -template<> struct equal_to<cashew::IString> { - bool operator()(const cashew::IString& x, const cashew::IString& y) const { - return x == y; - } -}; - -} // namespace std - -namespace cashew { - -// IStringSet - -class IStringSet : public std::unordered_set<IString> { - std::vector<char> data; - -public: - IStringSet() = default; - IStringSet(const char* init) { // comma-delimited list - int size = strlen(init) + 1; - data.resize(size); - char* curr = &data[0]; - strncpy(curr, init, size); - while (1) { - char* end = strchr(curr, ' '); - if (end) { - *end = 0; - } - insert(curr); - if (!end) { - break; - } - curr = end + 1; - } - } - - bool has(const IString& str) { return count(str) > 0; } -}; - -class IOrderedStringSet : public std::set<IString> { -public: - bool has(const IString& str) { return count(str) > 0; } -}; - -} // namespace cashew - -#endif // wasm_istring_h diff --git a/src/emscripten-optimizer/optimizer-shared.cpp b/src/emscripten-optimizer/optimizer-shared.cpp index 8208f6f47..9ca66441a 100644 --- a/src/emscripten-optimizer/optimizer-shared.cpp +++ b/src/emscripten-optimizer/optimizer-shared.cpp @@ -19,8 +19,6 @@ #include "optimizer.h" #include "support/safe_integer.h" -using namespace cashew; - IString JS_FLOAT_ZERO; IString SIMD_INT8X16_CHECK("SIMD_Int8x16_check"); diff --git a/src/emscripten-optimizer/optimizer.h b/src/emscripten-optimizer/optimizer.h index 132ff6fb8..9e5c7afe8 100644 --- a/src/emscripten-optimizer/optimizer.h +++ b/src/emscripten-optimizer/optimizer.h @@ -20,6 +20,7 @@ #include "simple_ast.h" using namespace cashew; +using IString = wasm::IString; extern IString JS_FLOAT_ZERO; diff --git a/src/emscripten-optimizer/parser.cpp b/src/emscripten-optimizer/parser.cpp index 016966299..533043232 100644 --- a/src/emscripten-optimizer/parser.cpp +++ b/src/emscripten-optimizer/parser.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include <unordered_map> + #include "parser.h" namespace cashew { diff --git a/src/emscripten-optimizer/parser.h b/src/emscripten-optimizer/parser.h index c4d596058..8c3f36427 100644 --- a/src/emscripten-optimizer/parser.h +++ b/src/emscripten-optimizer/parser.h @@ -30,11 +30,46 @@ #include <limits> #include <vector> -#include "istring.h" +#include "support/istring.h" #include "support/safe_integer.h" namespace cashew { +using IString = wasm::IString; + +// IStringSet + +class IStringSet : public std::unordered_set<IString> { + std::vector<char> data; + +public: + IStringSet() = default; + IStringSet(const char* init) { // comma-delimited list + int size = strlen(init) + 1; + data.resize(size); + char* curr = &data[0]; + strncpy(curr, init, size); + while (1) { + char* end = strchr(curr, ' '); + if (end) { + *end = 0; + } + insert(curr); + if (!end) { + break; + } + curr = end + 1; + } + } + + bool has(const IString& str) { return count(str) > 0; } +}; + +class IOrderedStringSet : public std::set<IString> { +public: + bool has(const IString& str) { return count(str) > 0; } +}; + // common strings extern IString TOPLEVEL; @@ -233,11 +268,11 @@ template<class NodeRef, class Builder> class Parser { src++; } if (*src == 0) { - str.set(start); + str = IString(start); } else { char temp = *src; *src = 0; - str.set(start, false); + str = IString(start, false); *src = temp; } type = keywords.has(str) ? KEYWORD : IDENT; @@ -333,11 +368,11 @@ template<class NodeRef, class Builder> class Parser { default: abort(); } - size = strlen(str.str); + size = str.size(); #ifndef NDEBUG char temp = start[size]; start[size] = 0; - assert(strcmp(str.str, start) == 0); + assert(str.str == start); start[size] = temp; #endif type = OPERATOR; @@ -346,13 +381,13 @@ template<class NodeRef, class Builder> class Parser { type = SEPARATOR; char temp = src[1]; src[1] = 0; - str.set(src, false); + str = IString(src, false); src[1] = temp; src++; } else if (*src == '"' || *src == '\'') { char* end = strchr(src + 1, *src); *end = 0; - str.set(src + 1); + str = IString(src + 1); src = end + 1; type = STRING; } else { diff --git a/src/emscripten-optimizer/simple_ast.cpp b/src/emscripten-optimizer/simple_ast.cpp index 6afa0a66e..086d504ba 100644 --- a/src/emscripten-optimizer/simple_ast.cpp +++ b/src/emscripten-optimizer/simple_ast.cpp @@ -24,13 +24,11 @@ Ref& Ref::operator[](unsigned x) { return (*get())[x]; } Ref& Ref::operator[](IString x) { return (*get())[x]; } -bool Ref::operator==(const char* str) { - return get()->isString() && !strcmp(get()->str.str, str); +bool Ref::operator==(std::string_view str) { + return get()->isString() && get()->str == str; } -bool Ref::operator!=(const char* str) { - return get()->isString() ? !!strcmp(get()->str.str, str) : true; -} +bool Ref::operator!=(std::string_view str) { return !(*this == str); } bool Ref::operator==(const IString& str) { return get()->isString() && get()->str == str; @@ -81,8 +79,8 @@ void Value::stringify(std::ostream& os, bool pretty) { } switch (type) { case String: { - if (str.str) { - os << '"' << str.str << '"'; + if (str) { + os << '"' << str << '"'; } else { os << "\"(null)\""; } @@ -147,7 +145,7 @@ void Value::stringify(std::ostream& os, bool pretty) { } } indentify(); - os << '"' << i.first.c_str() << "\": "; + os << '"' << i.first << "\": "; i.second->stringify(os, pretty); } if (pretty) { diff --git a/src/emscripten-optimizer/simple_ast.h b/src/emscripten-optimizer/simple_ast.h index 64db04d79..60399d6d2 100644 --- a/src/emscripten-optimizer/simple_ast.h +++ b/src/emscripten-optimizer/simple_ast.h @@ -63,8 +63,9 @@ struct Ref { Ref& operator[](IString x); // special conveniences - bool operator==(const char* str); // comparison to string, which is by value - bool operator!=(const char* str); + bool + operator==(std::string_view str); // comparison to string, which is by value + bool operator!=(std::string_view str); bool operator==(const IString& str); bool operator!=(const IString& str); // prevent Ref == number, which is potentially ambiguous; use ->getNumber() == @@ -163,13 +164,13 @@ struct Value { Value& setString(const char* s) { free(); type = String; - str.set(s); + str = IString(s); return *this; } Value& setString(const IString& s) { free(); type = String; - str.set(s); + str = s; return *this; } Value& setNumber(double n) { @@ -233,7 +234,7 @@ struct Value { const char* getCString() { assert(isString()); - return str.str; + return str.str.data(); } IString& getIString() { assert(isString()); @@ -817,7 +818,7 @@ struct JSPrinter { break; } default: { - errv("cannot yet print %s\n", type.str); + errv("cannot yet print %s\n", type.str.data()); abort(); } } @@ -908,7 +909,7 @@ struct JSPrinter { void printAssignName(Ref node) { auto* assign = node->asAssignName(); - emit(assign->target().c_str()); + emit(assign->target().str.data()); space(); emit('='); space(); @@ -1472,10 +1473,10 @@ struct JSPrinter { needQuote = true; str = args[i][0][1]->getCString(); } else if (args[i][0][0] == GETTER) { - getterSetter = GETTER.c_str(); + getterSetter = GETTER.str.data(); str = args[i][0][1]->getCString(); } else if (args[i][0][0] == SETTER) { - getterSetter = SETTER.c_str(); + getterSetter = SETTER.str.data(); str = args[i][0][1]->getCString(); setterParam = args[i][0][2]->getCString(); } else { |