diff options
author | Sam Clegg <sbc@chromium.org> | 2021-02-11 16:53:51 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-02-12 00:53:51 +0000 |
commit | 8369064c09964ab77eae2af6f0d8cce58e412e11 (patch) | |
tree | 2f12e2dd1e4ca21332d5f9e89a8c29382826204e /src | |
parent | e283300dbb8bf573584a811bdc973b3731b67486 (diff) | |
download | binaryen-8369064c09964ab77eae2af6f0d8cce58e412e11.tar.gz binaryen-8369064c09964ab77eae2af6f0d8cce58e412e11.tar.bz2 binaryen-8369064c09964ab77eae2af6f0d8cce58e412e11.zip |
finalize: strip segments that contain only EM_ASM/EM_JS data (#3557)
If we find a data segment whose entire contents is EM_JS or EM_ASM
strings then strip it from the binary.
See: https://github.com/emscripten-core/emscripten/pull/13443
Diffstat (limited to 'src')
-rw-r--r-- | src/binaryen-c.h | 4 | ||||
-rw-r--r-- | src/wasm/wasm-emscripten.cpp | 189 |
2 files changed, 144 insertions, 49 deletions
diff --git a/src/binaryen-c.h b/src/binaryen-c.h index ba61db885..eb0cfc6e1 100644 --- a/src/binaryen-c.h +++ b/src/binaryen-c.h @@ -1633,9 +1633,9 @@ BINARYEN_API void BinaryenMemoryInitSetSize(BinaryenExpressionRef expr, // DataDrop -// Gets the index of the segment being dropped by a `memory.drop` expression. +// Gets the index of the segment being dropped by a `data.drop` expression. BINARYEN_API uint32_t BinaryenDataDropGetSegment(BinaryenExpressionRef expr); -// Sets the index of the segment being dropped by a `memory.drop` expression. +// Sets the index of the segment being dropped by a `data.drop` expression. BINARYEN_API void BinaryenDataDropSetSegment(BinaryenExpressionRef expr, uint32_t segmentIndex); diff --git a/src/wasm/wasm-emscripten.cpp b/src/wasm/wasm-emscripten.cpp index 24fb7bcdc..7a593f599 100644 --- a/src/wasm/wasm-emscripten.cpp +++ b/src/wasm/wasm-emscripten.cpp @@ -76,8 +76,7 @@ Global* getStackPointerGlobal(Module& wasm) { const Address UNKNOWN_OFFSET(uint32_t(-1)); -std::string escape(const char* input) { - std::string code = input; +std::string escape(std::string code) { // replace newlines quotes with escaped newlines size_t curr = 0; while ((curr = code.find("\\n", curr)) != std::string::npos) { @@ -109,14 +108,21 @@ class StringConstantTracker { public: StringConstantTracker(Module& wasm) : wasm(wasm) { calcSegmentOffsets(); } - std::string codeForConstAddr(int64_t address) { - const char* str = stringAtAddr(address); - if (!str) { - Fatal() << "unable to find data for ASM/EM_JS const at: " << address; + const char* stringAtAddr(Address address) { + for (unsigned i = 0; i < wasm.memory.segments.size(); ++i) { + Memory::Segment& segment = wasm.memory.segments[i]; + Address offset = segmentOffsets[i]; + if (offset != UNKNOWN_OFFSET && address >= offset && + address < offset + segment.data.size()) { + return &segment.data[address - offset]; + } } - return escape(str); + Fatal() << "unable to find data for ASM/EM_JS const at: " << address; + return nullptr; } + std::vector<Address> segmentOffsets; // segment index => address offset + private: void calcSegmentOffsets() { std::unordered_map<Index, Address> passiveOffsets; @@ -172,20 +178,12 @@ private: } } - const char* stringAtAddr(Address address) { - for (unsigned i = 0; i < wasm.memory.segments.size(); ++i) { - Memory::Segment& segment = wasm.memory.segments[i]; - Address offset = segmentOffsets[i]; - if (offset != UNKNOWN_OFFSET && address >= offset && - address < offset + segment.data.size()) { - return &segment.data[address - offset]; - } - } - return nullptr; - } - Module& wasm; - std::vector<Address> segmentOffsets; // segment index => address offset +}; + +struct AsmConst { + Address id; + std::string code; }; struct AsmConstWalker : public LinearExecutionWalker<AsmConstWalker> { @@ -193,11 +191,6 @@ struct AsmConstWalker : public LinearExecutionWalker<AsmConstWalker> { bool minimizeWasmChanges; StringConstantTracker stringTracker; - struct AsmConst { - Address id; - std::string code; - }; - std::vector<AsmConst> asmConsts; // last sets in the current basic block, per index std::map<Index, LocalSet*> sets; @@ -292,9 +285,8 @@ void AsmConstWalker::visitCall(Call* curr) { } auto* value = arg->cast<Const>(); - int64_t address = value->value.getInteger(); - auto code = stringTracker.codeForConstAddr(address); - createAsmConst(address, code); + Address address = value->value.getInteger(); + asmConsts.push_back({address, stringTracker.stringAtAddr(address)}); } void AsmConstWalker::process() { @@ -305,24 +297,105 @@ void AsmConstWalker::process() { addImports(); } -void AsmConstWalker::createAsmConst(uint64_t id, std::string code) { - AsmConst asmConst; - asmConst.id = id; - asmConst.code = code; - asmConsts.push_back(asmConst); -} - void AsmConstWalker::addImports() { for (auto& import : queuedImports) { wasm.addFunction(import.release()); } } -static AsmConstWalker findEmAsmConstsAndReturnWalker(Module& wasm, - bool minimizeWasmChanges) { - AsmConstWalker walker(wasm, minimizeWasmChanges); - walker.process(); - return walker; +struct SegmentRemover : WalkerPass<PostWalker<SegmentRemover>> { + SegmentRemover(Index segment) : segment(segment) {} + + bool isFunctionParallel() override { return true; } + + Pass* create() override { return new SegmentRemover(segment); } + + void visitMemoryInit(MemoryInit* curr) { + if (segment == curr->segment) { + Builder builder(*getModule()); + replaceCurrent(builder.blockify(builder.makeDrop(curr->dest), + builder.makeDrop(curr->offset), + builder.makeDrop(curr->size))); + } + } + + void visitDataDrop(DataDrop* curr) { + if (segment == curr->segment) { + Builder builder(*getModule()); + replaceCurrent(builder.makeNop()); + } + } + + Index segment; +}; + +static void removeSegment(Module& wasm, Index segment) { + PassRunner runner(&wasm); + SegmentRemover(segment).run(&runner, &wasm); + // Resize the segment to zero. In theory we should completely remove it + // but that would mean re-numbering the segments that follow which is + // non-trivial. + wasm.memory.segments[segment].data.resize(0); +} + +static Address getExportedAddress(Module& wasm, Export* export_) { + Global* g = wasm.getGlobal(export_->value); + auto* addrConst = g->init->dynCast<Const>(); + return addrConst->value.getInteger(); +} + +static std::vector<AsmConst> findEmAsmConsts(Module& wasm, + bool minimizeWasmChanges) { + Export* start = wasm.getExportOrNull("__start_em_asm"); + Export* end = wasm.getExportOrNull("__stop_em_asm"); + + // Older versions of emscripten don't export these symbols. Instead + // we run AsmConstWalker in an attempt to derive the string addresses + // from the code. + if (!start || !end) { + AsmConstWalker walker(wasm, minimizeWasmChanges); + walker.process(); + return walker.asmConsts; + } + + // Newer version of emscripten export this symbols and we + // can use it ot find all the EM_ASM constants. Sadly __start_em_asm and + // __stop_em_asm don't alwasy mark the start and end of segment because in + // dynamic linking we merge all data segments into one. + std::vector<AsmConst> asmConsts; + StringConstantTracker stringTracker(wasm); + Address startAddress = getExportedAddress(wasm, start); + Address endAddress = getExportedAddress(wasm, end); + for (Index i = 0; i < wasm.memory.segments.size(); i++) { + Address segmentStart = stringTracker.segmentOffsets[i]; + size_t segmentSize = wasm.memory.segments[i].data.size(); + if (segmentStart <= startAddress && + segmentStart + segmentSize >= endAddress) { + Address address = startAddress; + while (address < endAddress) { + auto code = stringTracker.stringAtAddr(address); + asmConsts.push_back({address, code}); + address.addr += strlen(code) + 1; + } + + if (segmentStart == startAddress && + segmentStart + segmentSize == endAddress) { + removeSegment(wasm, i); + } else { + // If we can't remove the whole segment then just set the string + // data to zero. + size_t segmentOffset = startAddress - segmentStart; + char* startElem = &wasm.memory.segments[i].data[segmentOffset]; + memset(startElem, 0, endAddress - startAddress); + } + break; + } + } + + assert(asmConsts.size()); + wasm.removeExport("__start_em_asm"); + wasm.removeExport("__stop_em_asm"); + return asmConsts; } struct EmJsWalker : public PostWalker<EmJsWalker> { @@ -331,6 +404,7 @@ struct EmJsWalker : public PostWalker<EmJsWalker> { std::vector<Export> toRemove; std::map<std::string, std::string> codeByName; + std::map<Address, size_t> codeAddresses; // map from address to string len EmJsWalker(Module& _wasm) : wasm(_wasm), stringTracker(_wasm) {} @@ -353,8 +427,9 @@ struct EmJsWalker : public PostWalker<EmJsWalker> { } auto* addrConst = consts.list[0]; int64_t address = addrConst->value.getInteger(); - auto code = stringTracker.codeForConstAddr(address); + auto code = stringTracker.stringAtAddr(address); codeByName[funcName] = code; + codeAddresses[address] = strlen(code) + 1; } }; @@ -366,6 +441,27 @@ EmJsWalker findEmJsFuncsAndReturnWalker(Module& wasm) { wasm.removeExport(exp.name); wasm.removeFunction(exp.value); } + + // With newer versions of emscripten/llvm we pack all EM_JS strings into + // single segment. + // We can detect this by checking for segments that contain only JS strings. + // When we find such segements we remove them from the final binary. + for (Index i = 0; i < wasm.memory.segments.size(); i++) { + Address start = walker.stringTracker.segmentOffsets[0]; + Address cur = start; + + while (cur < start + wasm.memory.segments[i].data.size()) { + if (walker.codeAddresses.count(cur) == 0) { + break; + } + cur.addr += walker.codeAddresses[cur]; + } + + if (cur == start + wasm.memory.segments[i].data.size()) { + // Entire segment is contains JS strings. Remove it. + removeSegment(wasm, i); + } + } return walker; } @@ -383,16 +479,15 @@ std::string EmscriptenGlueGenerator::generateEmscriptenMetadata() { std::stringstream meta; meta << "{\n"; - AsmConstWalker emAsmWalker = - findEmAsmConstsAndReturnWalker(wasm, minimizeWasmChanges); + std::vector<AsmConst> asmConsts = findEmAsmConsts(wasm, minimizeWasmChanges); // print commaFirst = true; - if (!emAsmWalker.asmConsts.empty()) { + if (!asmConsts.empty()) { meta << " \"asmConsts\": {"; - for (auto& asmConst : emAsmWalker.asmConsts) { + for (auto& asmConst : asmConsts) { meta << nextElement(); - meta << '"' << asmConst.id << "\": \"" << asmConst.code << "\""; + meta << '"' << asmConst.id << "\": \"" << escape(asmConst.code) << "\""; } meta << "\n },\n"; } @@ -405,7 +500,7 @@ std::string EmscriptenGlueGenerator::generateEmscriptenMetadata() { auto& name = pair.first; auto& code = pair.second; meta << nextElement(); - meta << '"' << name << "\": \"" << code << '"'; + meta << '"' << name << "\": \"" << escape(code) << '"'; } meta << "\n },\n"; } |