diff options
author | Wouter van Oortmerssen <aardappel@gmail.com> | 2020-01-27 14:05:21 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-01-27 14:05:21 -0800 |
commit | ac29d4f571f54ce3c99243048ebdac6cfe6cc671 (patch) | |
tree | 326acb7841b6092f007744fe55a2c48f98d77ca8 | |
parent | 2132abdd621a354a25af6bc67d1bb603c6b3c4dc (diff) | |
download | wabt-ac29d4f571f54ce3c99243048ebdac6cfe6cc671.tar.gz wabt-ac29d4f571f54ce3c99243048ebdac6cfe6cc671.tar.bz2 wabt-ac29d4f571f54ce3c99243048ebdac6cfe6cc671.zip |
wasm-decompile: use symbols from linking section for names. (#1318)
This allows wasm .o files to have more readable names, or even final
linked modules if the linking information is preserved (with e.g.
--emit-relocs in LLD).
This is implemented as part of the WABT IR representation, so
benefits wasm2wat as well.
Names obtained this way are only set for functions if the function
doesn't also have a name in the name section, but are preferred over
the export name if there is one.
-rw-r--r-- | docs/decompiler.md | 17 | ||||
-rw-r--r-- | src/binary-reader-ir.cc | 78 | ||||
-rw-r--r-- | src/decompiler-ast.h | 2 | ||||
-rw-r--r-- | src/decompiler-naming.h | 18 | ||||
-rw-r--r-- | src/decompiler.cc | 32 | ||||
-rw-r--r-- | src/generate-names.cc | 3 | ||||
-rw-r--r-- | src/generate-names.h | 1 | ||||
-rw-r--r-- | src/tools/wasm-decompile.cc | 3 | ||||
-rw-r--r-- | src/tools/wasm2wat.cc | 2 | ||||
-rw-r--r-- | test/README.md | 5 | ||||
-rw-r--r-- | test/binary/names.txt | 94 | ||||
-rw-r--r-- | test/decompile/names.txt | 116 | ||||
-rwxr-xr-x | test/run-tests.py | 6 |
13 files changed, 343 insertions, 34 deletions
diff --git a/docs/decompiler.md b/docs/decompiler.md index 5dcd78c8..c2d931d5 100644 --- a/docs/decompiler.md +++ b/docs/decompiler.md @@ -35,8 +35,9 @@ already be mostly familiar with that. ### Naming. -wasm-decompile, much like wasm2wat, derives names from import/export -declarations and the name section where possible. For things that have no +wasm-decompile, much like wasm2wat, derives names from the name section +(preferrably), or linker symbols (if available), or import/export (if not +available in the other 2). For things that have no names, names are generated starting from `a`, `b`, `c` and so forth. In addition, prefixes are used for things that are not arguments/locals: @@ -48,6 +49,11 @@ long. Besides removing characters not typically part of an identifier, the decompiler also strips common keywords/types from these in an effort to reduce their size. +Linker symbols are typically only available in wasm .o files, though if useful +for naming can be retained in fully linked wasm modules using the +`--emit-reloc` flag to `wasm.ld`. This gives you names for most functions +even when `--strip-debug` was used. + ### Top level declarations. Top level items may be preceded with `import` or `export`. @@ -98,7 +104,7 @@ These tend to be the hardest to "read" in Wasm code, as they've lost all context of the data structures and types the language that Wasm was compiled from was operating upon. -wasm-decompile has a few feature to try and make these more readable. +wasm-decompile has a few features to try and make these more readable. The basic form looks like an array indexing operation, so `o[2]:int` says: read element 2 from `o` when seen as an array of ints. This thus accesses 4 bytes @@ -122,6 +128,11 @@ back to indexing operations when there are holes or overlaps in the memory layout, or types are mixed, etc. This happens even more so when locals such as `o` are being re-used for unrelated things in memory. 
+For accesses that are not contiguous, but at least of the same type, the +decompiler will change the pointer type from `o:int` to e.g. `o:float_ptr` (and +similarly, it will omit the type from the actual access, `o[2]` instead +of `o[2]:int`). + Additionally, wasm-decompile tried to clean up typical indexing operations. For example, when accessing any array of 32-bit elements, generated Wasm code often looks like `(base + (index << 2))[0]:int`, since Wasm has no diff --git a/src/binary-reader-ir.cc b/src/binary-reader-ir.cc index 9fae6168..9f16a380 100644 --- a/src/binary-reader-ir.cc +++ b/src/binary-reader-ir.cc @@ -247,6 +247,17 @@ class BinaryReaderIR : public BinaryReaderNop { Result OnInitExprRefNull(Index index) override; Result OnInitExprRefFunc(Index index, Index func_index) override; + Result OnDataSymbol(Index index, uint32_t flags, string_view name, + Index segment, uint32_t offset, uint32_t size) override; + Result OnFunctionSymbol(Index index, uint32_t flags, string_view name, + Index func_index) override; + Result OnGlobalSymbol(Index index, uint32_t flags, string_view name, + Index global_index) override; + Result OnSectionSymbol(Index index, uint32_t flags, + Index section_index) override; + Result OnEventSymbol(Index index, uint32_t flags, string_view name, + Index event_index) override; + private: Location GetLocation() const; void PrintError(const char* format, ...); @@ -1238,6 +1249,73 @@ Result BinaryReaderIR::OnEventType(Index index, Index sig_index) { return Result::Ok; } +Result BinaryReaderIR::OnDataSymbol(Index index, uint32_t flags, + string_view name, Index segment, + uint32_t offset, uint32_t size) { + if (name.empty()) { + return Result::Ok; + } + if (offset) { + // If it is pointing into the data segment, then it's not really naming + // the whole segment. 
+ return Result::Ok; + } + DataSegment* seg = module_->data_segments[segment]; + std::string dollar_name = + GetUniqueName(&module_->data_segment_bindings, MakeDollarName(name)); + seg->name = dollar_name; + module_->data_segment_bindings.emplace(dollar_name, Binding(segment)); + return Result::Ok; +} + +Result BinaryReaderIR::OnFunctionSymbol(Index index, uint32_t flags, + string_view name, Index func_index) { + if (name.empty()) { + return Result::Ok; + } + Func* func = module_->funcs[func_index]; + if (!func->name.empty()) { + // The name section has already named this function. + return Result::Ok; + } + std::string dollar_name = + GetUniqueName(&module_->func_bindings, MakeDollarName(name)); + func->name = dollar_name; + module_->func_bindings.emplace(dollar_name, Binding(func_index)); + return Result::Ok; +} + +Result BinaryReaderIR::OnGlobalSymbol(Index index, uint32_t flags, + string_view name, Index global_index) { + if (name.empty()) { + return Result::Ok; + } + Global* glob = module_->globals[global_index]; + std::string dollar_name = + GetUniqueName(&module_->global_bindings, MakeDollarName(name)); + glob->name = dollar_name; + module_->global_bindings.emplace(dollar_name, Binding(global_index)); + return Result::Ok; +} + +Result BinaryReaderIR::OnSectionSymbol(Index index, uint32_t flags, + Index section_index) { + return Result::Ok; +} + +Result BinaryReaderIR::OnEventSymbol(Index index, uint32_t flags, + string_view name, Index event_index) { + if (name.empty()) { + return Result::Ok; + } + Event* event = module_->events[event_index]; + std::string dollar_name = + GetUniqueName(&module_->event_bindings, MakeDollarName(name)); + event->name = dollar_name; + module_->event_bindings.emplace(dollar_name, Binding(event_index)); + return Result::Ok; +} + } // end anonymous namespace Result ReadBinaryIr(const char* filename, diff --git a/src/decompiler-ast.h b/src/decompiler-ast.h index d7671e43..5d04ddfc 100644 --- a/src/decompiler-ast.h +++ 
b/src/decompiler-ast.h @@ -80,7 +80,7 @@ struct AST { if (f) { mc.BeginFunc(*f); for (Index i = 0; i < f->GetNumParams(); i++) { - auto name = IndexToAlphaName(i); + auto name = "$" + IndexToAlphaName(i); vars_defined.insert(name); } } diff --git a/src/decompiler-naming.h b/src/decompiler-naming.h index 41e0e3cb..8aefcfd6 100644 --- a/src/decompiler-naming.h +++ b/src/decompiler-naming.h @@ -119,6 +119,11 @@ enum { void RenameToContents(std::vector<DataSegment*>& segs, BindingHash& bh) { std::string s; for (auto seg : segs) { + if (seg->name.substr(0, 2) != "d_") { + // This segment was named explicitly by a symbol. + // FIXME: this is not a great check, a symbol could start with d_. + continue; + } s = "d_"; for (auto c : seg->data) { if (isalnum(c) || c == '_') { @@ -183,10 +188,19 @@ void RenameAll(Module& module) { { "4096ul" }, }; RenameToIdentifiers(module.funcs, module.func_bindings, &filter); - // Also do this for some other kinds of names. + // Also do this for some other kinds of names, but without the keyword + // substitution. RenameToIdentifiers(module.globals, module.global_bindings, nullptr); RenameToIdentifiers(module.tables, module.table_bindings, nullptr); - + RenameToIdentifiers(module.events, module.event_bindings, nullptr); + RenameToIdentifiers(module.exports, module.export_bindings, nullptr); + RenameToIdentifiers(module.func_types, module.func_type_bindings, nullptr); + RenameToIdentifiers(module.memories, module.memory_bindings, nullptr); + RenameToIdentifiers(module.data_segments, module.data_segment_bindings, + nullptr); + RenameToIdentifiers(module.elem_segments, module.elem_segment_bindings, + nullptr); + // Special purpose naming for data segments. 
RenameToContents(module.data_segments, module.data_segment_bindings); } diff --git a/src/decompiler.cc b/src/decompiler.cc index a0f710c4..92a33780 100644 --- a/src/decompiler.cc +++ b/src/decompiler.cc @@ -204,12 +204,17 @@ struct Decompiler { } } + string_view VarName(string_view name) { + assert(!name.empty()); + return name[0] == '$' ? name.substr(1) : name; + } + template<ExprType T> Value Get(const VarExpr<T>& ve) { - return Value{{ve.var.name()}, Precedence::Atomic}; + return Value{{std::string(VarName(ve.var.name()))}, Precedence::Atomic}; } template<ExprType T> Value Set(Value& child, const VarExpr<T>& ve) { - return WrapChild(child, ve.var.name() + " = ", "", Precedence::Assign); + return WrapChild(child, VarName(ve.var.name()) + " = ", "", Precedence::Assign); } std::string TempVarName(Index n) { @@ -221,7 +226,8 @@ struct Decompiler { std::string LocalDecl(const std::string& name, Type t) { auto struc = lst.GenTypeDecl(name); - return cat(name, ":", struc.empty() ? GetDecompTypeName(t) : struc); + return cat(VarName(name), ":", + struc.empty() ? 
GetDecompTypeName(t) : struc); } bool ConstIntVal(const Expr* e, uint64_t &dest) { @@ -360,7 +366,7 @@ struct Decompiler { } case NodeType::Decl: { return Value{ - {"var " + LocalDecl(n.u.var->name(), + {"var " + LocalDecl(std::string(n.u.var->name()), cur_func->GetLocalType(*n.u.var))}, Precedence::None}; } @@ -368,7 +374,8 @@ struct Decompiler { return WrapChild( args[0], cat("var ", - LocalDecl(n.u.var->name(), cur_func->GetLocalType(*n.u.var)), + LocalDecl(std::string(n.u.var->name()), + cur_func->GetLocalType(*n.u.var)), " = "), "", Precedence::None); } @@ -502,7 +509,7 @@ struct Decompiler { case ExprType::Block: { auto& val = args[0]; val.v.push_back( - cat("label ", cast<BlockExpr>(n.e)->block.label, ":")); + cat("label ", VarName(cast<BlockExpr>(n.e)->block.label), ":")); // If this block is part of a larger statement scope, it doesn't // need its own indenting, but if its part of an exp we wrap it in {}. if (parent && parent->ntype != NodeType::Statements @@ -521,7 +528,7 @@ struct Decompiler { auto& val = args[0]; auto& block = cast<LoopExpr>(n.e)->block; IndentValue(val, indent_amount, {}); - val.v.insert(val.v.begin(), cat("loop ", block.label, " {")); + val.v.insert(val.v.begin(), cat("loop ", VarName(block.label), " {")); val.v.push_back("}"); val.precedence = Precedence::Atomic; return std::move(val); @@ -529,13 +536,14 @@ struct Decompiler { case ExprType::Br: { auto be = cast<BrExpr>(n.e); return Value{{(n.u.lt == LabelType::Loop ? "continue " : "goto ") + - be->var.name()}, + VarName(be->var.name())}, Precedence::None}; } case ExprType::BrIf: { auto bie = cast<BrIfExpr>(n.e); auto jmp = n.u.lt == LabelType::Loop ? 
"continue" : "goto"; - return WrapChild(args[0], "if (", cat(") ", jmp, " ", bie->var.name()), + return WrapChild(args[0], "if (", cat(") ", jmp, " ", + VarName(bie->var.name())), Precedence::None); } case ExprType::Return: { @@ -562,11 +570,11 @@ struct Decompiler { auto bte = cast<BrTableExpr>(n.e); std::string ts = "br_table["; for (auto &v : bte->targets) { - ts += v.name(); + ts += VarName(v.name()); ts += ", "; } ts += ".."; - ts += bte->default_target.name(); + ts += VarName(bte->default_target.name()); ts += "]("; return WrapChild(args[0], ts, ")", Precedence::Atomic); } @@ -758,7 +766,7 @@ struct Decompiler { if (i) s += ", "; auto t = f->GetParamType(i); - auto name = IndexToAlphaName(i); + auto name = "$" + IndexToAlphaName(i); s += LocalDecl(name, t); } s += ")"; diff --git a/src/generate-names.cc b/src/generate-names.cc index 0e140f5f..e615d471 100644 --- a/src/generate-names.cc +++ b/src/generate-names.cc @@ -120,8 +120,7 @@ void NameGenerator::GenerateName(const char* prefix, Index index, unsigned disambiguator, std::string* str) { - str->clear(); - if (!(opts_ & NameOpts::NoDollar)) *str = "$"; + *str = "$"; *str += prefix; if (index != kInvalidIndex) { if (opts_ & NameOpts::AlphaNames) { diff --git a/src/generate-names.h b/src/generate-names.h index 58214c61..9cd926e4 100644 --- a/src/generate-names.h +++ b/src/generate-names.h @@ -26,7 +26,6 @@ struct Module; enum NameOpts { None = 0, AlphaNames = 1 << 0, - NoDollar = 1 << 1, }; Result GenerateNames(struct Module*, NameOpts opts = NameOpts::None); diff --git a/src/tools/wasm-decompile.cc b/src/tools/wasm-decompile.cc index 6ce2789e..62e69d45 100644 --- a/src/tools/wasm-decompile.cc +++ b/src/tools/wasm-decompile.cc @@ -88,8 +88,7 @@ int ProgramMain(int argc, char** argv) { result = ValidateModule(&module, &errors, options); } result = GenerateNames(&module, - static_cast<NameOpts>(NameOpts::AlphaNames | - NameOpts::NoDollar)); + static_cast<NameOpts>(NameOpts::AlphaNames)); // Must be called after 
ReadBinaryIr & GenerateNames, and before // ApplyNames, see comments at definition. RenameAll(module); diff --git a/src/tools/wasm2wat.cc b/src/tools/wasm2wat.cc index 8167c168..63ce282c 100644 --- a/src/tools/wasm2wat.cc +++ b/src/tools/wasm2wat.cc @@ -39,7 +39,7 @@ static std::string s_infile; static std::string s_outfile; static Features s_features; static WriteWatOptions s_write_wat_options; -static bool s_generate_names; +static bool s_generate_names = false; static bool s_read_debug_names = true; static bool s_fail_on_custom_section_error = true; static std::unique_ptr<FileStream> s_log_stream; diff --git a/test/README.md b/test/README.md index c93bc26a..669fd28d 100644 --- a/test/README.md +++ b/test/README.md @@ -140,7 +140,9 @@ The currently supported list of tools (see binary files), then parse via `wasm2wat` and display the result - `run-gen-wasm-interp`: parse a "gen-wasm" text file, generate a wasm file, the run `wasm-interp` on it, which runes all exported functions in an - interpreter + interpreter. +- `run-gen-wasm-decompile`: parse a "gen-wasm" text file (which can describe + invalid binary files), then parse via `wasm-decompile` and display the result. - `run-opcodecnt`: parse a wasm text file, convert it to binary, then display opcode usage counts. - `run-gen-spec-js`: parse wasm spec test text file, convert it to a JSON file @@ -149,6 +151,7 @@ The currently supported list of tools (see - `run-spec-wasm2c`: similar to `run-gen-spec-js`, but the output instead will be C source files, that are then compiled with the default C compiler (`cc`). Finally, the native executable is run. +- `run-wasm-decompile`: parse wat with `wat2wasm` then `wasm-decompile`. 
## Test subdirectories diff --git a/test/binary/names.txt b/test/binary/names.txt index bc57f4a8..d5561072 100644 --- a/test/binary/names.txt +++ b/test/binary/names.txt @@ -1,26 +1,76 @@ ;;; TOOL: run-gen-wasm +;;; ARGS2: --generate-names +;; NOTE: same test as in test/decompile/names.txt magic version -section(TYPE) { count[1] function params[0] results[1] i32 } -section(FUNCTION) { count[1] type[0] } -section(CODE) { +section(TYPE) { + count[2] + function params[0] results[1] i32 + function params[0] results[0] +} +section(FUNCTION) { + count[4] + type[0] + type[1] + type[1] + type[1] +} +section(MEMORY) { count[1] + has_max[0] + initial[0] +} +section(GLOBAL) { + count[2] + ;; This has both a sym and export name, prefer sym. + type[i32] mut[0] init_expr[i32.const 0 end] + ;; This only has an export name. + type[i32] mut[0] init_expr[i32.const 0 end] +} +section(EXPORT) { + count[5] + str("F1_EXPORT") func_kind func[1] + str("F2_EXPORT") func_kind func[2] + str("F3_EXPORT") func_kind func[3] + str("G0_EXPORT") global_kind global[0] + str("G1_EXPORT") global_kind global[1] +} +section(CODE) { + count[4] + ;; Test name section. func { locals[decl_count[1] i32_count[1] i32] get_local 0 } + ;; Test naming priorities + ;; If there's a name section name, prefer that over sym/export. + func { locals[0] } + ;; If there's no name section name, prefer sym over export. + func { locals[0] } + ;; If there's only export, use that. + func { locals[0] } +} +section(DATA) { + count[2] + ;; These can only be named thru symbols. 
+ memory_index[0] + offset[i32.const 0 end] + data[str("foo")] + memory_index[0] + offset[i32.const 10 end] + data[str("bar")] } section("name") { section(NAME_MODULE) { str("M0") } - section(NAME_FUNCTION) { - func_count[1] + func_count[2] index[0] str("F0") + index[1] + str("F1_NS") } - section(NAME_LOCALS) { func_count[1] index[0] @@ -29,10 +79,36 @@ section("name") { str("L0") } } +section("linking") { + metadata_version[2] + section(LINKING_SYMBOL_TABLE) { + num_symbols[5] + type[0] flags[1] index[1] str("F1_SYM") + type[0] flags[1] index[2] str("F2_SYM") + type[2] flags[1] index[0] str("G0_SYM") + + type[1] flags[4] str("D0_SYM") segment[0] offset[0] size[1] + type[1] flags[4] str("D1_SYM") segment[1] offset[0] size[1] + } +} (;; STDOUT ;;; (module $M0 - (type (;0;) (func (result i32))) - (func $F0 (type 0) (result i32) + (type $t0 (func (result i32))) + (type $t1 (func)) + (func $F0 (type $t0) (result i32) (local $L0 i32) - local.get $L0)) + local.get $L0) + (func $F1_NS (type $t1)) + (func $F2_SYM (type $t1)) + (func $F3_EXPORT (type $t1)) + (memory $M0 0) + (global $G0_SYM i32 (i32.const 0)) + (global $G1_EXPORT i32 (i32.const 0)) + (export "F1_EXPORT" (func $F1_NS)) + (export "F2_EXPORT" (func $F2_SYM)) + (export "F3_EXPORT" (func $F3_EXPORT)) + (export "G0_EXPORT" (global 0)) + (export "G1_EXPORT" (global 1)) + (data $D0_SYM (i32.const 0) "foo") + (data $D1_SYM (i32.const 10) "bar")) ;;; STDOUT ;;) diff --git a/test/decompile/names.txt b/test/decompile/names.txt new file mode 100644 index 00000000..f390b891 --- /dev/null +++ b/test/decompile/names.txt @@ -0,0 +1,116 @@ +;;; TOOL: run-gen-wasm-decompile +;; NOTE: same test as in test/binary/names.txt +magic +version +section(TYPE) { + count[2] + function params[0] results[1] i32 + function params[0] results[0] +} +section(FUNCTION) { + count[4] + type[0] + type[1] + type[1] + type[1] +} +section(MEMORY) { + count[1] + has_max[0] + initial[0] +} +section(GLOBAL) { + count[2] + ;; This has both a sym and 
export name, prefer sym. + type[i32] mut[0] init_expr[i32.const 0 end] + ;; This only has an export name. + type[i32] mut[0] init_expr[i32.const 0 end] +} +section(EXPORT) { + count[5] + str("F1_EXPORT") func_kind func[1] + str("F2_EXPORT") func_kind func[2] + str("F3_EXPORT") func_kind func[3] + str("G0_EXPORT") global_kind global[0] + str("G1_EXPORT") global_kind global[1] +} +section(CODE) { + count[4] + ;; Test name section. + func { + locals[decl_count[1] i32_count[1] i32] + get_local 0 + } + ;; Test naming priorities + ;; If there's a name section name, prefer that over sym/export. + func { locals[0] } + ;; If there's no name section name, prefer sym over export. + func { locals[0] } + ;; If there's only export, use that. + func { locals[0] } +} +section(DATA) { + count[2] + ;; These can only be named thru symbols. + memory_index[0] + offset[i32.const 0 end] + data[str("Hello, World!")] + memory_index[0] + offset[i32.const 10 end] + data[str("bar")] +} +section("name") { + section(NAME_MODULE) { + str("M0") + } + section(NAME_FUNCTION) { + func_count[2] + index[0] + str("F0") + index[1] + str("F1_NS") + } + section(NAME_LOCALS) { + func_count[1] + index[0] + local_count[1] + index[0] + str("L0") + } +} +section("linking") { + metadata_version[2] + section(LINKING_SYMBOL_TABLE) { + num_symbols[5] + type[0] flags[1] index[1] str("F1_SYM") + type[0] flags[1] index[2] str("F2_SYM") + type[2] flags[1] index[0] str("G0_SYM") + + type[1] flags[4] str("D0_SYM") segment[0] offset[0] size[1] + type[1] flags[4] str("D1_SYM") segment[1] offset[0] size[1] + } +} +(;; STDOUT ;;; +memory M_a(initial: 0, max: 0); + +global G0_SYM:int = 0; +export global G1_EXPORT:int = 0; + +data D0_SYM(offset: 0) = "Hello, World!"; +data D1_SYM(offset: 10) = "bar"; + +function F0():int { + var L0:int; + return L0; +} + +function F1_NS() { +} + +function F2_SYM() { +} + +export function F3_EXPORT() { +} + +;;; STDOUT ;;) diff --git a/test/run-tests.py b/test/run-tests.py index 
0a452025..89094d9c 100755 --- a/test/run-tests.py +++ b/test/run-tests.py @@ -119,6 +119,12 @@ TOOLS = { ('RUN', '%(wasm-objdump)s -h %(temp_file)s.wasm'), ('VERBOSE-ARGS', ['--print-cmd', '-v']), ], + 'run-gen-wasm-decompile': [ + ('RUN', '%(gen_wasm_py)s %(in_file)s -o %(temp_file)s.wasm'), + ('RUN', '%(wasm-validate)s %(temp_file)s.wasm'), + ('RUN', '%(wasm-decompile)s %(temp_file)s.wasm'), + ('VERBOSE-ARGS', ['--print-cmd', '-v']), + ], 'run-opcodecnt': [ ('RUN', '%(wat2wasm)s %(in_file)s -o %(temp_file)s.wasm'), ('RUN', '%(wasm-opcodecnt)s %(temp_file)s.wasm'), |