diff --git a/emsymbolizer.py b/emsymbolizer.py index 398d36b6bbd8c..cf6a2ed7ae76d 100755 --- a/emsymbolizer.py +++ b/emsymbolizer.py @@ -14,11 +14,14 @@ import argparse import json import os +import numbers +import pickle import re import subprocess import sys from tools import shared from tools import webassembly +from tools import wasm_offset_converter LLVM_SYMBOLIZER = os.path.expanduser( shared.build_llvm_tool_path(shared.exe_suffix('llvm-symbolizer'))) @@ -41,6 +44,11 @@ def print(self): func = self.func if self.func else '??' print(f'{func}\n{source}:{self.line}:{self.column}') + def getAsJson(self): + source = self.source if self.source else '??' + func = self.func if self.func else '??' + return {'source': source, 'func': func, 'line': self.line, 'column':self.column} + def get_codesec_offset(module): sec = module.get_section(webassembly.SecType.CODE) @@ -94,12 +102,18 @@ def __init__(self, source=None, line=0, column=0, func=None): self.line = line self.column = column self.func = func + def __repr__(self) -> str: + return "Location(source={0},line={1},column={2},func={3})".format( + self.source, self.line, self.column, self.func + ) - def __init__(self): + def __init__(self, offsetConverter): + self.offsetConverter = offsetConverter self.version = None self.sources = [] self.mappings = {} self.offsets = [] + self.names = {} def parse(self, filename): with open(filename) as f: @@ -109,6 +123,7 @@ def parse(self, filename): self.version = source_map_json['version'] self.sources = source_map_json['sources'] + self.names = source_map_json['names'] vlq_map = {} chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=' @@ -136,10 +151,13 @@ def decodeVLQ(string): offset = 0 src = 0 + name = 0 line = 1 col = 1 for segment in source_map_json['mappings'].split(','): data = decodeVLQ(segment) + if shared.DEBUG: + print("\n data: {} \n".format(data)) info = [] offset += data[0] @@ -152,7 +170,12 @@ def decodeVLQ(string): if len(data) >= 4: col += 
data[3] info.append(col) - # TODO: see if we need the name, which is the next field (data[4]) + # if len(data) >= 5: + # print("Name exists!!") + # name += data[4] + # info.append(name) + + info.append(self.offsetConverter.getName(offset)) self.mappings[offset] = WasmSourceMap.Location(*info) self.offsets.append(offset) @@ -169,20 +192,28 @@ def find_offset(self, offset): hi = mid else: lo = mid + 1 + if lo >= len(self.offsets): + print("--->find_offset: offset_len: {0}, ret_lo:{1}".format(len(self.offsets), lo)) return self.offsets[lo - 1] def lookup(self, offset): nearest = self.find_offset(offset) assert nearest in self.mappings, 'Sourcemap has an offset with no mapping' info = self.mappings[nearest] - return LocationInfo( + offset_converter_ret = self.offsetConverter.getName(offset) + locInfo = LocationInfo( self.sources[info.source] if info.source is not None else None, info.line, - info.column + info.column, + offset_converter_ret + # info.func + #self.names[info.func] ) + #print(f'offset: {hex(offset)}, nearest: {hex(nearest)}, info: {info}, offsetConvRet: {offset_converter_ret}') + return locInfo -def symbolize_address_sourcemap(module, address, force_file): +def symbolize_address_sourcemap(module, address, force_file, offsetConverter): URL = force_file if not URL: # If a sourcemap file is not forced, read it from the wasm module @@ -195,7 +226,7 @@ def symbolize_address_sourcemap(module, address, force_file): if shared.DEBUG: print(f'Source Mapping URL: {URL}') - sm = WasmSourceMap() + sm = WasmSourceMap(offsetConverter) sm.parse(URL) if shared.DEBUG: csoff = get_codesec_offset(module) @@ -205,12 +236,182 @@ def symbolize_address_sourcemap(module, address, force_file): print(f'{k-csoff:x}: {v}') sm.lookup(address).print() +def build_address_sourcemap(module, force_file, offsetConverter): + URL = force_file + if not URL: + # If a sourcemap file is not forced, read it from the wasm module + section = get_sourceMappingURL_section(module) + assert section + 
module.seek(section.offset) + assert module.read_string() == 'sourceMappingURL' + # TODO: support stripping/replacing a prefix from the URL + URL = module.read_string() + + if shared.DEBUG: + print(f'Source Mapping URL: {URL}') + sm = WasmSourceMap(offsetConverter) + sm.parse(URL) + if shared.DEBUG: + csoff = get_codesec_offset(module) + print(sm.mappings) + # Print with section offsets to easily compare against dwarf + for k, v in sm.mappings.items(): + print(f'{k-csoff:x}: {v}') + return sm + + +def isJSPC(val): + addr = '{:032b}'.format(val) + return addr[0] == '1' + +def getJSPC(val): + addr = '{:032b}'.format(val) + addr = '0' + addr[1:] + return int(addr, 2) + +def get_location_info_from_line(line, pc_addr): + m = re.search(r"\(https://(.+?)\)", line) + file_name = "??" + line_no = "??" + col_no = "??" + function_name = pc_addr + source_line = None + if m: + source_line = m.group(1) + #print("Found: ", found) + # Exclude the https substring for function-name. + function_name = line.replace(source_line, '') + function_name = function_name.replace("(https://)", '') + # print("Function: ", function_name) + elif "https://" in line: + # Some stacktrace lines are present without any function names. 
+ # e.g https://localhost.corp.adobe.com:3000/libProteusWeb.js:14757:34 + function_name = pc_addr + source_line = line.strip() + + if source_line: + first_slash = source_line.find('/') + file_line_col_substr = source_line[first_slash:] + colon_positions = [pos for pos, char in enumerate(file_line_col_substr) if char == ':'] + file_name = file_line_col_substr[:colon_positions[0]] + line_no = file_line_col_substr[colon_positions[0]+1 : colon_positions[1]] + col_no = file_line_col_substr[colon_positions[1]+1 : ] + #print("Source Info fileName:{0}, line_no: {1}, column_no:{2}".format(file_name, line_no, col_no)) + + return LocationInfo( + file_name, + line_no, + col_no, + function_name + ) + +def convert_pc_file_to_symbol_file(args): + #print("convert_pc_file_to_symbol_file", args) + pc_file = args.address + # removing the new line characters + with open(pc_file) as f: + pcs = [line.rstrip() for line in f] + #print("Number of addresses: {}".format(len(pcs))) + + out_sym_map_file = pc_file + ".symbol_map.json" + #js_pc_map_file = "/Users/gmarella/Documents/SampleAppMemProfiling/JS_PC_CACHE.json" + OUT_DIR = os.path.dirname(pc_file) + js_pc_map_file = os.path.join(OUT_DIR, "VENUS_JS_PC_CACHE.json") + #js_pc_map_file = "/Users/gmarella/Documents/VenusMemProfiling/obj_files/VENUS_JS_PC_CACHE.json" + out_offset_convertet_map_file = pc_file +".offset_map.json" + + with open(js_pc_map_file, "r") as js_pc_file: + JS_PC_MAP = json.load(js_pc_file) + + with webassembly.Module(args.wasm_file) as module: + offsetConverter = wasm_offset_converter.WasmOffsetConverter(args.wasm_file, module) + #offsetConverterMapFile = pc_file + ".offset_converter.json" + #print(f'OffsetConverter Map to {offsetConverterMapFile}') + #offsetConverter.printDetails() + #offsetConverter.dumpLookUpMap(offsetConverterMapFile) + + with open(out_offset_convertet_map_file, "w") as out_offset_file: + json.dump(offsetConverter.name_map, out_offset_file) + #sm = build_address_sourcemap(module, args.file, 
offsetConverter) + pc_info = {} + for pc in pcs: + base = 16 if pc.lower().startswith('0x') else 10 + address_str = pc + address = int(address_str, base) + if isJSPC(address): + address = getJSPC(address) + + pc_addr = str(address) + if pc_addr in JS_PC_MAP: + src = JS_PC_MAP[pc_addr] + locInfo = get_location_info_from_line(src, pc_addr) + else: + locInfo = LocationInfo( + "?", + "?", + "?", + str(address) + ) + #print("JS: address: {0}, source: {1}".format(hex(address), src)) + pc_info[address_str] = locInfo.getAsJson() + else: + symbolized = 0 + + # if args.addrtype == 'code': + # address += get_codesec_offset(module) + + # print(f'has_debug_line_section?: ${has_debug_line_section(module)}') + + if ((has_debug_line_section(module) and not args.source) or + 'dwarf' in args.source): + symbolize_address_dwarf(module, address) + symbolized += 1 + + if ((get_sourceMappingURL_section(module) and not args.source) or + 'sourcemap' in args.source): + #print(sm.lookup(address)) + #addr_info = sm.lookup(address).getAsJson() + addr_info = LocationInfo( + "?", + "?", + "?", + offsetConverter.getName(address) + ) + + # print(hex(address), addr_info) + pc_info[address_str] = addr_info.getAsJson() + symbolized += 1 + # print(f'Trying again for offset {hex(address)}') + # address += get_codesec_offset(module) + # addr_info2 = sm.lookup(address).getAsJson() + # print(f'After offset addition: {hex(address)}, {addr_info2}') + + if not symbolized: + raise Error('No .debug_line or sourceMappingURL section found in ' + f'{module.filename}.' 
+ " I don't know how to symbolize this file yet") + # print("Writing symbols to {}".format(out_sym_map_file)) + with open(out_sym_map_file, 'w') as f: + json.dump(pc_info, f) + #pickle.dump(pc_info, f) + def main(args): + if args.addrfile == 'file': + convert_pc_file_to_symbol_file(args) + return + with webassembly.Module(args.wasm_file) as module: base = 16 if args.address.lower().startswith('0x') else 10 - address = int(args.address, base) + address_str = args.address + if address_str[-8] == "8": + # TODO:Gopi; ust dumping random name for JS symbol, fix this properly. + print(f'anonymous_js_function\n??:??:??') + address_str = address_str[:-8] + '0' + address_str[-7:] + return + address = int(address_str, base) symbolized = 0 + offsetConverter = wasm_offset_converter.WasmOffsetConverter(args.wasm_file, module) if args.addrtype == 'code': address += get_codesec_offset(module) @@ -222,7 +423,7 @@ def main(args): if ((get_sourceMappingURL_section(module) and not args.source) or 'sourcemap' in args.source): - symbolize_address_sourcemap(module, address, args.file) + symbolize_address_sourcemap(module, address, args.file, offsetConverter) symbolized += 1 if not symbolized: @@ -243,7 +444,15 @@ def get_args(): parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose info for debugging this script') parser.add_argument('wasm_file', help='Wasm file') - parser.add_argument('address', help='Address to lookup') + # addr_group = parser.add_mutually_exclusive_group() + # addr_group.add_argument('address', help='Address to lookup', nargs='?') + # addr_group.add_argument('address-file', help='Address File', nargs='?') + parser.add_argument('address', help='Address to lookup or File containing addresses') + + parser.add_argument('-x', '--addrfile', choices=['code', 'file'], + default='code', + help='address is file or hexcode') + args = parser.parse_args() if args.verbose: shared.PRINT_SUBPROCS = 1 diff --git a/symbolizer_offset_converter.py 
# Definitions of symbolizer_offset_converter.py: resolve a file of PC addresses
# to symbol names. Wasm PCs are named through WasmOffsetConverter; JS PCs are
# looked up in the VENUS_JS_PC_CACHE.json file that sits next to the PC file.
#
# Relies on the file's module-level imports: argparse, json, os, re,
# subprocess, sys, time, and `shared`, `webassembly`, `wasm_offset_converter`
# from `tools`.

# Bit 31 flags an address as a JS program counter. The original tested the
# first character of a 32-digit binary string, which is the same bit for any
# value that fits in 32 bits.
_JS_PC_FLAG = 1 << 31

# "(https://host:port/path:line:col)" as it appears in a JS stack-trace line.
_STACK_URL_RE = re.compile(r"\(https://(.+?)\)")
_URL_SCHEME = "https://"

# Separates the PC from the function name in the files handed to llvm-cxxfilt.
_PC_FILENAME_SEPARATOR = " #### "


def run_command(command, cmd_stdin=None, cmd_stdout=None):
  """Run `command`, print how long it took, and exit the process on failure.

  Args:
    command: argument list passed to `subprocess.run` (no shell involved).
    cmd_stdin: optional file object connected to the child's stdin.
    cmd_stdout: optional file object receiving the child's stdout.

  Exits with status 1 (via `sys.exit`) when the command returns non-zero.
  """
  start_time = time.time()
  result = subprocess.run(command, stdin=cmd_stdin, stdout=cmd_stdout)
  elapsed_time = "{:.2f}".format(time.time() - start_time)
  print(f"############# \033[95m[TIMING]: {elapsed_time} seconds for Command {command}\033[0m")
  if result.returncode != 0:
    print(f"Command failed: {command}")
    sys.exit(1)


class Error(Exception):
  """Fatal error of this tool; caught and reported by the script entry point.

  Derives from `Exception` rather than `BaseException` so that it is an
  ordinary error and not in the same family as KeyboardInterrupt/SystemExit.
  """


# Class to treat location info in a uniform way across information sources.
class LocationInfo:
  """Source location (file, line, column) plus the enclosing function name."""

  def __init__(self, source=None, line=0, column=0, func=None):
    self.source = source
    self.line = line
    self.column = column
    self.func = func

  def print(self):
    """Print the function name, then `source:line:column`, using '??' for gaps."""
    source = self.source if self.source else '??'
    func = self.func if self.func else '??'
    print(f'{func}\n{source}:{self.line}:{self.column}')

  def get_as_json(self):
    """Return the location as a JSON-serializable dict ('??' for missing names)."""
    source = self.source if self.source else '??'
    func = self.func if self.func else '??'
    return {'source': source, 'func': func, 'line': self.line, 'column': self.column}


def get_codesec_offset(module):
  """Return the offset of the code section of `module`.

  Raises:
    Error: if the module has no code section.
  """
  sec = module.get_section(webassembly.SecType.CODE)
  if not sec:
    raise Error(f'No code section found in {module.filename}')
  return sec.offset


def has_debug_line_section(module):
  """Return True if `module` has a section named `.debug_line`."""
  return any(sec.name == ".debug_line" for sec in module.sections())


def is_js_pc(val):
  """Return True if the address `val` carries the JS-PC flag (bit 31).

  `val` is expected to be a non-negative integer.
  """
  return bool(val & _JS_PC_FLAG)


def get_js_pc(val):
  """Return `val` with the JS-PC flag (bit 31) cleared."""
  return val & ~_JS_PC_FLAG


def get_location_info_from_line(line, pc_addr):
  """Parse one JS stack-trace line into a LocationInfo.

  Two shapes are recognized:
    `<function text> (https://host[:port]/path:line:col)`
    `https://host[:port]/path:line:col`   (no function name)

  Args:
    line: the stack-trace text.
    pc_addr: used as the function name when the line does not provide one.

  Returns:
    LocationInfo whose source is the URL path (from the first '/' after the
    host) and whose line/column are the last two ':'-separated fields, kept as
    strings. Fields that cannot be parsed are "??".
  """
  file_name = line_no = col_no = "??"
  function_name = pc_addr
  location = None

  match = _STACK_URL_RE.search(line)
  if match:
    location = match.group(1)
    # Whatever is left after removing the URL is the function text. Strip it
    # so stray whitespace/newlines cannot corrupt the line-based demangler
    # files, and fall back to the PC when nothing is left.
    function_name = line.replace(location, '').replace("(https://)", '').strip() or pc_addr
  elif _URL_SCHEME in line:
    # Some stacktrace lines are present without any function names.
    # e.g https://localhost.corp.adobe.com:3000/libProteusWeb.js:14757:34
    # Drop the scheme first: otherwise the first '/' found below is the one
    # inside "https://" and host/port end up in the file/line/column fields.
    location = line.split(_URL_SCHEME, 1)[1].strip()

  if location:
    path_start = location.find('/')
    if path_start != -1:
      # Split from the right so the host's port colon (already cut off) or a
      # colon inside the path cannot shift the line/column fields.
      parts = location[path_start:].rsplit(':', 2)
      if len(parts) == 3:
        file_name, line_no, col_no = parts

  return LocationInfo(file_name, line_no, col_no, function_name)


def demangle_function_names(pc_info, out_dir, args):
  """Replace every `func` entry of `pc_info` with its demangled form.

  The names are written to `orig_function_names.txt` in `out_dir`, piped
  through `llvm-cxxfilt -n` from `args.llvm_bin_path`, and read back from
  `demangled_function_names.txt`.

  Args:
    pc_info: dict mapping a PC string to a dict with a 'func' key; updated
      in place.
    out_dir: directory that receives the two intermediate text files.
    args: parsed arguments; only `llvm_bin_path` is read.

  Returns:
    The same `pc_info` dict.
  """
  orig_function_names_file = os.path.join(out_dir, "orig_function_names.txt")
  demangled_function_names_file = os.path.join(out_dir, "demangled_function_names.txt")
  demangle_script = os.path.join(args.llvm_bin_path, "llvm-cxxfilt")

  # One "<pc> #### <name>" line per entry.
  with open(orig_function_names_file, "w") as f:
    f.writelines(f"{pc}{_PC_FILENAME_SEPARATOR}{info['func']}\n"
                 for pc, info in pc_info.items())

  # Both handles are closed by the `with`, including the one used as stdin.
  with open(orig_function_names_file) as mangled, \
       open(demangled_function_names_file, "w") as demangled:
    run_command([demangle_script, "-n"], cmd_stdin=mangled, cmd_stdout=demangled)

  demangled_names = {}
  with open(demangled_function_names_file, "r") as f:
    for line in f:
      # Split on the first separator only; skip lines that do not have one.
      pc, sep, demangled_func_name = line.partition(_PC_FILENAME_SEPARATOR)
      if sep:
        demangled_names[pc.strip()] = demangled_func_name.strip()

  # Keep the original name when the demangler output has no entry for a PC.
  for pc, info in pc_info.items():
    info['func'] = demangled_names.get(pc, info['func'])
  return pc_info


def convert_pc_file_to_symbol_file(args):
  """Symbolize every address listed in `args.pc_file`.

  Reads one address per line (hex with a `0x` prefix, otherwise decimal) and
  writes, next to the PC file:
    `<pc_file>.offset_map.json`  - the converter's function-name map
    `<pc_file>.symbol_map.json`  - address string -> location dict

  JS PCs are resolved through `VENUS_JS_PC_CACHE.json` in the PC file's
  directory (keyed by the decimal address with the JS flag cleared); all other
  addresses are named by the wasm offset converter.
  """
  pc_file = args.pc_file
  with open(pc_file) as f:
    # Blank lines carry no address and would make int() fail below.
    pcs = [line.strip() for line in f if line.strip()]

  out_sym_map_file = pc_file + ".symbol_map.json"
  print("Writing symbols to {}".format(out_sym_map_file))
  out_dir = os.path.dirname(pc_file)
  js_pc_map_file = os.path.join(out_dir, "VENUS_JS_PC_CACHE.json")
  out_offset_converter_map_file = pc_file + ".offset_map.json"

  with open(js_pc_map_file, "r") as js_pc_file:
    js_pc_map = json.load(js_pc_file)

  with webassembly.Module(args.wasm_file) as module:
    offset_converter = wasm_offset_converter.WasmOffsetConverter(args.wasm_file, module)

  with open(out_offset_converter_map_file, "w") as out_offset_file:
    json.dump(offset_converter.name_map, out_offset_file)

  pc_info = {}
  for address_str in pcs:
    base = 16 if address_str.lower().startswith('0x') else 10
    address = int(address_str, base)
    if is_js_pc(address):
      pc_addr = str(get_js_pc(address))
      if pc_addr in js_pc_map:
        loc_info = get_location_info_from_line(js_pc_map[pc_addr], pc_addr)
      else:
        loc_info = LocationInfo("?", "?", "?", pc_addr)
    else:
      # TODO: If we need line number info, and we can either go with DWARF or sourcemap
      loc_info = LocationInfo("?", "?", "?", offset_converter.getName(address))
    pc_info[address_str] = loc_info.get_as_json()

  # Demangle the C++ function names.
  pc_info = demangle_function_names(pc_info, out_dir, args)
  # Write the updated symbol map to the output file.
  with open(out_sym_map_file, 'w') as f:
    json.dump(pc_info, f)


def main(args):
  """Entry point: convert the PC file named in `args`; returns exit status 0."""
  convert_pc_file_to_symbol_file(args)
  return 0


def get_args():
  """Parse the command line; `--verbose` also turns on the shared debug flags."""
  parser = argparse.ArgumentParser()
  parser.add_argument('-v', '--verbose', action='store_true',
                      help='Print verbose info for debugging this script')
  parser.add_argument('wasm_file', help='Wasm file')
  parser.add_argument('pc_file', help='File containing list of PC addresses')
  # Add argument for LLVM bin path with "~/.venus_emscripten/upstream/bin" as default value.
  parser.add_argument('--llvm_bin_path', default=os.path.expanduser("~/.venus_emscripten/upstream/bin"),
                      help='Path to LLVM bin directory')

  args = parser.parse_args()
  if args.verbose:
    shared.PRINT_SUBPROCS = 1
    shared.DEBUG = True
  return args


if __name__ == '__main__':
  try:
    rv = main(get_args())
  except (Error, webassembly.InvalidWasmError, OSError) as e:
    print(f'{sys.argv[0]}: {str(e)}', file=sys.stderr)
    rv = 1
  sys.exit(rv)
class WasmOffsetConverter:
    """Map byte offsets in a wasm binary to function indices and names.

    Python version of wasm_offset_converter.js. Unlike the .js file, the
    function-name map is not built here: it comes from the module object's
    `get_function_names()` (added to webassembly.py).

    Attributes:
        offset_map: function index -> byte offset of that function's body.
        func_starts: the same body offsets in ascending order.
        import_functions: number of imported functions; they occupy the
            lowest function indices and have no body in the code section.
        code_end: byte offset just past the code section (0 if none found).
        name_map: function index -> name.
    """

    def __init__(self, wasmFile, wasmModule):
        """Read `wasmFile` and index the function bodies of its code section.

        Args:
            wasmFile: path of the wasm binary.
            wasmModule: object providing `get_function_names()`.
        """
        # map from function index to byte offset in WASM binary
        self.offset_map = {}
        self.func_starts = []
        # number of imported functions this module has
        self.import_functions = 0
        self.code_end = 0
        with open(wasmFile, "rb") as f_h:
            self.buffer = bytearray(f_h.read())
        self.wasmModule = wasmModule
        # map from function index to names in WASM binary. The provider
        # returns an empty list when there is no name section; normalize to a
        # dict so every consumer can rely on the mapping interface.
        names = self.wasmModule.get_function_names()
        self.name_map = names if names else {}
        self.parse_buffer()

    def parse_buffer(self):
        """Walk the sections of `self.buffer` up to and including the code section.

        Counts the imported functions (import section, id 2) and records the
        body offset of every defined function (code section, id 10).

        Raises:
            Exception: on an import entry of unknown kind.
        """
        # the buffer unsignedLEB128 will read from.
        buffer = self.buffer
        offset = 8  # section data starts after the 8-byte module preamble
        funcidx = 0

        def unsignedLEB128():
            # consumes an unsigned LEB128 integer starting at `offset`.
            # changes `offset` to immediately after the integer
            nonlocal offset
            result = 0
            shift = 0
            while True:
                byte = buffer[offset]
                offset += 1
                result |= (byte & 0x7F) << shift
                shift += 7
                if not byte & 0x80:
                    return result

        def skipBytes(count):
            nonlocal offset
            offset += count

        def skipName():
            # a name is a LEB128 length followed by that many bytes. Read the
            # length first: it moves `offset` past the length field.
            length = unsignedLEB128()
            skipBytes(length)

        def skipLimits():
            flags = unsignedLEB128()
            unsignedLEB128()  # initial size
            if flags & 1:  # a maximum is present
                unsignedLEB128()

        while offset < len(buffer):
            section_id = buffer[offset]
            offset += 1
            section_size = unsignedLEB128()
            end = offset + section_size
            if section_id == 2:  # import section
                for _ in range(unsignedLEB128()):
                    skipName()  # module name
                    skipName()  # field name
                    kind = buffer[offset]
                    offset += 1
                    if kind == 0:  # function: type index
                        funcidx += 1
                        unsignedLEB128()
                    elif kind == 1:  # table: element type, limits
                        unsignedLEB128()
                        skipLimits()
                    elif kind == 2:  # memory: limits
                        skipLimits()
                    elif kind == 3:  # global: two bytes (presumably value type + mutability)
                        offset += 2
                    elif kind == 4:  # tag: one byte, then a LEB128 index
                        offset += 1
                        unsignedLEB128()
                    else:
                        raise Exception('bad import kind: ' + str(kind))
                self.import_functions = funcidx
            elif section_id == 10:  # code section
                for _ in range(unsignedLEB128()):
                    size = unsignedLEB128()
                    # `offset` now points at the body, just past its size field.
                    self.offset_map[funcidx] = offset
                    self.func_starts.append(offset)
                    funcidx += 1
                    offset += size
                self.code_end = end
                break  # nothing after the code section is needed
            offset = end

    def convert(self, funcidx, offset):
        """Return the file offset for `offset` bytes into function `funcidx`."""
        return self.offset_map[funcidx] + offset

    def getIndex(self, offset):
        """Return the index of the function whose body contains `offset`.

        Returns -1 when `offset` lies before the first function body or at or
        past the end of the code section. Offsets inside the last function
        resolve to that function; previously they were reported as -1.
        """
        from bisect import bisect_right

        starts = self.func_starts
        if not starts or offset < starts[0] or offset >= self.code_end:
            return -1
        # bisect_right counts the bodies starting at or before `offset`; the
        # last of them is the containing function.
        return bisect_right(starts, offset) + self.import_functions - 1

    def isSameFunc(self, offset1, offset2):
        """Return True if both offsets resolve to the same function index."""
        return self.getIndex(offset1) == self.getIndex(offset2)

    def printLookupMap(self):
        """Print the function index -> name map."""
        print("wasmOffsetConverter lookup map\n")
        print(self.name_map)

    def printDetails(self):
        """Print the sizes of the name map and offset map and the import count."""
        print("wasmOffsetConverter details\n")
        print(f'NameMap entries: {len(self.name_map)}, ImportedFunctions: {self.import_functions}, OffsetMap: {len(self.offset_map)}')

    def dumpLookUpMap(self, filePath):
        """Write the function index -> name map to `filePath` as JSON."""
        with open(filePath, 'w') as fileHandle:
            json.dump(self.name_map, fileHandle)

    def lookupIndexFromName(self, fname):
        """Find the first function whose name contains `fname`.

        Returns:
            `[index, name, hex_offset]`; `hex_offset` is None for a function
            without a recorded body (e.g. an import). `[None, None, None]` when
            no name matches.
        """
        for key, value in self.name_map.items():
            if fname in value:
                start = self.offset_map.get(key)
                return [key, value, hex(start) if start is not None else None]
        return [None, None, None]

    def getName(self, offset):
        """Return the name of the function containing `offset`.

        Falls back to `wasm-function[<index>]` when the index has no name.
        """
        index = self.getIndex(offset)
        lookup_map = self.name_map
        return lookup_map[index] if index in lookup_map else ('wasm-function[' + str(index) + ']')
self.read_uleb() + if subsection_type != 1: + self.seek(self.tell() + subsection_size) + continue + count = self.read_uleb() + while count > 0: + index = self.read_uleb() + name = self.read_string() + name_map[index] = name + count -= 1 + return name_map def get_section(self, section_code): return next((s for s in self.sections() if s.type == section_code), None)