"""Gradio demo: compile C source and compare the result to target bytes.

The user supplies the raw bytes of a target function (as hex) together with
candidate C source.  The source is compiled to an object file, its text
sections are extracted, and the bytes are compared to the target with an
edit distance, both plainly and while treating relocated bytes as wildcards.
"""

import contextlib
import json
import os
import shlex
import subprocess
import tempfile

import editdistance
import frontmatter
import gradio as gr
from hexdump2 import hexdump

from dist import levenshtein_with_wildcard, print_match_summary

# Markdown body of the README, shown as the description of the web UI.
description = frontmatter.load("README.md").content

# Number of leading lines dropped from objdump output by trim_objdump().
_OBJDUMP_HEADER_LINES = 7

# Width in bytes of the field patched by each supported relocation type.
_RELOC_SIZES = {
    "R_X86_64_64": 8,
    "R_X86_64_PC64": 8,
    "R_X86_64_32": 4,
    "R_X86_64_32S": 4,
    "R_X86_64_PC32": 4,
    "R_X86_64_PLT32": 4,
    "R_X86_64_GOTPCREL": 4,
    "R_X86_64_GOTPCRELX": 4,
    "R_X86_64_REX_GOTPCRELX": 4,
}


def _remove_quietly(path):
    """Delete *path*, ignoring OS errors (best-effort temp-file cleanup)."""
    with contextlib.suppress(OSError):
        os.unlink(path)


def trim(str, n):
    """Return *str* without its first *n* lines, re-joined with newlines."""
    return "\n".join(str.splitlines()[n:])


def trim_objdump(str):
    """Drop the first seven lines of objdump output.

    Presumably these are objdump's banner/header lines; the count is fixed.
    """
    return trim(str, _OBJDUMP_HEADER_LINES)


def disassemble_bytes(byte_data, architecture, options):
    """Disassemble raw bytes with ``objdump -D -b binary``.

    Args:
        byte_data: The bytes to disassemble.
        architecture: Value passed to objdump's ``-m`` option.
        options: Value passed to objdump's ``-M`` option.

    Returns:
        objdump's stdout with the leading header lines removed.
    """
    with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as temp_bin_file:
        temp_bin_file.write(byte_data)
        temp_bin_file_name = temp_bin_file.name

    try:
        disassembly = subprocess.run(
            [
                "objdump",
                "-D",
                "-b",
                "binary",
                "-m",
                architecture,
                "-M",
                options,
                temp_bin_file_name,
            ],
            capture_output=True,
            text=True,
        ).stdout
    finally:
        # The file was created with delete=False so objdump could open it by
        # name; remove it ourselves so requests do not leak temp files.
        _remove_quietly(temp_bin_file_name)

    return trim_objdump(disassembly)


def _extract_text_bytes(object_file_name):
    """Return the raw bytes objcopy extracts from the .text* sections."""
    with tempfile.NamedTemporaryFile(suffix=".raw", delete=True) as raw_bytes_file:
        subprocess.run(
            [
                "objcopy",
                "--only-section",
                ".text",
                # XXX in reality we should probably look at the sections
                "--only-section",
                ".text.*",
                "-O",
                "binary",
                object_file_name,
                raw_bytes_file.name,
            ]
        )
        return raw_bytes_file.read()


def _disassemble_object(object_file_name):
    """Return trimmed ``objdump -dr`` output for the object file."""
    listing = subprocess.run(
        ["objdump", "-dr", object_file_name], capture_output=True, text=True
    ).stdout
    return trim_objdump(listing)


def _read_relocations(object_file_name):
    """Return the object's relocations as parsed from llvm-readobj JSON.

    Relocations whose symbol is named ``.text`` are dropped.
    """
    raw_json = subprocess.run(
        [
            "llvm-readobj-19",
            "--elf-output-style=JSON",
            "--relocations",
            object_file_name,
        ],
        capture_output=True,
        text=True,
    ).stdout
    sections = json.loads(raw_json)[0]["Relocations"]
    relocs = [
        entry["Relocation"] for section in sections for entry in section["Relocs"]
    ]
    # Filter out .text
    return [r for r in relocs if r["Symbol"]["Name"] != ".text"]


def compile(compiler, flags, source):
    """Compile C source to an object file and inspect the result.

    NOTE(review): ``compiler`` and ``flags`` come straight from the web UI and
    are executed as a command line; only expose this app to trusted users.

    Args:
        compiler: Name or path of the compiler executable.
        flags: Extra compiler flags as one shell-style string.
        source: The C source code to compile.

    Returns:
        A tuple ``(relocs, compiled_bytes, compile_output, disassembly)``.
        ``compile_output`` is the compiler's stdout followed by its stderr.
        When the compiler exits non-zero, the other three items are ``None``.
    """
    temp_c_file_name = None
    temp_o_file_name = None
    try:
        # Create a temporary file for the C source code
        with tempfile.NamedTemporaryFile(suffix=".c", delete=False) as temp_c_file:
            temp_c_file_name = temp_c_file.name
            temp_c_file.write(source.encode())

        # Create a temporary file for the object file
        with tempfile.NamedTemporaryFile(suffix=".o", delete=False) as temp_o_file:
            temp_o_file_name = temp_o_file.name

        # Compile the C file to an object file
        result = subprocess.run(
            [compiler, "-c", temp_c_file_name]
            + shlex.split(flags)
            + ["-o", temp_o_file_name],
            capture_output=True,
            text=True,
        )
        compile_output = result.stdout + result.stderr

        if result.returncode != 0:
            return None, None, compile_output, None

        compiled_bytes = _extract_text_bytes(temp_o_file_name)
        disassembly = _disassemble_object(temp_o_file_name)
        json_relocs = _read_relocations(temp_o_file_name)
        return json_relocs, compiled_bytes, compile_output, disassembly
    finally:
        # Both files are created with delete=False so external tools can open
        # them by name; clean them up on every exit path.
        for leftover in (temp_c_file_name, temp_o_file_name):
            if leftover is not None:
                _remove_quietly(leftover)


def _reloc_type2size(s):
    """Return the size in bytes of the field patched by relocation type *s*.

    Raises:
        AssertionError: If *s* is not a known relocation type.  Raised
            explicitly so the check still fires under ``python -O``.
    """
    try:
        return _RELOC_SIZES[s]
    except KeyError:
        raise AssertionError(f"Unknown reloc {s}") from None


def _compute_relocs_byte_range(json_relocs):
    """Return a flat list of every byte offset covered by a relocation.

    Each relocation contributes the offsets from its ``Offset`` up to, but
    not including, ``Offset`` plus the size of its relocation type.
    """
    return [
        offset
        for r in json_relocs
        for offset in range(
            r["Offset"], r["Offset"] + _reloc_type2size(r["Type"]["Name"])
        )
    ]


def predict(target_bytes, source, compiler, flags, disasm_arch, disasm_options):
    """Compile *source* and compare the result against *target_bytes*.

    Args:
        target_bytes: Hex string holding the bytes of the target function.
        source: C source code to compile.
        compiler: Compiler executable to run.
        flags: Compiler flags as one shell-style string.
        disasm_arch: objdump ``-m`` value used for the target disassembly.
        disasm_options: objdump ``-M`` value used for the target disassembly.

    Returns:
        A 9-tuple in the order of the interface outputs defined in ``run``:
        compiled hexdump, target hexdump, edit distance, edit distance
        ignoring relocations, edit description, compiler output, compiled
        disassembly, compiled relocations and target disassembly.  When
        compilation fails, the first item is ``"Compilation failed"``, the
        edit distance is ``-1`` and the values that depend on the compiled
        object are ``None``.
    """
    target_bytes = bytes.fromhex(target_bytes)
    compiled_relocs, compiled_bytes, compile_output, compiled_disassembly = compile(
        compiler, flags, source
    )
    target_disassembly = disassemble_bytes(target_bytes, disasm_arch, disasm_options)
    target_dump = hexdump(target_bytes, result="return")

    if compiled_bytes is None:
        return (
            "Compilation failed",
            target_dump,
            -1,
            None,
            None,
            compile_output,
            compiled_disassembly,
            compiled_relocs,
            target_disassembly,
        )

    # Bytes covered by relocations are passed as wildcard offsets for the
    # compiled sequence (presumably so they match any target byte).
    reloc_edit_distance, reloc_operations = print_match_summary(
        target_bytes,
        compiled_bytes,
        wildcard_offsets_seq2=_compute_relocs_byte_range(compiled_relocs),
    )
    print(f"reloc_edit_distance: {reloc_edit_distance}")
    print(f"reloc operations: {reloc_operations}")

    return (
        hexdump(compiled_bytes, result="return"),
        target_dump,
        editdistance.eval(compiled_bytes, target_bytes),
        reloc_edit_distance,
        "\n".join(reloc_operations),
        compile_output,
        compiled_disassembly,
        compiled_relocs,
        target_disassembly,
    )


def run():
    """Build the Gradio interface and serve it on 0.0.0.0:7860."""
    demo = gr.Interface(
        fn=predict,
        description=description,
        inputs=[
            gr.Textbox(
                lines=10,
                label="Bytes of Target Function (in hex)",
                value="b8 2a 00 00 00 c3",
            ),
            gr.Textbox(
                lines=10,
                label="Decompiled C Source Code",
                value="int x;\nint foo() { return x; }",
            ),
            gr.Textbox(label="Compiler", value="g++"),
            gr.Textbox(label="Compiler Flags", value="-O2"),
            gr.Textbox(label="Architecture (objdump -m)", value="i386"),
            gr.Textbox(label="Disassembler options (objdump -M)", value="x86-64"),
        ],
        outputs=[
            gr.Textbox(label="Compiled bytes"),
            gr.Textbox(label="Target bytes"),
            gr.Number(label="Edit distance (lower is better)"),
            gr.Number(label="Edit distance (ignoring relocs; lower is better)"),
            gr.Textbox(label="Edit description (ignoring relocs)"),
            gr.Textbox(label="Compiler Output"),
            gr.Textbox(label="Compiled Disassembly"),
            gr.JSON(label="Compiled relocations", open=True),
            gr.Textbox(label="Target Disassembly"),
        ],
    )
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)


# The server is started unconditionally when this module is executed or
# imported; kept as-is so existing launchers keep working.
run()