"""Gradio app comparing compiled C source against the bytes of a target function.

The user supplies target bytes (as hex) and candidate C source. The source is
compiled, its ``.text`` bytes are extracted, and the UI shows both hexdumps,
their edit distance, the compiler output, both disassemblies and the
relocations of the compiled object.
"""

import json
import os
import shlex
import subprocess
import tempfile

import editdistance
import frontmatter
import gradio as gr
from hexdump2 import hexdump

from dist import levenshtein_with_wildcards, print_match_summary

description = frontmatter.load("README.md").content

# Number of leading lines dropped from objdump's stdout before display.
_OBJDUMP_HEADER_LINES = 7

# Values reported for each relocation type this tool knows about
# (presumably the width of the relocated field in bits -- TODO confirm).
_RELOC_SIZES = {
    "R_X86_64_PC32": 32,
    "R_X86_64_PLT32": 32,
}


def trim(str, n):
    """Return ``str`` with its first ``n`` lines removed.

    Lines are re-joined with ``"\\n"``, so a trailing newline is not preserved.
    The parameter keeps its original name (which shadows the builtin) so that
    keyword callers continue to work.
    """
    return "\n".join(str.splitlines()[n:])


def trim_objdump(str):
    """Drop the leading header lines of an objdump listing."""
    return trim(str, _OBJDUMP_HEADER_LINES)


def disassemble_bytes(byte_data, architecture, options):
    """Disassemble raw bytes with ``objdump -D -b binary``.

    Args:
        byte_data: The bytes to disassemble.
        architecture: Value passed to objdump's ``-m`` option.
        options: Value passed to objdump's ``-M`` option.

    Returns:
        objdump's stdout with the header lines trimmed off.
    """
    # A temporary directory guarantees the scratch file is removed afterwards.
    with tempfile.TemporaryDirectory() as workdir:
        bin_path = os.path.join(workdir, "target.bin")
        with open(bin_path, "wb") as bin_file:
            bin_file.write(byte_data)

        listing = subprocess.run(
            ["objdump", "-D", "-b", "binary", "-m", architecture, "-M", options, bin_path],
            capture_output=True,
            text=True,
        ).stdout

    return trim_objdump(listing)


def _reloc_type2size(reloc_type):
    """Look up the size recorded for a relocation type name.

    Raises:
        ValueError: If the relocation type is not in ``_RELOC_SIZES``.
    """
    try:
        return _RELOC_SIZES[reloc_type]
    except KeyError:
        # An explicit exception instead of ``assert`` so the check survives -O.
        raise ValueError(f"Unknown reloc {reloc_type}") from None


def compile(compiler, flags, source):
    """Compile C source and collect bytes, disassembly and relocations.

    Args:
        compiler: Compiler executable to invoke (e.g. ``"g++"``).
        flags: Extra compiler flags as one string; split with ``shlex.split``.
        source: The source code to compile.

    Returns:
        A 4-tuple ``(relocs, compiled_bytes, compile_output, disassembly)``.
        ``relocs`` is the list of relocation records reported by
        ``llvm-readobj-19`` (entries whose symbol is ``.text`` are removed) and
        ``compiled_bytes`` is the content of the ``.text``/``.text.*`` sections.
        When the compiler exits with a non-zero status the result is
        ``(None, None, compile_output, "")``.

    Raises:
        ValueError: If a relocation has a type missing from ``_RELOC_SIZES``.
    """
    # All intermediate files live in one directory that is removed on exit.
    with tempfile.TemporaryDirectory() as workdir:
        c_path = os.path.join(workdir, "source.c")
        o_path = os.path.join(workdir, "source.o")
        raw_path = os.path.join(workdir, "text.raw")

        with open(c_path, "wb") as c_file:
            c_file.write(source.encode())

        # Compile the C file to an object file.
        result = subprocess.run(
            [compiler, "-c", c_path] + shlex.split(flags) + ["-o", o_path],
            capture_output=True,
            text=True,
        )
        compile_output = result.stdout + result.stderr

        if result.returncode != 0:
            # No usable object file exists, so skip the tools that would
            # otherwise choke on it and report the failure to the caller.
            return None, None, compile_output, ""

        # Extract the raw bytes of the code sections.
        subprocess.run(
            [
                "objcopy",
                "--only-section",
                ".text",
                # XXX in reality we should probably look at the sections
                "--only-section",
                ".text.*",
                "-O",
                "binary",
                o_path,
                raw_path,
            ]
        )
        try:
            with open(raw_path, "rb") as raw_file:
                compiled_bytes = raw_file.read()
        except FileNotFoundError:
            # objcopy produced nothing; treat that as "no bytes".
            compiled_bytes = b""

        # Disassemble the object file (with relocations interleaved).
        disassembly = trim_objdump(
            subprocess.run(
                ["objdump", "-dr", o_path],
                capture_output=True,
                text=True,
            ).stdout
        )

        # Relocations are read as JSON so they can be processed structurally.
        readobj_stdout = subprocess.run(
            ["llvm-readobj-19", "--elf-output-style=JSON", "--relocations", o_path],
            capture_output=True,
            text=True,
        ).stdout

    relocation_sections = json.loads(readobj_stdout)[0]["Relocations"]
    json_relocs = [
        entry["Relocation"]
        for section in relocation_sections
        for entry in section["Relocs"]
    ]
    # Filter out .text
    json_relocs = [r for r in json_relocs if r["Symbol"]["Name"] != ".text"]

    relocs = [(r["Offset"], _reloc_type2size(r["Type"]["Name"])) for r in json_relocs]
    print(f"relocs: {relocs}")

    return json_relocs, compiled_bytes, compile_output, disassembly


def predict(target_bytes, source, compiler, flags, disasm_arch, disasm_options):
    """Compile ``source`` and compare the result against ``target_bytes``.

    Args:
        target_bytes: Target function bytes as a hex string.
        source: Candidate C source code.
        compiler: Compiler executable to invoke.
        flags: Compiler flags as a single string.
        disasm_arch: objdump ``-m`` value used for the target bytes.
        disasm_options: objdump ``-M`` value used for the target bytes.

    Returns:
        A 7-tuple matching the interface outputs: compiled hexdump, target
        hexdump, edit distance, compiler output, compiled disassembly,
        compiled relocations and target disassembly. If compilation failed,
        the first element is ``"Compilation failed"`` and the distance is -1.
    """
    target_bytes = bytes.fromhex(target_bytes)
    compiled_relocs, compiled_bytes, compile_output, compiled_disassembly = compile(
        compiler, flags, source
    )
    target_disassembly = disassemble_bytes(target_bytes, disasm_arch, disasm_options)
    target_dump = hexdump(target_bytes, result="return")

    if compiled_bytes is None:
        return (
            "Compilation failed",
            target_dump,
            -1,
            compile_output,
            compiled_disassembly,
            compiled_relocs,
            target_disassembly,
        )

    return (
        hexdump(compiled_bytes, result="return"),
        target_dump,
        editdistance.eval(compiled_bytes, target_bytes),
        compile_output,
        compiled_disassembly,
        compiled_relocs,
        target_disassembly,
    )


def run():
    """Build the Gradio interface around ``predict`` and start the server."""
    demo = gr.Interface(
        fn=predict,
        description=description,
        inputs=[
            gr.Textbox(
                lines=10,
                label="Bytes of Target Function (in hex)",
                value="b8 2a 00 00 00 c3",
            ),
            gr.Textbox(
                lines=10,
                label="Decompiled C Source Code",
                value="int x;\nint foo() { return x; }",
            ),
            gr.Textbox(label="Compiler", value="g++"),
            gr.Textbox(label="Compiler Flags", value="-O2"),
            gr.Textbox(label="Architecture (objdump -m)", value="i386"),
            gr.Textbox(label="Disassembler options (objdump -M)", value="x86-64"),
        ],
        outputs=[
            gr.Textbox(label="Compiled bytes"),
            gr.Textbox(label="Target bytes"),
            gr.Number(label="Edit distance (lower is better)"),
            gr.Textbox(label="Compiler Output"),
            gr.Textbox(label="Compiled Disassembly"),
            gr.JSON(label="Compiled relocations", open=True),
            gr.Textbox(label="Target Disassembly"),
        ],
    )
    # Listen on all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)


run()