From 54a0a3e9c045461906ffd50473d74ea3d53e8bb9 Mon Sep 17 00:00:00 2001 From: bird_egop Date: Fri, 18 Apr 2025 14:19:13 +0300 Subject: [PATCH] Fix RVA to offset calculation for control flow-based disassembly --- .../Decompiler/ControlFlowGraph.cs | 271 --------- .../Decompiler/DataFlowAnalysis.cs | 516 ----------------- X86Disassembler/Decompiler/Decompiler.cs | 531 ------------------ X86Disassembler/Program.cs | 125 +++-- X86Disassembler/X86/Disassembler.cs | 197 ++++++- 5 files changed, 279 insertions(+), 1361 deletions(-) delete mode 100644 X86Disassembler/Decompiler/ControlFlowGraph.cs delete mode 100644 X86Disassembler/Decompiler/DataFlowAnalysis.cs delete mode 100644 X86Disassembler/Decompiler/Decompiler.cs diff --git a/X86Disassembler/Decompiler/ControlFlowGraph.cs b/X86Disassembler/Decompiler/ControlFlowGraph.cs deleted file mode 100644 index f95fedb..0000000 --- a/X86Disassembler/Decompiler/ControlFlowGraph.cs +++ /dev/null @@ -1,271 +0,0 @@ -namespace X86Disassembler.Decompiler; - -using System.Collections.Generic; -using X86; -using X86.Operands; - -/// -/// Represents a control flow graph for decompilation -/// - public class ControlFlowGraph -{ - /// - /// Represents a basic block in the control flow graph - /// - public class BasicBlock - { - /// - /// Gets or sets the starting address of the basic block - /// - public ulong StartAddress { get; set; } - - /// - /// Gets or sets the ending address of the basic block - /// - public ulong EndAddress { get; set; } - - /// - /// Gets the list of instructions in this basic block - /// - public List Instructions { get; } = []; - - /// - /// Gets the list of successor blocks (blocks that can be executed after this one) - /// - public List Successors { get; } = []; - - /// - /// Gets the list of predecessor blocks (blocks that can execute before this one) - /// - public List Predecessors { get; } = []; - - /// - /// Returns a string representation of the basic block - /// - /// A string representation of the 
basic block - public override string ToString() - { - return $"Block {StartAddress:X8}-{EndAddress:X8} with {Instructions.Count} instructions"; - } - } - - // Dictionary mapping addresses to basic blocks - private readonly Dictionary _blocks = []; - - // Entry point of the control flow graph - private BasicBlock? _entryBlock; - - /// - /// Gets the entry block of the control flow graph - /// - public BasicBlock? EntryBlock => _entryBlock; - - /// - /// Gets all basic blocks in the control flow graph - /// - public IReadOnlyDictionary Blocks => _blocks; - - /// - /// Builds a control flow graph from a list of instructions - /// - /// The list of instructions - /// The entry point address - /// A control flow graph - public static ControlFlowGraph Build(List instructions, ulong entryPoint) - { - ControlFlowGraph cfg = new ControlFlowGraph(); - - // First pass: identify basic block boundaries - HashSet leaders = new HashSet(); - - // The entry point is always a leader - leaders.Add(entryPoint); - - // Identify other leaders - for (int i = 0; i < instructions.Count; i++) - { - Instruction inst = instructions[i]; - - // Check if this instruction is a branch or jump - if (IsControlTransfer(inst)) - { - // The target of a jump/branch is a leader - ulong? targetAddress = GetTargetAddress(inst); - if (targetAddress.HasValue) - { - leaders.Add(targetAddress.Value); - } - - // The instruction following a jump/branch is also a leader (if it exists) - if (i + 1 < instructions.Count) - { - leaders.Add(instructions[i + 1].Address); - } - } - } - - // Second pass: create basic blocks - BasicBlock? 
currentBlock = null; - - foreach (Instruction inst in instructions) - { - // If this instruction is a leader, start a new basic block - if (leaders.Contains(inst.Address)) - { - // Finalize the previous block if it exists - if (currentBlock != null) - { - currentBlock.EndAddress = inst.Address - 1; - cfg._blocks[currentBlock.StartAddress] = currentBlock; - } - - // Create a new block - currentBlock = new BasicBlock - { - StartAddress = inst.Address - }; - - // If this is the entry point, set it as the entry block - if (inst.Address == entryPoint) - { - cfg._entryBlock = currentBlock; - } - } - - // Add the instruction to the current block - if (currentBlock != null) - { - currentBlock.Instructions.Add(inst); - } - - // If this instruction is a control transfer, finalize the current block - if (IsControlTransfer(inst) && currentBlock != null) - { - currentBlock.EndAddress = inst.Address; - cfg._blocks[currentBlock.StartAddress] = currentBlock; - currentBlock = null; - } - } - - // Finalize the last block if it exists - if (currentBlock != null) - { - currentBlock.EndAddress = instructions[^1].Address; - cfg._blocks[currentBlock.StartAddress] = currentBlock; - } - - // Third pass: connect basic blocks - foreach (var block in cfg._blocks.Values) - { - // Get the last instruction in the block - Instruction lastInst = block.Instructions[^1]; - - // If the last instruction is a jump, add the target as a successor - if (IsControlTransfer(lastInst)) - { - ulong? targetAddress = GetTargetAddress(lastInst); - if (targetAddress.HasValue && cfg._blocks.TryGetValue(targetAddress.Value, out BasicBlock? 
targetBlock)) - { - block.Successors.Add(targetBlock); - targetBlock.Predecessors.Add(block); - } - - // If the instruction is a conditional jump, the next block is also a successor - if (IsConditionalJump(lastInst)) - { - // Assume each instruction is 1-15 bytes in length - // Since we don't have RawBytes, use a constant for now - const int estimatedInstructionLength = 4; // Typical x86 instruction length - ulong nextAddress = lastInst.Address + (ulong)estimatedInstructionLength; - if (cfg._blocks.TryGetValue(nextAddress, out BasicBlock? nextBlock)) - { - block.Successors.Add(nextBlock); - nextBlock.Predecessors.Add(block); - } - } - } - // If the last instruction is not a jump, the next block is the successor - else - { - // Assume each instruction is 1-15 bytes in length - // Since we don't have RawBytes, use a constant for now - const int estimatedInstructionLength = 4; // Typical x86 instruction length - ulong nextAddress = lastInst.Address + (ulong)estimatedInstructionLength; - if (cfg._blocks.TryGetValue(nextAddress, out BasicBlock? 
nextBlock)) - { - block.Successors.Add(nextBlock); - nextBlock.Predecessors.Add(block); - } - } - } - - return cfg; - } - - /// - /// Checks if an instruction is a control transfer instruction (jump, call, ret) - /// - /// The instruction to check - /// True if the instruction is a control transfer - private static bool IsControlTransfer(Instruction instruction) - { - // Check instruction type instead of mnemonic - return instruction.Type == InstructionType.Jmp || - instruction.Type == InstructionType.Je || - instruction.Type == InstructionType.Jne || - instruction.Type == InstructionType.Jb || - instruction.Type == InstructionType.Jbe || - instruction.Type == InstructionType.Ja || - instruction.Type == InstructionType.Jae || - instruction.Type == InstructionType.Call || - instruction.Type == InstructionType.Ret; - } - - /// - /// Checks if an instruction is a conditional jump - /// - /// The instruction to check - /// True if the instruction is a conditional jump - private static bool IsConditionalJump(Instruction instruction) - { - // Check for conditional jump instruction types - return instruction.Type == InstructionType.Je || - instruction.Type == InstructionType.Jne || - instruction.Type == InstructionType.Jb || - instruction.Type == InstructionType.Jbe || - instruction.Type == InstructionType.Ja || - instruction.Type == InstructionType.Jae; - } - - /// - /// Gets the target address of a control transfer instruction - /// - /// The instruction - /// The target address, or null if it cannot be determined - private static ulong? 
GetTargetAddress(Instruction instruction) - { - // Check if we have structured operands - if (instruction.StructuredOperands.Count == 0) - { - return null; - } - - // Get the first operand - var operand = instruction.StructuredOperands[0]; - - // Check if the operand is a direct address (e.g., immediate value) - if (operand is ImmediateOperand immediateOperand) - { - return (ulong)immediateOperand.Value; - } - - // Check if the operand is a relative offset - if (operand is RelativeOffsetOperand relativeOperand) - { - return relativeOperand.TargetAddress; - } - - // For now, we cannot determine the target for other types of operands - return null; - } -} diff --git a/X86Disassembler/Decompiler/DataFlowAnalysis.cs b/X86Disassembler/Decompiler/DataFlowAnalysis.cs deleted file mode 100644 index c1537e0..0000000 --- a/X86Disassembler/Decompiler/DataFlowAnalysis.cs +++ /dev/null @@ -1,516 +0,0 @@ -namespace X86Disassembler.Decompiler; - -using System.Collections.Generic; -using X86; - -/// -/// Performs data flow analysis on x86 instructions -/// -public class DataFlowAnalysis -{ - /// - /// Represents a variable in the decompiled code - /// - public class Variable - { - /// - /// Gets or sets the name of the variable - /// - public string Name { get; set; } = string.Empty; - - /// - /// Gets or sets the type of the variable (if known) - /// - public string Type { get; set; } = "int"; // Default to int - - /// - /// Gets or sets the storage location (register, memory, etc.) 
- /// - public string Location { get; set; } = string.Empty; - - /// - /// Gets or sets whether this variable is a parameter - /// - public bool IsParameter { get; set; } - - /// - /// Gets or sets whether this variable is a return value - /// - public bool IsReturnValue { get; set; } - } - - /// - /// Represents an operation in the decompiled code - /// - public class Operation - { - /// - /// Gets or sets the operation type - /// - public string Type { get; set; } = string.Empty; - - /// - /// Gets or sets the destination variable - /// - public Variable? Destination { get; set; } - - /// - /// Gets or sets the source variables or constants - /// - public List Sources { get; } = []; // Can be Variable or constant value - - /// - /// Gets or sets the original instruction - /// - public Instruction OriginalInstruction { get; set; } = null!; - - public ulong InstructionAddress { get; set; } - } - - // Map of register names to variables - private readonly Dictionary _registerVariables = []; - - // Map of memory locations to variables - private readonly Dictionary _memoryVariables = []; - - // List of operations - private readonly List _operations = []; - - // Counter for generating variable names - private int _variableCounter = 0; - - /// - /// Gets the list of operations - /// - public IReadOnlyList Operations => _operations; - - /// - /// Gets the list of variables - /// - public IEnumerable Variables - { - get - { - HashSet uniqueVariables = []; - foreach (var variable in _registerVariables.Values) - { - uniqueVariables.Add(variable); - } - foreach (var variable in _memoryVariables.Values) - { - uniqueVariables.Add(variable); - } - return uniqueVariables; - } - } - - /// - /// Analyzes a list of instructions to identify variables and operations - /// - /// The list of instructions to analyze - public void Analyze(List instructions) - { - // Initialize common register variables - InitializeRegisterVariables(); - - // Process each instruction - foreach (var 
instruction in instructions) - { - AnalyzeInstruction(instruction); - } - } - - /// - /// Initializes common register variables - /// - private void InitializeRegisterVariables() - { - // 32-bit general purpose registers - _registerVariables["eax"] = new Variable { Name = "eax", Location = "eax" }; - _registerVariables["ebx"] = new Variable { Name = "ebx", Location = "ebx" }; - _registerVariables["ecx"] = new Variable { Name = "ecx", Location = "ecx" }; - _registerVariables["edx"] = new Variable { Name = "edx", Location = "edx" }; - _registerVariables["esi"] = new Variable { Name = "esi", Location = "esi" }; - _registerVariables["edi"] = new Variable { Name = "edi", Location = "edi" }; - _registerVariables["ebp"] = new Variable { Name = "ebp", Location = "ebp" }; - _registerVariables["esp"] = new Variable { Name = "esp", Location = "esp" }; - - // Mark EAX as the return value register - _registerVariables["eax"].IsReturnValue = true; - - // 16-bit registers - _registerVariables["ax"] = new Variable { Name = "ax", Location = "ax" }; - _registerVariables["bx"] = new Variable { Name = "bx", Location = "bx" }; - _registerVariables["cx"] = new Variable { Name = "cx", Location = "cx" }; - _registerVariables["dx"] = new Variable { Name = "dx", Location = "dx" }; - _registerVariables["si"] = new Variable { Name = "si", Location = "si" }; - _registerVariables["di"] = new Variable { Name = "di", Location = "di" }; - _registerVariables["bp"] = new Variable { Name = "bp", Location = "bp" }; - _registerVariables["sp"] = new Variable { Name = "sp", Location = "sp" }; - - // 8-bit registers - _registerVariables["al"] = new Variable { Name = "al", Location = "al" }; - _registerVariables["ah"] = new Variable { Name = "ah", Location = "ah" }; - _registerVariables["bl"] = new Variable { Name = "bl", Location = "bl" }; - _registerVariables["bh"] = new Variable { Name = "bh", Location = "bh" }; - _registerVariables["cl"] = new Variable { Name = "cl", Location = "cl" }; - 
_registerVariables["ch"] = new Variable { Name = "ch", Location = "ch" }; - _registerVariables["dl"] = new Variable { Name = "dl", Location = "dl" }; - _registerVariables["dh"] = new Variable { Name = "dh", Location = "dh" }; - } - - /// - /// Analyzes a single instruction to identify variables and operations - /// - /// The instruction to analyze - private void AnalyzeInstruction(Instruction instruction) - { - // Use instruction.Type instead of instruction.Mnemonic - InstructionType type = instruction.Type; - - // Use instruction.StructuredOperands instead of instruction.Operands - var structuredOperands = instruction.StructuredOperands; - - // Skip instructions without operands - if (structuredOperands == null || structuredOperands.Count == 0) - { - return; - } - - // Create a new operation based on the instruction type - Operation operation = new Operation - { - InstructionAddress = instruction.Address, - Type = GetOperationType(type) - }; - - // Process the operation based on the instruction type - // This would need to be updated to work with structured operands - // For now, we'll just add a placeholder - _operations.Add(operation); - } - - private string GetOperationType(InstructionType type) - { - switch (type) - { - case InstructionType.Add: - return "add"; - case InstructionType.Sub: - return "sub"; - case InstructionType.Mul: - return "mul"; - case InstructionType.Div: - return "div"; - case InstructionType.And: - return "and"; - case InstructionType.Or: - return "or"; - case InstructionType.Xor: - return "xor"; - case InstructionType.Push: - return "push"; - case InstructionType.Pop: - return "pop"; - case InstructionType.Call: - return "call"; - case InstructionType.Ret: - return "return"; - case InstructionType.Cmp: - return "cmp"; - case InstructionType.Test: - return "test"; - case InstructionType.Jmp: - return "jmp"; - case InstructionType.Je: - return "je"; - case InstructionType.Jne: - return "jne"; - case InstructionType.Jg: - return "jg"; - 
case InstructionType.Jge: - return "jge"; - case InstructionType.Jl: - return "jl"; - case InstructionType.Jle: - return "jle"; - default: - return type.ToString(); - } - } - - /// - /// Handles a MOV instruction - /// - /// The operation to populate - /// The operand parts - private void HandleMovInstruction(Operation operation, string[] operandParts) - { - if (operandParts.Length != 2) - { - return; - } - - operation.Type = "assignment"; - - // Get or create the destination variable - Variable destination = GetOrCreateVariable(operandParts[0]); - operation.Destination = destination; - - // Get the source (variable or constant) - object source = GetOperandValue(operandParts[1]); - operation.Sources.Add(source); - } - - /// - /// Handles an arithmetic instruction (ADD, SUB, MUL, DIV, AND, OR, XOR) - /// - /// The operation to populate - /// The instruction mnemonic - /// The operand parts - private void HandleArithmeticInstruction(Operation operation, string mnemonic, string[] operandParts) - { - if (operandParts.Length != 2) - { - return; - } - - operation.Type = mnemonic; - - // Get or create the destination variable - Variable destination = GetOrCreateVariable(operandParts[0]); - operation.Destination = destination; - - // Get the source (variable or constant) - object source = GetOperandValue(operandParts[1]); - operation.Sources.Add(source); - operation.Sources.Add(destination); // The destination is also a source in arithmetic operations - } - - /// - /// Handles a stack instruction (PUSH, POP) - /// - /// The operation to populate - /// The instruction mnemonic - /// The operand parts - private void HandleStackInstruction(Operation operation, string mnemonic, string[] operandParts) - { - if (operandParts.Length != 1) - { - return; - } - - operation.Type = mnemonic; - - if (mnemonic == "push") - { - // For PUSH, the operand is the source - object source = GetOperandValue(operandParts[0]); - operation.Sources.Add(source); - } - else if (mnemonic == "pop") - { 
- // For POP, the operand is the destination - Variable destination = GetOrCreateVariable(operandParts[0]); - operation.Destination = destination; - } - } - - /// - /// Handles a CALL instruction - /// - /// The operation to populate - /// The operand parts - private void HandleCallInstruction(Operation operation, string[] operandParts) - { - if (operandParts.Length != 1) - { - return; - } - - operation.Type = "call"; - - // The operand is the function name or address - operation.Sources.Add(operandParts[0]); - } - - /// - /// Handles a RET instruction - /// - /// The operation to populate - private void HandleReturnInstruction(Operation operation) - { - operation.Type = "return"; - - // The return value is in EAX - if (_registerVariables.TryGetValue("eax", out Variable? eax)) - { - operation.Sources.Add(eax); - } - } - - /// - /// Handles a comparison instruction (CMP, TEST) - /// - /// The operation to populate - /// The instruction mnemonic - /// The operand parts - private void HandleComparisonInstruction(Operation operation, string mnemonic, string[] operandParts) - { - if (operandParts.Length != 2) - { - return; - } - - operation.Type = mnemonic; - - // Get the operands - object left = GetOperandValue(operandParts[0]); - object right = GetOperandValue(operandParts[1]); - - operation.Sources.Add(left); - operation.Sources.Add(right); - } - - /// - /// Handles a jump instruction (JMP, JE, JNE, etc.) 
- /// - /// The operation to populate - /// The instruction mnemonic - /// The operand parts - private void HandleJumpInstruction(Operation operation, string mnemonic, string[] operandParts) - { - if (operandParts.Length != 1) - { - return; - } - - operation.Type = mnemonic; - - // The operand is the jump target - operation.Sources.Add(operandParts[0]); - } - - /// - /// Gets or creates a variable for an operand - /// - /// The operand string - /// The variable - private Variable GetOrCreateVariable(string operand) - { - // Check if it's a register - if (IsRegister(operand)) - { - string register = operand.ToLower(); - if (_registerVariables.TryGetValue(register, out Variable? variable)) - { - return variable; - } - } - - // Check if it's a memory location - if (IsMemoryLocation(operand)) - { - string normalizedLocation = NormalizeMemoryLocation(operand); - if (_memoryVariables.TryGetValue(normalizedLocation, out Variable? variable)) - { - return variable; - } - - // Create a new variable for this memory location - variable = new Variable - { - Name = $"var_{_variableCounter++}", - Location = normalizedLocation - }; - - _memoryVariables[normalizedLocation] = variable; - return variable; - } - - // If it's neither a register nor a memory location, create a temporary variable - Variable tempVariable = new Variable - { - Name = $"temp_{_variableCounter++}", - Location = operand - }; - - return tempVariable; - } - - /// - /// Gets the value of an operand (variable or constant) - /// - /// The operand string - /// The operand value (Variable or constant) - private object GetOperandValue(string operand) - { - // Check if it's a register or memory location - if (IsRegister(operand) || IsMemoryLocation(operand)) - { - return GetOrCreateVariable(operand); - } - - // Check if it's a hexadecimal constant - if (operand.StartsWith("0x") && operand.Length > 2) - { - if (int.TryParse(operand.Substring(2), System.Globalization.NumberStyles.HexNumber, null, out int value)) - { - 
return value; - } - } - - // Check if it's a decimal constant - if (int.TryParse(operand, out int decimalValue)) - { - return decimalValue; - } - - // Otherwise, return the operand as a string - return operand; - } - - /// - /// Checks if an operand is a register - /// - /// The operand to check - /// True if the operand is a register - private bool IsRegister(string operand) - { - string[] registers = { "eax", "ebx", "ecx", "edx", "esi", "edi", "ebp", "esp", - "ax", "bx", "cx", "dx", "si", "di", "bp", "sp", - "al", "ah", "bl", "bh", "cl", "ch", "dl", "dh" }; - - return Array.IndexOf(registers, operand.ToLower()) >= 0; - } - - /// - /// Checks if an operand is a memory location - /// - /// The operand to check - /// True if the operand is a memory location - private bool IsMemoryLocation(string operand) - { - return operand.Contains('[') && operand.Contains(']'); - } - - /// - /// Normalizes a memory location operand - /// - /// The operand to normalize - /// The normalized memory location - private string NormalizeMemoryLocation(string operand) - { - // Extract the part inside the brackets - int startIndex = operand.IndexOf('['); - int endIndex = operand.IndexOf(']'); - - if (startIndex >= 0 && endIndex > startIndex) - { - string memoryReference = operand.Substring(startIndex + 1, endIndex - startIndex - 1).Trim(); - return memoryReference; - } - - return operand; - } -} diff --git a/X86Disassembler/Decompiler/Decompiler.cs b/X86Disassembler/Decompiler/Decompiler.cs deleted file mode 100644 index c5b73eb..0000000 --- a/X86Disassembler/Decompiler/Decompiler.cs +++ /dev/null @@ -1,531 +0,0 @@ -namespace X86Disassembler.Decompiler; - -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using X86; -using X86.Operands; - -/// -/// Main decompiler class that translates assembly code into higher-level code -/// -public class Decompiler -{ - // The list of disassembled instructions - private readonly List _instructions; - - // The 
control flow graph - private ControlFlowGraph? _controlFlowGraph; - - // The data flow analysis - private DataFlowAnalysis? _dataFlowAnalysis; - - // The entry point address - private readonly ulong _entryPoint; - - /// - /// Initializes a new instance of the Decompiler class - /// - /// The list of disassembled instructions - /// The entry point address - public Decompiler(List instructions, ulong entryPoint) - { - _instructions = instructions; - _entryPoint = entryPoint; - } - - /// - /// Decompiles the instructions and returns the decompiled code - /// - /// The decompiled code - public string Decompile() - { - // Build the control flow graph - _controlFlowGraph = ControlFlowGraph.Build(_instructions, _entryPoint); - - // Perform data flow analysis - _dataFlowAnalysis = new DataFlowAnalysis(); - _dataFlowAnalysis.Analyze(_instructions); - - // Generate pseudocode from the control flow graph and data flow analysis - return GeneratePseudocode(); - } - - /// - /// Generates pseudocode from the control flow graph and data flow analysis - /// - /// The generated pseudocode - private string GeneratePseudocode() - { - if (_controlFlowGraph == null || _controlFlowGraph.EntryBlock == null) - { - return "// Could not build control flow graph"; - } - - StringBuilder code = new StringBuilder(); - - // Add a function header - code.AppendLine("// Decompiled function"); - code.AppendLine("int DecompiledFunction() {") - .AppendLine(); - - // Generate variable declarations - if (_dataFlowAnalysis != null) - { - foreach (var variable in _dataFlowAnalysis.Variables) - { - // Skip register variables - if (IsRegister(variable.Location)) - { - continue; - } - - // Generate a variable declaration - code.AppendLine($" {variable.Type} {variable.Name}; // {variable.Location}"); - } - - if (_dataFlowAnalysis.Variables.Any(v => !IsRegister(v.Location))) - { - code.AppendLine(); - } - } - - // Process the blocks in a depth-first order - HashSet visitedBlocks = new HashSet(); - 
GenerateCodeForBlock(_controlFlowGraph.EntryBlock, code, visitedBlocks, 1); - - // Add a return statement if not already present - if (!code.ToString().Contains("return")) - { - code.AppendLine(" return 0;"); - } - - // Close the function - code.AppendLine("}"); - - return code.ToString(); - } - - /// - /// Generates code for a basic block and its successors - /// - /// The basic block - /// The code builder - /// The set of visited blocks - /// The indentation level - private void GenerateCodeForBlock(ControlFlowGraph.BasicBlock block, StringBuilder code, HashSet visitedBlocks, int indentLevel) - { - // If we've already visited this block, add a goto statement - if (visitedBlocks.Contains(block.StartAddress)) - { - string indent = new string(' ', indentLevel * 4); - code.AppendLine($"{indent}goto block_{block.StartAddress:X8};"); - return; - } - - // Mark this block as visited - visitedBlocks.Add(block.StartAddress); - - // Add a label for this block - string blockIndent = new string(' ', (indentLevel - 1) * 4); - code.AppendLine($"{blockIndent}block_{block.StartAddress:X8}:") - .AppendLine(); - - // Generate code for the instructions in this block - foreach (var instruction in block.Instructions) - { - string instructionCode = TranslateInstruction(instruction, indentLevel); - if (!string.IsNullOrEmpty(instructionCode)) - { - code.AppendLine(instructionCode); - } - } - - // Handle successors based on the control flow - if (block.Successors.Count == 1) - { - // Unconditional branch to the next block - GenerateCodeForBlock(block.Successors[0], code, visitedBlocks, indentLevel); - } - else if (block.Successors.Count == 2) - { - // Conditional branch - string indent = new string(' ', indentLevel * 4); - - // Get the last instruction in the block - Instruction lastInstruction = block.Instructions[^1]; - string condition = GetConditionFromJump(lastInstruction); - - // Find the fall-through block and the jump target block - ControlFlowGraph.BasicBlock? 
fallthroughBlock = null; - ControlFlowGraph.BasicBlock? jumpTargetBlock = null; - - // Use a constant estimated instruction length since RawBytes is not available - const int estimatedInstructionLength = 4; // Typical x86 instruction length - ulong nextAddress = lastInstruction.Address + (ulong)estimatedInstructionLength; - foreach (var successor in block.Successors) - { - if (successor.StartAddress == nextAddress) - { - fallthroughBlock = successor; - } - else - { - jumpTargetBlock = successor; - } - } - - if (fallthroughBlock != null && jumpTargetBlock != null) - { - // Generate an if statement - code.AppendLine($"{indent}if ({condition}) {{") - .AppendLine(); - - // Generate code for the jump target block - GenerateCodeForBlock(jumpTargetBlock, code, visitedBlocks, indentLevel + 1); - - // Close the if statement - code.AppendLine($"{indent}}}") - .AppendLine(); - - // Generate code for the fall-through block - GenerateCodeForBlock(fallthroughBlock, code, visitedBlocks, indentLevel); - } - else - { - // If we couldn't determine the fall-through and jump target blocks, - // just generate code for both successors - foreach (var successor in block.Successors) - { - GenerateCodeForBlock(successor, code, visitedBlocks, indentLevel); - } - } - } - } - - /// - /// Translates an instruction into a higher-level code statement - /// - /// The instruction to translate - /// The indentation level - /// The translated code statement - private string TranslateInstruction(Instruction instruction, int indentLevel) - { - string indent = new string(' ', indentLevel * 4); - string mnemonic = instruction.Type.ToString().ToLower(); - string operands = ""; - - // Format operands if available - if (instruction.StructuredOperands != null && instruction.StructuredOperands.Count > 0) - { - operands = string.Join(", ", instruction.StructuredOperands.Select(op => op.ToString())); - } - - // Skip jumps (handled by control flow) - if (mnemonic.StartsWith("j")) - { - return $"{indent}// 
{instruction}"; - } - - // Handle different instruction types - switch (mnemonic) - { - case "mov": - return TranslateMovInstruction(instruction, indent); - - case "add": - case "sub": - case "mul": - case "div": - case "and": - case "or": - case "xor": - return TranslateArithmeticInstruction(instruction, indent); - - case "push": - case "pop": - return $"{indent}// {instruction}"; - - case "call": - return TranslateCallInstruction(instruction, indent); - - case "ret": - return TranslateReturnInstruction(instruction, indent); - - case "cmp": - case "test": - return $"{indent}// {instruction}"; - - default: - // For other instructions, just add a comment - return $"{indent}// {instruction}"; - } - } - - /// - /// Translates a MOV instruction - /// - /// The instruction to translate - /// The indentation string - /// The translated code statement - private string TranslateMovInstruction(Instruction instruction, string indent) - { - string[] operandParts = instruction.StructuredOperands.Select(op => op.ToString()).ToArray(); - if (operandParts.Length != 2) - { - return $"{indent}// {instruction}"; - } - - string destination = operandParts[0].Trim(); - string source = operandParts[1].Trim(); - - // Skip register-to-register moves for registers we don't track - if (IsRegister(destination) && IsRegister(source)) - { - return $"{indent}// {instruction}"; - } - - // Translate memory access - if (IsMemoryLocation(destination)) - { - string variableName = GetVariableNameForMemory(destination); - return $"{indent}{variableName} = {GetReadableOperand(source)}; // {instruction}"; - } - else if (IsMemoryLocation(source)) - { - string variableName = GetVariableNameForMemory(source); - return $"{indent}{GetReadableOperand(destination)} = {variableName}; // {instruction}"; - } - - // Default case - return $"{indent}{GetReadableOperand(destination)} = {GetReadableOperand(source)}; // {instruction}"; - } - - /// - /// Translates an arithmetic instruction - /// - /// The instruction 
to translate - /// The indentation string - /// The translated code statement - private string TranslateArithmeticInstruction(Instruction instruction, string indent) - { - string[] operandParts = instruction.StructuredOperands.Select(op => op.ToString()).ToArray(); - if (operandParts.Length != 2) - { - return $"{indent}// {instruction}"; - } - - string destination = operandParts[0].Trim(); - string source = operandParts[1].Trim(); - string operatorSymbol = GetOperatorForMnemonic(instruction.Type.ToString().ToLower()); - - // Skip register-to-register operations for registers we don't track - if (IsRegister(destination) && IsRegister(source)) - { - return $"{indent}// {instruction}"; - } - - // Translate the operation - return $"{indent}{GetReadableOperand(destination)} {operatorSymbol}= {GetReadableOperand(source)}; // {instruction}"; - } - - /// - /// Translates a CALL instruction - /// - /// The instruction to translate - /// The indentation string - /// The translated code statement - private string TranslateCallInstruction(Instruction instruction, string indent) - { - string target = instruction.StructuredOperands.FirstOrDefault()?.ToString() ?? 
""; - - // Try to get a function name from the target - string functionName = GetFunctionNameFromTarget(target); - - return $"{indent}{functionName}(); // {instruction}"; - } - - /// - /// Translates a RET instruction - /// - /// The instruction to translate - /// The indentation string - /// The translated code statement - private string TranslateReturnInstruction(Instruction instruction, string indent) - { - // Check if EAX is used as a return value - if (_dataFlowAnalysis != null) - { - var eaxVariable = _dataFlowAnalysis.Variables.FirstOrDefault(v => v.Location == "eax" && v.IsReturnValue); - if (eaxVariable != null) - { - return $"{indent}return {eaxVariable.Name}; // {instruction}"; - } - } - - return $"{indent}return; // {instruction}"; - } - - /// - /// Gets the condition from a conditional jump instruction - /// - /// The jump instruction - /// The condition expression - private string GetConditionFromJump(Instruction instruction) - { - string mnemonic = instruction.Type.ToString().ToLower(); - - // Map jump mnemonics to conditions - return mnemonic switch - { - "je" => "a == b", - "jne" => "a != b", - "jz" => "a == 0", - "jnz" => "a != 0", - "jg" => "a > b", - "jge" => "a >= b", - "jl" => "a < b", - "jle" => "a <= b", - "ja" => "a > b (unsigned)", - "jae" => "a >= b (unsigned)", - "jb" => "a < b (unsigned)", - "jbe" => "a <= b (unsigned)", - _ => "condition" - }; - } - - /// - /// Gets the operator for an arithmetic mnemonic - /// - /// The instruction mnemonic - /// The operator - private string GetOperatorForMnemonic(string mnemonic) - { - return mnemonic switch - { - "add" => "+", - "sub" => "-", - "mul" => "*", - "div" => "/", - "and" => "&", - "or" => "|", - "xor" => "^", - _ => mnemonic - }; - } - - /// - /// Gets a readable representation of an operand - /// - /// The operand - /// A readable representation - private string GetReadableOperand(string operand) - { - // If it's a register, return it as is - if (IsRegister(operand)) - { - return 
operand; - } - - // If it's a memory location, get a variable name - if (IsMemoryLocation(operand)) - { - return GetVariableNameForMemory(operand); - } - - // If it's a hexadecimal constant, format it - if (operand.StartsWith("0x") && operand.Length > 2) - { - return operand; - } - - // Otherwise, return it as is - return operand; - } - - /// - /// Gets a variable name for a memory location - /// - /// The memory location - /// A variable name - private string GetVariableNameForMemory(string memoryLocation) - { - if (_dataFlowAnalysis == null) - { - return "memory"; - } - - // Extract the part inside the brackets - int startIndex = memoryLocation.IndexOf('['); - int endIndex = memoryLocation.IndexOf(']'); - - if (startIndex >= 0 && endIndex > startIndex) - { - string memoryReference = memoryLocation.Substring(startIndex + 1, endIndex - startIndex - 1).Trim(); - - // Try to find a variable for this memory location - var variable = _dataFlowAnalysis.Variables.FirstOrDefault(v => v.Location == memoryReference); - if (variable != null) - { - return variable.Name; - } - - // If it's a stack variable (relative to EBP), give it a meaningful name - if (memoryReference.StartsWith("ebp+") || memoryReference.StartsWith("ebp-")) - { - string offset = memoryReference.Substring(4); - return $"local_{offset.Replace("+", "plus_").Replace("-", "minus_")}"; - } - } - - return "memory"; - } - - /// - /// Gets a function name from a call target - /// - /// The call target - /// A function name - private string GetFunctionNameFromTarget(string target) - { - // If it's a direct address, format it - if (target.StartsWith("0x") && target.Length > 2) - { - return $"function_{target.Substring(2)}"; - } - - // If it's a memory location, extract the address - if (IsMemoryLocation(target)) - { - return $"function_ptr_{GetVariableNameForMemory(target)}"; - } - - // Otherwise, use the target as is - return target; - } - - /// - /// Checks if an operand is a register - /// - /// The operand to 
check - /// True if the operand is a register - private bool IsRegister(string operand) - { - string[] registers = { "eax", "ebx", "ecx", "edx", "esi", "edi", "ebp", "esp", - "ax", "bx", "cx", "dx", "si", "di", "bp", "sp", - "al", "ah", "bl", "bh", "cl", "ch", "dl", "dh" }; - - return Array.IndexOf(registers, operand.ToLower()) >= 0; - } - - /// - /// Checks if an operand is a memory location - /// - /// The operand to check - /// True if the operand is a memory location - private bool IsMemoryLocation(string operand) - { - return operand.Contains('[') && operand.Contains(']'); - } -} diff --git a/X86Disassembler/Program.cs b/X86Disassembler/Program.cs index 0dbc5ba..8b534e6 100644 --- a/X86Disassembler/Program.cs +++ b/X86Disassembler/Program.cs @@ -1,10 +1,5 @@ -using System; -using System.IO; -using System.Text; -using System.Collections.Generic; using X86Disassembler.PE; using X86Disassembler.X86; -using X86Disassembler.Decompiler; namespace X86Disassembler; @@ -68,56 +63,102 @@ public class Program var section = codeSections[0]; byte[] codeBytes = peFile.GetSectionData(peFile.SectionHeaders.IndexOf(section)); - Console.WriteLine($"Disassembling section {section.Name} at RVA 0x{section.VirtualAddress:X8}:"); + // First demonstrate sequential disassembly + Console.WriteLine($"Sequential disassembly of section {section.Name} at RVA 0x{section.VirtualAddress:X8}:"); // Create a disassembler for the code section - Disassembler disassembler = new Disassembler(codeBytes, peFile.OptionalHeader.ImageBase + section.VirtualAddress); + // Base address should be the section's virtual address, not the image base + VA + Disassembler disassembler = new Disassembler(codeBytes, section.VirtualAddress); - // Disassemble all instructions - var instructions = disassembler.Disassemble(); - - var unknownIndex = instructions.FindIndex( - x => x.ToString() - .Contains("??") || x.ToString() - .Contains("TODO") - ); - if (unknownIndex != -1) - { - _ = 5; - } + // Disassemble 
sequentially (linear approach) + var linearInstructions = disassembler.Disassemble(); - // Print the first 100 instructions - int count = Math.Min(100, instructions.Count); - for (int i = 0; i < count; i++) + // Print the first 30 instructions from linear disassembly + int linearCount = Math.Min(30, linearInstructions.Count); + for (int i = 0; i < linearCount; i++) { - Console.WriteLine(instructions[i]); + Console.WriteLine(linearInstructions[i]); } // Print a summary of how many more instructions there are - if (instructions.Count > count) + if (linearInstructions.Count > linearCount) { - Console.WriteLine($"... ({instructions.Count - count} more instructions not shown)"); + Console.WriteLine($"... ({linearInstructions.Count - linearCount} more instructions not shown)"); } - // Decompile the instructions - Console.WriteLine("\nDecompiling the first function:\n"); + Console.WriteLine(); + Console.WriteLine("===================================================="); + Console.WriteLine(); - // For demonstration, we'll decompile a small subset of instructions - // In a real scenario, you'd identify function boundaries first - int functionSize = Math.Min(50, instructions.Count); - List functionInstructions = instructions.GetRange(0, functionSize); + // Now demonstrate control flow-based disassembly from entry point + Console.WriteLine($"Control flow-based disassembly starting from entry point 0x{peFile.OptionalHeader.AddressOfEntryPoint:X8}:"); - // Create a decompiler for the function - Decompiler.Decompiler decompiler = new Decompiler.Decompiler( - functionInstructions, - functionInstructions[0].Address - ); - - // Decompile the function - string decompiledCode = decompiler.Decompile(); - - // Print the decompiled code - Console.WriteLine(decompiledCode); + try + { + // Get the entry point RVA from the PE header + uint entryPointRva = peFile.OptionalHeader.AddressOfEntryPoint; + + // Make sure the entry point is within this code section + if (entryPointRva >= 
section.VirtualAddress && + entryPointRva < section.VirtualAddress + section.VirtualSize) + { + // Disassemble starting from the entry point (control flow-based) + var cfgInstructions = disassembler.DisassembleFunction(entryPointRva); + + // Print the instructions from the entry point function + int cfgCount = Math.Min(50, cfgInstructions.Count); + for (int i = 0; i < cfgCount; i++) + { + Console.WriteLine(cfgInstructions[i]); + } + + // Print a summary if there are more instructions + if (cfgInstructions.Count > cfgCount) + { + Console.WriteLine($"... ({cfgInstructions.Count - cfgCount} more instructions in this function not shown)"); + } + + Console.WriteLine(); + Console.WriteLine($"Found {cfgInstructions.Count} instructions following control flow from entry point."); + } + else + { + // Try one of the exported functions instead + Console.WriteLine($"Entry point is not in the {section.Name} section. Trying the first exported function instead..."); + + if (peFile.ExportDirectory != null && peFile.ExportedFunctions.Count > 0) + { + uint functionRva = peFile.ExportedFunctions[0].AddressRva; + Console.WriteLine($"Disassembling exported function at RVA 0x{functionRva:X8} ({peFile.ExportedFunctions[0].Name}):"); + + var cfgInstructions = disassembler.DisassembleFunction(functionRva); + + // Print the instructions from the function + int cfgCount = Math.Min(50, cfgInstructions.Count); + for (int i = 0; i < cfgCount; i++) + { + Console.WriteLine(cfgInstructions[i]); + } + + // Print a summary if there are more instructions + if (cfgInstructions.Count > cfgCount) + { + Console.WriteLine($"... 
({cfgInstructions.Count - cfgCount} more instructions in this function not shown)");
+                        }
+
+                        Console.WriteLine();
+                        Console.WriteLine($"Found {cfgInstructions.Count} instructions following control flow from exported function.");
+                    }
+                    else
+                    {
+                        Console.WriteLine("No exported functions found to disassemble.");
+                    }
+                }
+            }
+            catch (Exception ex)
+            {
+                Console.WriteLine($"Error during control flow disassembly: {ex.Message}");
+            }
         }
 
         // Console.WriteLine("\nPress Enter to exit...");
diff --git a/X86Disassembler/X86/Disassembler.cs b/X86Disassembler/X86/Disassembler.cs
index 0f3e70d..d3ce829 100644
--- a/X86Disassembler/X86/Disassembler.cs
+++ b/X86Disassembler/X86/Disassembler.cs
@@ -64,7 +64,7 @@ public class Disassembler
     }
 
     /// <summary>
-    /// Disassembles the code buffer and returns the disassembled instructions
+    /// Disassembles the code buffer sequentially and returns all disassembled instructions
     /// </summary>
     /// <returns>A list of disassembled instructions</returns>
     public List<Instruction> Disassemble()
@@ -117,4 +117,214 @@ public class Disassembler
 
         return instructions;
     }
+
+    /// <summary>
+    /// Disassembles a function starting from a specific virtual address (RVA) and follows control flow
+    /// </summary>
+    /// <param name="startRva">The relative virtual address to start disassembly from</param>
+    /// <returns>A list of disassembled instructions representing the function</returns>
+    /// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="startRva"/> does not map to an offset inside the code buffer</exception>
+    public List<Instruction> DisassembleFunction(uint startRva)
+    {
+        // _baseAddress is the section's RVA (Program.cs passes section.VirtualAddress to the constructor),
+        // so the offset within the section is the start RVA minus the section's RVA
+        int startOffset = (int)(startRva - _baseAddress);
+
+        // Debug output to verify addresses
+        Console.WriteLine($"Debug: startRva=0x{startRva:X8}, sectionRVA=0x{_baseAddress:X8}, calculated offset=0x{startOffset:X8}");
+
+        // Validate the offset is within bounds
+        if (startOffset < 0 || startOffset >= _length)
+        {
+            throw new ArgumentOutOfRangeException(nameof(startRva),
+                $"Start address 0x{startRva:X8} is outside the bounds of the section at RVA 0x{_baseAddress:X8} with size {_length}");
+        }
+
+        return DisassembleFromOffset(startOffset);
+    }
+
+    /// <summary>
+    /// Disassembles instructions starting from a specific offset using control flow analysis
+    /// </summary>
+    /// <param name="startOffset">The offset in the code buffer to start disassembly from</param>
+    /// <returns>A list of disassembled instructions, sorted by address</returns>
+    private List<Instruction> DisassembleFromOffset(int startOffset)
+    {
+        // Keep track of disassembled instructions
+        List<Instruction> instructions = new List<Instruction>();
+
+        // Track visited addresses to avoid infinite loops
+        HashSet<int> visitedOffsets = new HashSet<int>();
+
+        // Queue of offsets to process
+        Queue<int> offsetQueue = new Queue<int>();
+        offsetQueue.Enqueue(startOffset);
+
+        while (offsetQueue.Count > 0)
+        {
+            int currentOffset = offsetQueue.Dequeue();
+
+            // Skip if we've already processed this offset
+            if (visitedOffsets.Contains(currentOffset))
+            {
+                continue;
+            }
+
+            // Create a new decoder positioned at the current offset
+            InstructionDecoder decoder = new InstructionDecoder(_codeBuffer, _length);
+            decoder.SetPosition(currentOffset);
+
+            // Process instructions at this address until we hit a control flow change
+            while (decoder.CanReadByte() && decoder.GetPosition() < _length)
+            {
+                int positionBeforeDecode = decoder.GetPosition();
+
+                // Stop when this path runs into code another path already decoded;
+                // otherwise the same instructions would be added to the result twice
+                if (!visitedOffsets.Add(positionBeforeDecode))
+                {
+                    break;
+                }
+
+                // Decode the instruction
+                Instruction? instruction = decoder.DecodeInstruction();
+                if (instruction == null)
+                {
+                    // Invalid instruction, skip to next byte
+                    decoder.SetPosition(positionBeforeDecode + 1);
+                    continue;
+                }
+
+                // The decoder now sits on the next instruction, so the distance moved is this instruction's length
+                int instructionLength = decoder.GetPosition() - positionBeforeDecode;
+
+                // Set the instruction address
+                instruction.Address = _baseAddress + (uint)positionBeforeDecode;
+
+                // Add the instruction to our list
+                instructions.Add(instruction);
+
+                // Check for control flow instructions
+                if (IsReturnInstruction(instruction))
+                {
+                    // End of function, don't follow any further from this branch
+                    break;
+                }
+                else if (IsUnconditionalJump(instruction))
+                {
+                    // Follow the unconditional jump target
+                    int? targetOffset = GetJumpTargetOffset(instruction, positionBeforeDecode, instructionLength);
+                    if (targetOffset.HasValue && targetOffset.Value >= 0 && targetOffset.Value < _length)
+                    {
+                        offsetQueue.Enqueue(targetOffset.Value);
+                    }
+
+                    // End this branch of execution
+                    break;
+                }
+                else if (IsConditionalJump(instruction))
+                {
+                    // Follow both paths for conditional jumps (target and fall-through)
+                    int? targetOffset = GetJumpTargetOffset(instruction, positionBeforeDecode, instructionLength);
+                    if (targetOffset.HasValue && targetOffset.Value >= 0 && targetOffset.Value < _length)
+                    {
+                        offsetQueue.Enqueue(targetOffset.Value);
+                    }
+
+                    // Continue with fall-through path in this loop
+                }
+                else if (IsCallInstruction(instruction))
+                {
+                    // For calls, we just continue with the next instruction (we don't follow the call)
+                    // We could add separate functionality to follow calls if needed
+                }
+            }
+        }
+
+        // Sort instructions by address for readability
+        instructions.Sort((a, b) => a.Address.CompareTo(b.Address));
+
+        return instructions;
+    }
+
+    /// <summary>
+    /// Checks if an instruction is a return instruction
+    /// </summary>
+    private bool IsReturnInstruction(Instruction instruction)
+    {
+        return instruction.Type == InstructionType.Ret ||
+               instruction.Type == InstructionType.Retf;
+    }
+
+    /// <summary>
+    /// Checks if an instruction is an unconditional jump
+    /// </summary>
+    private bool IsUnconditionalJump(Instruction instruction)
+    {
+        return instruction.Type == InstructionType.Jmp;
+    }
+
+    /// <summary>
+    /// Checks if an instruction is a conditional jump
+    /// </summary>
+    private bool IsConditionalJump(Instruction instruction)
+    {
+        return instruction.Type == InstructionType.Je ||
+               instruction.Type == InstructionType.Jne ||
+               instruction.Type == InstructionType.Ja ||
+               instruction.Type == InstructionType.Jae ||
+               instruction.Type == InstructionType.Jb ||
+               instruction.Type == InstructionType.Jbe ||
+               instruction.Type == InstructionType.Jg ||
+               instruction.Type == InstructionType.Jge ||
+               instruction.Type == InstructionType.Jl ||
+               instruction.Type == InstructionType.Jle ||
+               instruction.Type == InstructionType.Jo ||
+               instruction.Type == InstructionType.Jno ||
+               instruction.Type == InstructionType.Jp ||
+               instruction.Type == InstructionType.Jnp ||
+               instruction.Type == InstructionType.Js ||
+               instruction.Type == InstructionType.Jns ||
+               instruction.Type == InstructionType.Jcxz;
+    }
+
+    /// <summary>
+    /// Checks if an instruction is a call instruction
+    /// </summary>
+    private bool IsCallInstruction(Instruction instruction)
+    {
+        return instruction.Type == InstructionType.Call;
+    }
+
+    /// <summary>
+    /// Gets the jump target offset from a jump instruction
+    /// </summary>
+    /// <param name="instruction">The decoded jump instruction</param>
+    /// <param name="instructionOffset">Offset of the instruction's first byte in the code buffer</param>
+    /// <param name="instructionLength">Encoded length of the instruction in bytes</param>
+    /// <returns>The target offset in the code buffer, or null when the first operand is not an immediate</returns>
+    private int? GetJumpTargetOffset(Instruction instruction, int instructionOffset, int instructionLength)
+    {
+        // Check if the instruction has at least one operand
+        if (instruction.StructuredOperands == null || instruction.StructuredOperands.Count == 0)
+        {
+            return null;
+        }
+
+        // Look for an immediate operand which represents the offset
+        var operand = instruction.StructuredOperands[0];
+        if (operand is ImmediateOperand immediateOperand)
+        {
+            // For relative jumps, the target is IP (instruction pointer) + instruction length + offset.
+            // The length has to come from the decoder: instruction.Address - _baseAddress is just
+            // instructionOffset again, so deriving it from the address always produced 1.
+            // NOTE(review): assumes ImmediateOperand.Value is the signed displacement, not a resolved target - confirm
+            int jumpOffset = Convert.ToInt32(immediateOperand.Value);
+
+            return instructionOffset + instructionLength + jumpOffset;
+        }
+
+        // For now, we don't handle indirect jumps like JMP [eax] or JMP [ebx+4]
+        return null;
+    }
 }
\ No newline at end of file