公众号：https://mp.weixin.qq.com/s/dMkrp5Wi8VDIa7b57I8ikw

或许我们的公众号会有更多你感兴趣的内容

花指令（junk code）是一种专门用来迷惑反编译器的指令片段，这些指令片段不会影响程序的原有功能，但会使得反汇编器的结果出现偏差，从而使破解者分析失败。比较经典的花指令技巧有利用 jmp 、call、ret 指令改变执行流，从而使得反汇编器解析出与运行时不相符的错误代码。

这里使用这个网站进行汇编到机器码的快速查询

https://defuse.ca/online-x86-assembler.htm#disassembly

如何添加花指令

要知道如何去除，首先就要想到如何添加

1. x32 手动添加

从简单的开始，从x86 （32位）开始，因为在windows上还支持在32位中进行汇编内联

这里先用个简单的吧

jmp $+2
#0:  eb 00                   jmp    2 <_main+0x2>

这条汇编长度是2，跳转到当前地址+2的地方，也就是说没有执行任何操作

但是 MSVC 的内联汇编不能这样写，需要借助标号改写成下面的形式：

// MSVC inline-assembly form of the junk jump: the jmp targets the label that
// immediately follows it, so control simply falls through to the next statement.
_asm {
	jmp next1;
next1:
}

完整代码如下

#include <iostream>

// Checks a NUL-terminated password by folding its characters into a rolling
// product and comparing the result with the constant 0x1234567.
//
// The inline-assembly blocks are junk code: each one jumps to the label that
// directly follows it and therefore has no effect on the computation.
//
// passwd: NUL-terminated input string.
// returns: true when the folded value equals 0x1234567.
bool verify(char* passwd) {
	// The accumulator is multiplicative, so it must start at 1; starting at 0
	// would keep it at 0 forever and the check could never succeed.
	size_t sum = 1;
	_asm {
		jmp next1;
	next1:
	}
	for (size_t i = 0; passwd[i]; i++)
	{
		_asm {
			jmp next2;
		next2:
		}
		sum *= (sum + passwd[i]) % 13 + 1;
		_asm {
			jmp next3;
		next3:
		}
	}
	return sum == 0x1234567;
}

// Reads a password from standard input and prints whether verify() accepts it.
// The inline-assembly block is a junk jump to the label right after it.
int main()
{
	char buffer[0x10] = {};
	_asm {
		jmp next;
		next:
	}
	std::cout << "input password: ";
	scanf_s("%s", buffer, 0x10);

	const bool accepted = verify(buffer);
	std::cout << (accepted ? "OK!\n" : "NO!\n");

}

稍微进阶一点点的就是使用宏定义

#include <iostream>
#include <stdio.h>

// Junk-instruction building blocks (MSVC x86 inline assembly).
// JUNK1 / JUNK4: register-to-itself moves.
#define JUNK1 __asm { mov edx, edx }
// JUNK2: push eax followed immediately by pop eax.
#define JUNK2 __asm { push eax } __asm { pop eax }
// JUNK3: zeroes eax, then adds 1 and subtracts 1 again.
// NOTE(review): unlike the other snippets this one overwrites eax — confirm the
// compiler accounts for the register being modified inside the __asm block.
#define JUNK3 __asm { xor eax, eax } __asm { add eax, 1 } __asm { sub eax, 1 }
#define JUNK4 __asm { mov eax, eax }

// Two-level token pasting so that the arguments are macro-expanded before ## is applied.
#define CONCAT(a, b) a##b
#define CONCAT_EXPAND(a, b) CONCAT(a, b)

// JUNK5: a jmp to a label named skip<x> that is defined right after the jump.
// The label name is made unique per use by pasting in the value of __COUNTER__.
#define JUNK5_IMPL(x) __asm { jmp CONCAT_EXPAND(skip, x) } CONCAT_EXPAND(skip, x) :
#define JUNK5 JUNK5_IMPL(__COUNTER__)

// Dispatch table: JUNK_<n> selects one of the five snippets above.
// Only indices 0..9 are defined here.
#define JUNK_0 JUNK1
#define JUNK_1 JUNK2
#define JUNK_2 JUNK3
#define JUNK_3 JUNK4
#define JUNK_4 JUNK5
#define JUNK_5 JUNK1
#define JUNK_6 JUNK2
#define JUNK_7 JUNK3
#define JUNK_8 JUNK4
#define JUNK_9 JUNK5

// JUNKFUNC() expands to JUNK_<value of __COUNTER__>, so each use site picks a
// snippet based on the counter value at that point in the translation unit.
// NOTE(review): JUNK5 also consumes a __COUNTER__ value, and no JUNK_<n> exists
// beyond 9 — presumably a use whose counter value exceeds 9 fails to compile.
#define JUNK_EXPAND(x) JUNK_EXPAND2(x)
#define JUNK_EXPAND2(x) JUNK_##x
#define JUNKFUNC() JUNK_EXPAND(__COUNTER__)

// Folds the characters of a NUL-terminated password into a rolling product and
// reports whether the result equals 0x1234567. JUNKFUNC() calls are junk-code
// insertion points and do not take part in the computation.
bool verify(char* passwd) {
    size_t acc = 1;

    JUNKFUNC();

    size_t idx = 0;
    while (passwd[idx])
    {
        JUNKFUNC();
        acc *= (acc + passwd[idx]) % 13 + 1;
        JUNKFUNC();
        idx++;
    }

    JUNKFUNC();

    const bool matched = (acc == 0x1234567);
    return matched;
}

// Prompts for a password (at most 15 characters are stored), then prints
// "OK!" or "NO!" depending on verify(). JUNKFUNC() calls are junk-code
// insertion points.
int main()
{
    char buffer[0x10] = {};

    JUNKFUNC();

    std::cout << "input password: ";
    scanf_s("%15s", buffer, (unsigned)_countof(buffer));

    JUNKFUNC();

    const bool accepted = verify(buffer);
    std::cout << (accepted ? "OK!\n" : "NO!\n");

    JUNKFUNC();
}

2. x32 简易shellcode混淆

这个方法和x64是一致的，我个人使用hde(Hacker Disassembler Engine 32)这个反汇编器做shellcode的混淆，你可以在minhook下或者github保存起来的仓库找到这个项目

首先是找到所有相对跳转的汇编指令，然后标记他们的起点和目的绝对跳转地址

#include "include/jumper.hpp"

// Returns true for the one-byte short conditional jump opcodes 0x70..0x7F.
bool is_jcc(uint8_t opcode)
{
    // All sixteen short Jcc opcodes share the high nibble 0x7.
    return (opcode & 0xF0) == 0x70;
}

// Returns true when the second opcode byte of a 0F-prefixed instruction is in
// 0x80..0x8F, i.e. the near conditional jump forms (0F 8x).
bool is_jcc_0f(uint8_t opcode2)
{
    // All sixteen 0F 8x opcodes share the high nibble 0x8.
    return (opcode2 & 0xF0) == 0x80;
}


// Linearly disassembles `shellcode` with hde32 and collects every control-flow
// transfer it finds.
//
// Recognised instructions:
//   - E8                 relative CALL                 -> JT_CALL
//   - E9 / EB            relative JMP                  -> JT_JMP
//   - 70..7F, 0F 80..8F  conditional jumps             -> JT_JCC
//   - E0..E3             LOOPNE / LOOPE / LOOP / JECXZ -> JT_JCC
//   - FF /2, FF /4       indirect CALL / JMP r/m32     -> JT_ABSOLUTE
//
// shellcode: buffer holding 32-bit x86 code.
// size:      number of valid bytes in `shellcode`.
// returns:   one JumpInfo {from, to, type} per recognised instruction, where
//            `from` is the instruction's offset and `to` is the target offset
//            relative to the start of the buffer (0 for indirect transfers,
//            whose target is not known statically).
//
// Scanning stops at the first byte sequence that fails to decode or that would
// extend past `size`.
std::vector<JumpInfo> find_jumps(unsigned char* shellcode, size_t size)
{
    std::vector<JumpInfo> result;

    if (shellcode == nullptr || size == 0)
        return result;

    // The decoder has no length parameter, so a truncated final instruction
    // would make it read past the caller's buffer. Decode from a zero-padded
    // copy instead (15 bytes is the architectural maximum instruction length).
    const size_t kMaxInstrLen = 15;
    std::vector<uint8_t> padded(shellcode, shellcode + size);
    padded.resize(size + kMaxInstrLen, 0);

    size_t offset = 0;

    while (offset < size)
    {
        hde32s hs;
        uint32_t len = hde32_disasm(padded.data() + offset, &hs);

        // Stop on decode errors and on instructions that spill into the padding.
        if ((hs.flags & F_ERROR) || len == 0 || len > size - offset)
            break;

        uint8_t op = hs.opcode;
        JumpType type = JT_NONE;
        bool isRelative = false;

        if (op == 0xE8) // CALL rel
        {
            type = JT_CALL;
            isRelative = true;
        }
        else if (op == 0xE9 || op == 0xEB) // JMP rel32 / rel8
        {
            type = JT_JMP;
            isRelative = true;
        }
        else if (is_jcc(op)) // Jcc rel8
        {
            type = JT_JCC;
            isRelative = true;
        }
        else if (op == 0x0F && is_jcc_0f(hs.opcode2)) // Jcc rel32 (0F 8x)
        {
            type = JT_JCC;
            isRelative = true;
        }
        else if (op >= 0xE0 && op <= 0xE3) // LOOPNE / LOOPE / LOOP / JECXZ rel8
        {
            type = JT_JCC;
            isRelative = true;
        }
        else if (op == 0xFF) // FF /2 (call r/m32) and FF /4 (jmp r/m32)
        {
            uint8_t reg = (hs.modrm >> 3) & 7;
            if (reg == 2 || reg == 4)
                type = JT_ABSOLUTE; // indirect transfer, target unknown statically
        }

        if (type != JT_NONE)
        {
            uint32_t from = (uint32_t)offset;
            uint32_t to = 0;

            if (isRelative)
            {
                // Sign-extend the displacement according to its encoded width.
                int32_t rel = 0;
                if (hs.flags & F_IMM8)
                    rel = (int8_t)hs.imm.imm8;
                else if (hs.flags & F_IMM16)
                    rel = (int16_t)hs.imm.imm16;
                else if (hs.flags & F_IMM32)
                    rel = (int32_t)hs.imm.imm32;

                // Relative targets are measured from the end of the instruction.
                to = (uint32_t)(offset + len + rel);
            }

            result.push_back({ from, to, type });
        }

        offset += len;
    }

    return result;
}

然后就是解析长度后随机位置插入花指令然后修复相对位置的偏移

std::vector<uint8_t> remapper(unsigned char* shellcode, size_t size) {
    std::vector<Instruction> instructions;

    size_t offset = 0;

    while (offset < size)
    {
        hde32s hs;
        uint32_t len = hde32_disasm(shellcode + offset, &hs);

        if (hs.flags & F_ERROR || len == 0)
            break;

        Instruction inst;
        inst.old_offset = offset;
        inst.len = len;
        inst.bytes.assign(shellcode + offset, shellcode + offset + len);
        inst.hs = hs;

        instructions.push_back(inst);

        offset += len;
    }

    std::vector<uint8_t> new_code;
    std::unordered_map<uint32_t, uint32_t> offset_map;

    for (auto& inst : instructions)
    {
        // 记录映射
        offset_map[inst.old_offset] = new_code.size();

        // 写入原指令
        new_code.insert(new_code.end(), inst.bytes.begin(), inst.bytes.end());

        // 随机插入 junk
        if (rand() % 5 == 0)
        {
            std::vector<uint8_t> junk = {
                0x90,       // nop
                0x50, 0x58  // push eax; pop eax
            };

            new_code.insert(new_code.end(), junk.begin(), junk.end());
        }
        else if (rand() % 3 == 0)
        {
            std::vector<uint8_t> junk = { 0x89, 0xD2 };//mov edx, edx

            new_code.insert(new_code.end(), junk.begin(), junk.end());
        }
        else if (rand() % 6 == 0)
        {
            std::vector<uint8_t> junk = {
                0x83, 0xC0, 0x01, // add    eax,0x1
                0x83, 0xE8, 0x01 //  sub    eax,0x1
            };

            new_code.insert(new_code.end(), junk.begin(), junk.end());
        }
    }

    for (auto& inst : instructions)
    {
        auto& hs = inst.hs;

        // 只处理相对跳转
        if (!(hs.flags & F_RELATIVE))
            continue;

        uint32_t old_from = inst.old_offset;
        uint32_t new_from = offset_map[old_from];

        int32_t rel = 0;

        if (hs.flags & F_IMM8)
            rel = (int8_t)hs.imm.imm8;
        else if (hs.flags & F_IMM32)
            rel = (int32_t)hs.imm.imm32;

        uint32_t old_target = old_from + inst.len + rel;

        // 找新地址
        if (offset_map.find(old_target) == offset_map.end())
            continue; // 跳到外部，跳过

        uint32_t new_target = offset_map[old_target];

        // 计算新的相对偏移
        int32_t new_rel = (int32_t)(new_target - (new_from + inst.len));

        // 写回 new_code
        uint32_t write_pos = new_from + inst.len;

        if (hs.flags & F_IMM8)
            *(int8_t*)(&new_code[write_pos - 1]) = (int8_t)new_rel;
        else if (hs.flags & F_IMM32)
            *(int32_t*)(&new_code[write_pos - 4]) = new_rel;
    }
    return new_code;
}

进行测试

#include <iostream>
#include "include/jumper.hpp"
int main()
{
    unsigned char shellcode[] =
        "\xFC\x33\xD2\xB2\x30\x64\xFF\x32\x5A\x8B"
        "\x52\x0C\x8B\x52\x14\x8B\x72\x28\x33\xC9"
        "\xB1\x18\x33\xFF\x33\xC0\xAC\x3C\x61\x7C"
        "\x02\x2C\x20\xC1\xCF\x0D\x03\xF8\xE2\xF0"
        "\x81\xFF\x5B\xBC\x4A\x6A\x8B\x5A\x10\x8B"
        "\x12\x75\xDA\x8B\x53\x3C\x03\xD3\xFF\x72"
        "\x34\x8B\x52\x78\x03\xD3\x8B\x72\x20\x03"
        "\xF3\x33\xC9\x41\xAD\x03\xC3\x81\x38\x47"
        "\x65\x74\x50\x75\xF4\x81\x78\x04\x72\x6F"
        "\x63\x41\x75\xEB\x81\x78\x08\x64\x64\x72"
        "\x65\x75\xE2\x49\x8B\x72\x24\x03\xF3\x66"
        "\x8B\x0C\x4E\x8B\x72\x1C\x03\xF3\x8B\x14"
        "\x8E\x03\xD3\x52\x33\xFF\x57\x68\x61\x72"
        "\x79\x41\x68\x4C\x69\x62\x72\x68\x4C\x6F"
        "\x61\x64\x54\x53\xFF\xD2\x68\x33\x32\x01"
        "\x01\x66\x89\x7C\x24\x02\x68\x75\x73\x65"
        "\x72\x54\xFF\xD0\x68\x6F\x78\x41\x01\x8B"
        "\xDF\x88\x5C\x24\x03\x68\x61\x67\x65\x42"
        "\x68\x4D\x65\x73\x73\x54\x50\xFF\x54\x24"
        "\x2C\x57\x68\x4F\x5F\x6F\x21\x8B\xDC\x57"
        "\x53\x53\x57\xFF\xD0\x68\x65\x73\x73\x01"
        "\x8B\xDF\x88\x5C\x24\x03\x68\x50\x72\x6F"
        "\x63\x68\x45\x78\x69\x74\x54\xFF\x74\x24"
        "\x40\xFF\x54\x24\x40\x57\xFF\xD0";
    auto jumps = find_jumps(shellcode, sizeof(shellcode));

    for (auto& j : jumps)
    {
        std::cout << "from: 0x" << std::hex << j.from
            << " -> to: 0x" << j.to
            << " type: " << j.type << std::endl;
    }
    std::vector<uint8_t> newcode = remapper(shellcode, sizeof(shellcode));
    jumps = find_jumps(&newcode[0], newcode.size());

    for (auto& j : jumps)
    {
        std::cout << "[new]from: 0x" << std::hex << j.from
            << " -> to: 0x" << j.to
            << " type: " << j.type << std::endl;
    }
    PVOID lpAddr = VirtualAlloc(nullptr, newcode.size(), MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
    RtlCopyMemory(lpAddr, &newcode[0], newcode.size());
    HANDLE hThread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)lpAddr, NULL, 0, NULL);
    WaitForSingleObject(hThread, -1);

    return 0;
}

在尝试对 CobaltStrike 的 shellcode 进行混淆时，很明显是不能成功的，主要原因是 cs 的 shellcode 包含 SMC 自解码、大量的跳转和自定义的 hash API 等。具体的 shellcode 分析可以见【免杀】Cobaltstrike Stager Payload分析

3. x64 手动添加

相较于 x32，x64 更加困难的地方在于无法使用内联汇编，所以尝试将核心代码使用汇编编写，直接在汇编中添加花指令，然后在主程序中调用。

我主要使用的是cmake进行编写，在CMakeLists.txt中

# Enable MASM alongside C++ so that .asm sources can be assembled.
project(jumper LANGUAGES CXX ASM_MASM)
# ...
# Assemble the routine into an object library.
add_library(cr4 OBJECT src/cr4.asm)
# Finally, add the resulting object files to the link step.
target_link_libraries(${PROJECT_NAME} PRIVATE hde $<TARGET_OBJECTS:cr4>)

在Visual Studio的方法可以参考syswhispers的方法

OPTION CASEMAP:NONE
.code

PUBLIC CR4Enc

; void CR4Enc(uint8_t* plain, uint8_t* cipher, uint64_t size, uint64_t key)

; CR4Enc: transforms `size` bytes from `plain` into `cipher`, one byte at a time.
; Per byte: XOR with the low byte of the key, rotate left by 3, add 55h; the
; key is then rotated right by 1 and increased by 1337h before the next byte.
; Several junk sequences are interleaved to disturb static disassembly.
CR4Enc PROC
    ; RCX = plain
    ; RDX = cipher
    ; R8  = size
    ; R9  = key

    push rbx
    push rsi
    push rdi

    ; junk: meaningless register save/restore
    push rax
    pop rax

    mov rsi, rcx        ; plain
    
    mov rdi, rdx        ; cipher
    mov rcx, r8         ; loop counter
    mov rbx, r9         ; key
    
; junk: transfer control with push/ret to break IDA's analysis.
; r10 = address of ret_junk + (5653D986h xor 5653D99Ch) = ret_junk + 1Ah.
; NOTE(review): 1Ah is presumably the encoded length of this gadget, so that
; execution resumes at `test rcx, rcx` - confirm against the assembled bytes.
ret_junk:
    lea r10, ret_junk
    mov r11d, 5653D986h
    xor r11d, 5653d99Ch
    add r10, r11
    push r10
    ret

    test rcx, rcx
    jz done

loop_start:
    mov al, byte ptr [rsi]

    ; --- simple CR4Enc-like obfuscation ---
    xor al, bl          ; XOR with the low byte of the key
    rol al, 3           ; rotate left by 3 bits
    add al, 55h        ; add a constant perturbation

    mov byte ptr [rdi], al

    ; junk: jump to the very next instruction, then XOR rax with the same
    ; constant twice (al has already been stored above)
    jmp short skip_junk
skip_junk:
    xor rax, 0DEADBEEFh
    xor rax, 0DEADBEEFh

    ; key evolution (similar to a stream cipher)
    ror rbx, 1
    add rbx, 1337h
    
    ; junk: both the jz and the jnz target the same label that follows them
    jz junk
    jnz junk
junk:

    inc rsi
    inc rdi
    dec rcx
    jnz loop_start

done:
    pop rdi
    pop rsi
    pop rbx
    ret

CR4Enc ENDP

END

#include <iostream>

extern "C" void CR4Enc(uint8_t* plain, uint8_t* cipher, uint64_t size, uint64_t key);

// Encrypts the bytes of "HelloWorld" (without the terminating NUL) with CR4Enc
// and prints the result as space-separated upper-case hex bytes.
int main()
{
    uint8_t data[] = "HelloWorld";
    uint8_t out[sizeof(data)] = { 0 };

    // Payload length excludes the string literal's trailing NUL.
    const size_t length = sizeof(data) - 1;

    CR4Enc(data, out, length, 0x12345678);

    // size_t index avoids the signed/unsigned comparison against `length`.
    for (size_t i = 0; i < length; i++)
        printf("%02X ", out[i]);

    return 0;
}

现在就可以在汇编中手动添加花指令了，具体方法和x32手动添加类似

4. 编译中添加

笔者之所以想起这个方法是突然回忆起了AFL-fuzz的插桩方法，这里简单提及

所以，AFL的代码插桩，就是在将源文件编译为汇编代码后，通过afl-as完成。开始重写汇编指令，准备在分支处插入代码

https://joe1sn.eu.org/2023/07/22/afl-source/

/* printf-style template of the 32-bit assembly snippet (AT&T syntax) that is
   inserted as instrumentation. The emitted code reserves 16 bytes of stack,
   saves edi/edx/ecx/eax there, loads the value substituted for the single
   0x%08x placeholder into ecx, calls __afl_maybe_log, then restores the four
   registers and releases the stack space. Percent signs of register names are
   doubled because the string is a format string. */
static const u8* trampoline_fmt_32 =
  "\n"
  "/* --- AFL TRAMPOLINE (32-BIT) --- */\n"
  "\n"
  ".align 4\n"
  "\n"
  "leal -16(%%esp), %%esp\n"
  "movl %%edi,  0(%%esp)\n"
  "movl %%edx,  4(%%esp)\n"
  "movl %%ecx,  8(%%esp)\n"
  "movl %%eax, 12(%%esp)\n"
  "movl $0x%08x, %%ecx\n"
  "call __afl_maybe_log\n"
  "movl 12(%%esp), %%eax\n"
  "movl  8(%%esp), %%ecx\n"
  "movl  4(%%esp), %%edx\n"
  "movl  0(%%esp), %%edi\n"
  "leal 16(%%esp), %%esp\n"
  "\n"
  "/* --- END --- */\n"
  "\n";

AFL 相当于魔改了编译器，这种方式更可能贴近 OLLVM 这种爆改编译器的做法，所以这里也只是提一嘴。

使用IDA去除花指令

1. 手动patch

就是识别到花指令，然后直接修改汇编代码，没什么好说的，例如这里的

直接jmp，我这里是通过修改byte而不是assembly得到的

2. 使用IDC脚本自动清除

依旧使用之前的x64 手动添加的例子，我们可以编写idc脚本进行去除

主要思想就是找到特征码，然后清除

https://docs.hex-rays.com/9.1/developer-guide/idc

#include<idc.idc>
// Scans [0x1400015E0, 0x14000161D) for the byte pattern 4C 8D 15 (the start of
// the fake-return gadget) and overwrites it with EB 18 90, i.e. a short jump
// over the gadget followed by a nop, then asks IDA to decode code at that address.
static main()
{
    auto BeginVa, EndVa, Cur;
    BeginVa = 0x1400015E0;
    EndVa = 0x14000161D;
    for (Cur = BeginVa; Cur < EndVa; Cur++) {
        if (Byte(Cur) != 0x4C || Byte(Cur+1) != 0x8D || Byte(Cur+2) != 0x15)
        {
            continue;
        }
        PatchByte(Cur, 0xEB);
        PatchByte(Cur+1, 0x18);
        PatchByte(Cur+2, 0x90);
        MakeCode(Cur);
        Message("Find Fakereturn Opcode!!\n");
    }
    Message("Clear Fakereturn Opcode Ok\n");
}