From 76adc9174f161d39c93734dad9669ee0c1ebc6d7 Mon Sep 17 00:00:00 2001 From: Jason White Date: Tue, 1 Nov 2022 10:11:35 -0700 Subject: [PATCH] Fix initial syscall injection Summary: This fixes the initial syscall injection on aarch64. This also does slight simplification of the instructions that get plopped down at the current instruction pointer so that we can fit the instructions into a single word. That is, instead of injecting `INT3; SYSCALL; INT3`, we inject `SYSCALL; INT3`. Reviewed By: VladimirMakaev Differential Revision: D40867427 fbshipit-source-id: 2afa96f2270e16284523a17b09da00529893c20e --- reverie-process/src/seccomp/mod.rs | 2 +- reverie-ptrace/src/cp/consts.rs | 21 +++-- reverie-ptrace/src/cp/mmap.rs | 51 +++++++----- reverie-ptrace/src/task.rs | 128 ++++++++++++++++++----------- reverie-ptrace/src/tracer.rs | 6 +- 5 files changed, 126 insertions(+), 82 deletions(-) diff --git a/reverie-process/src/seccomp/mod.rs b/reverie-process/src/seccomp/mod.rs index fe3582f..22b0c4d 100644 --- a/reverie-process/src/seccomp/mod.rs +++ b/reverie-process/src/seccomp/mod.rs @@ -82,7 +82,7 @@ use syscalls::Errno; use syscalls::Sysno; /// Builder for creating seccomp filters. -#[derive(Clone)] +#[derive(Debug, Clone)] pub struct FilterBuilder { /// The target architecture. target_arch: TargetArch, diff --git a/reverie-ptrace/src/cp/consts.rs b/reverie-ptrace/src/cp/consts.rs index fef9538..65709f5 100644 --- a/reverie-ptrace/src/cp/consts.rs +++ b/reverie-ptrace/src/cp/consts.rs @@ -7,20 +7,27 @@ */ /// A page that is reserved by Reverie in every guest process. 
-pub const PRIVATE_PAGE_OFFSET: u64 = 0x7000_0000; +pub const PRIVATE_PAGE_OFFSET: usize = 0x7000_0000; /// trampoline data from private pages -pub const TRAMPOLINE_BASE: u64 = PRIVATE_PAGE_OFFSET; +pub const TRAMPOLINE_BASE: usize = PRIVATE_PAGE_OFFSET; pub const TRAMPOLINE_SIZE: usize = 0x1000; /// total private page size pub const PRIVATE_PAGE_SIZE: usize = TRAMPOLINE_SIZE; -/// The size of a breakpoint instruction. On x86_64, this is just 0xcc, which is -/// one byte. +/// The size of the `ud2` instruction on x86_64. #[cfg(target_arch = "x86_64")] -pub const BREAKPOINT_SIZE: usize = 1; +pub const UD_INSTR_SIZE: usize = 1; -/// The size of a breakpoint instruction. On aarch64, this is 4 bytes. +/// The size of the `udf` instruction on aarch64. #[cfg(target_arch = "aarch64")] -pub const BREAKPOINT_SIZE: usize = 4; +pub const UD_INSTR_SIZE: usize = 4; + +/// The size of the syscall instruction. On x86_64, this is 2 bytes. +#[cfg(target_arch = "x86_64")] +pub const SYSCALL_INSTR_SIZE: usize = 2; + +/// The size of the syscall instruction. On aarch64, this is 4 bytes. +#[cfg(target_arch = "aarch64")] +pub const SYSCALL_INSTR_SIZE: usize = 4; diff --git a/reverie-ptrace/src/cp/mmap.rs b/reverie-ptrace/src/cp/mmap.rs index ff3083f..d4ad602 100644 --- a/reverie-ptrace/src/cp/mmap.rs +++ b/reverie-ptrace/src/cp/mmap.rs @@ -19,40 +19,47 @@ use super::consts::*; /// the byte code can be confirmed by running objcopy /// x86_64-linux-gnu-objcopy -I binary /tmp/1.bin -O elf64-x86-64 -B i386:x86-64 /tmp/1.elf /// then objdump -d 1.elf must match the instructions listed below. 
-pub fn populate_mmap_page(pid: Pid, page_address: u64) -> nix::Result<()> { - /* For x86_64 architecture, we use the following syscall sequences: - * 0: 0f 05 syscall // untraced syscall - * 2: 0f 0b ud2 - * 4: 0f 05 syscall // traced syscall - * 6: 0f 0b ud2 - */ +pub fn populate_mmap_page(pid: Pid, page_address: usize) -> nix::Result<()> { #[cfg(target_arch = "x86_64")] - let mut syscall_stubs: Vec<u8> = vec![0x0f, 0x05, 0x0f, 0x0b, 0x0f, 0x05, 0x0f, 0x0b]; - /* For aarch64 architecture, we use the following syscall sequences: - * 0: d4 00 00 01 svc 0 // Equivalent to the syscall instruction - * 2: 00 00 de ad udf #0xdead // Equivalent to the ud2 instruction - * 4: d4 00 00 01 svc 0 - * 6: 00 00 de ad udf #0xdead - */ + let mut syscall_stubs: Vec<u8> = vec![ + 0x0f, 0x05, // syscall (untraced) + 0x0f, 0x0b, // ud2 + 0x0f, 0x05, // syscall (traced) + 0x0f, 0x0b, // ud2 + ]; + #[cfg(target_arch = "aarch64")] let mut syscall_stubs: Vec<u8> = vec![ - 0xd4, 0x00, 0x00, 0x01, 0x00, 0x00, 0xde, 0xad, 0xd4, 0x00, 0x00, 0x01, 0x00, 0x00, 0xde, - 0xad, + 0x01, 0x00, 0x00, 0xd4, // svc 0 (untraced syscall) + 0xad, 0xde, 0x00, 0x00, // udf 0xdead + 0x01, 0x00, 0x00, 0xd4, // svc 0 (traced syscall) + 0xad, 0xde, 0x00, 0x00, // udf 0xdead + ]; - // Fill syscall_stubs with a software interrupt (or debug breakpoint) instruction until it reaches the trampoline size + + // Fill syscall_stubs with a software interrupt (or debug breakpoint) + // instruction until it reaches the trampoline size. If things are working + // correctly, we will never execute beyond our syscall stub, so this is more + // of a safeguard to make debugging easier if things go horribly wrong. + + // On x86_64, the opcode for the int3 (breakpoint) instruction is 0xcc. 
#[cfg(target_arch = "x86_64")] - const SOFTWARE_INTERUPT: u8 = 0xcc; // int3 instruction opcode for x86_64 + const SOFTWARE_INTERUPT: u8 = 0xcc; + + // For aarch64, we should use BRK 1 but as it is not a single-byte + // instruction, we'll use a sequence of 0x00 (same as a sequence of udf #0x0 + // instructions) #[cfg(target_arch = "aarch64")] - const SOFTWARE_INTERUPT: u8 = 0x00; // For aarch64, we should use BRK 1 but as it is not a single-byte instruction, we'll use a sequence of 0x00 (same as a sequence of udf #0x0 instructions) + const SOFTWARE_INTERUPT: u8 = 0x00; + syscall_stubs.resize_with(TRAMPOLINE_SIZE, || SOFTWARE_INTERUPT); let local_iov = &[IoSlice::new(syscall_stubs.as_slice())]; let remote_iov = &[RemoteIoVec { - base: page_address as usize, + base: page_address, len: TRAMPOLINE_SIZE, }]; - // initialize the whole page with int3 to prevent unintended - // execution in our injected page. + // Initialize the whole page with int3 to prevent unintended execution in + // our injected page. uio::process_vm_writev(pid, local_iov, remote_iov)?; Ok(()) } diff --git a/reverie-ptrace/src/task.rs b/reverie-ptrace/src/task.rs index 4da344a..8e4f1de 100644 --- a/reverie-ptrace/src/task.rs +++ b/reverie-ptrace/src/task.rs @@ -26,7 +26,6 @@ use futures::future::Either; use futures::future::Future; use futures::future::FutureExt; use futures::future::TryFutureExt; -use libc::user_regs_struct; use nix::sys::mman::ProtFlags; use nix::sys::signal::Signal; use reverie::syscalls::Addr; @@ -642,20 +641,24 @@ impl TracedTask { /// Postcondition: the guest registers and code memory are restored to their original state, /// including RIP, but the vdso page and special shared page are modified accordingly. pub async fn tracee_preinit(&mut self, task: Stopped) -> Result { + type SavedInstructions = [u8; 8]; + /// Helper function for tracee_preinit that does the core work. 
- async fn setup_special_mmap_page(task: Stopped) -> Result { - // NB: This point in the code assumes that a specific instruction sequence "INT3; - // SYSCALL; INT3", has been patched into the guest, and that RIP points to the syscall. - // (I.e. we're already past the first breakpoint.) - let mut regs = task.getregs()?; - let mut saved_regs = regs; + async fn setup_special_mmap_page( + task: Stopped, + saved_regs: &libc::user_regs_struct, + ) -> Result { + // NOTE: This point in the code assumes that a specific instruction + // sequence "SYSCALL; INT3", has been patched into the guest, and + // that RIP points to the syscall. + let mut regs = saved_regs.clone(); let page_addr = cp::PRIVATE_PAGE_OFFSET; *regs.syscall_mut() = Sysno::mmap as Reg; *regs.orig_syscall_mut() = regs.syscall(); regs.set_args(( - page_addr, + page_addr as Reg, cp::PRIVATE_PAGE_SIZE as Reg, (libc::PROT_READ | libc::PROT_WRITE | libc::PROT_EXEC) as Reg, (libc::MAP_PRIVATE | libc::MAP_FIXED | libc::MAP_ANONYMOUS) as Reg, @@ -683,7 +686,9 @@ impl TracedTask { running = task.resume(sig)?; } Event::Seccomp => { - // Injected mmap trapped. + // Injected mmap trapped. We may not necessarily + // intercept a seccomp event here if the tool hasn't + // subscribed to the mmap syscall. running = task.resume(None)?; } unknown => { @@ -694,7 +699,7 @@ impl TracedTask { // Make sure we got our desired address. assert_eq!( - Errno::from_ret(task.getregs()?.ret() as usize)? as u64, + Errno::from_ret(task.getregs()?.ret() as usize)?, page_addr, "Could not mmap address {}", page_addr @@ -702,59 +707,75 @@ impl TracedTask { cp::populate_mmap_page(task.pid().into(), page_addr).map_err(|err| err)?; - *saved_regs.ip_mut() -= cp::BREAKPOINT_SIZE as Reg; - task.setregs(saved_regs)?; + // Restore our saved registers, including our instruction pointer. 
+ task.setregs(*saved_regs)?; Ok(task) } /// Put the guest into the weird state where it has an /// "INT3;SYSCALL;INT3" patched into the code wherever RIP happens to be - /// pointing. It leaves RIP pointing at the syscall instruction. This + /// pointing. It leaves RIP pointing at the syscall instruction. This /// allows forcible injection of syscalls into the guest. async fn establish_injection_state( mut task: Stopped, - ) -> Result<(Stopped, user_regs_struct, u64), TraceError> { - // A syscall instruction flanked by INT3 breakpoints (1+2+1 bytes): - let bp_syscall_bp: u64 = 0xcc050fcc; + ) -> Result<(Stopped, libc::user_regs_struct, SavedInstructions), TraceError> { + #[cfg(target_arch = "x86_64")] + const SYSCALL_BP: SavedInstructions = [ + 0x0f, 0x05, // syscall + 0xcc, // int3 + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // padding + ]; + + #[cfg(target_arch = "aarch64")] + const SYSCALL_BP: SavedInstructions = [ + 0x01, 0x00, 0x00, 0xd4, // svc 0 + 0x20, 0x00, 0x20, 0xd4, // brk 1 + ]; + + // Save the original registers so we can restore them later. let regs = task.getregs()?; // Saved instruction memory let ip = AddrMut::from_raw(regs.ip() as usize).unwrap(); - let saved: u64 = task.read_value(ip)?; - // Patch the tracee at the current instruction pointer. - task.write_value(ip, &((saved & !(0xffffffff_u64)) | bp_syscall_bp))?; + let saved: SavedInstructions = task.read_value(ip)?; + + // Patch the tracee at the current instruction pointer. + // + // NOTE: `process_vm_writev` cannot write to write-protected pages, + // but `PTRACE_POKEDATA` can! Thus, we need to make sure we only + // write one word-sized chunk at a time. Luckily, the instructions + // we want to inject fit inside of just one 64-bit word. + task.write_value(ip.cast(), &SYSCALL_BP)?; - // When resumed, the tracee will hit the first breakpoint. Then we - // wait for it to reach that breakpoint and trap/stop. - let (task, event) = task - .resume(None)? - .wait_for_signal(Signal::SIGTRAP) - .await? 
- .assume_stopped(); - assert_eq!(event, Event::Signal(Signal::SIGTRAP)); Ok((task, regs, saved)) } /// Undo the effects of `establish_injection_state` and put the program - /// code memory back to normal. + /// code memory and instruction pointer back to normal. fn remove_injection_state( - mut task: Stopped, - regs: user_regs_struct, - saved: u64, - ) -> Result { - // Restore what we dirtied: - task.write_value(AddrMut::from_raw(regs.ip() as usize).unwrap(), &saved)?; + task: &mut Stopped, + regs: libc::user_regs_struct, + saved: SavedInstructions, + ) -> Result<(), TraceError> { + // NOTE: Again, because `process_vm_writev` cannot write to + // write-protected pages, we must write in word-sized chunks with + // PTRACE_POKEDATA. + let ip = AddrMut::from_raw(regs.ip() as usize).unwrap(); + task.write_value(ip, &saved)?; task.setregs(regs)?; - Ok(task) + Ok(()) } - let (task, regs, saved) = establish_injection_state(task).await?; - let task = setup_special_mmap_page(task).await?; + let (task, regs, prev_state) = establish_injection_state(task).await?; + let mut task = setup_special_mmap_page(task, &regs).await?; + + // Restore registers after adding our temporary injection state. + remove_injection_state(&mut task, regs, prev_state)?; vdso::vdso_patch(self).await.expect("unable to patch vdso"); let mprotect = Mprotect::new() - .with_addr(AddrMut::from_raw(cp::TRAMPOLINE_BASE as usize)) + .with_addr(AddrMut::from_raw(cp::TRAMPOLINE_BASE)) .with_len(cp::TRAMPOLINE_SIZE) .with_protection(ProtFlags::PROT_READ | ProtFlags::PROT_EXEC); self.inject(mprotect).await?; @@ -765,8 +786,7 @@ impl TracedTask { tracing::warn!("unable to intercept cpuid"); } - // Registers are restored from establish_injection_state. - remove_injection_state(task, regs, saved) + Ok(task) } #[cfg(target_arch = "x86_64")] @@ -987,7 +1007,18 @@ impl TracedTask { // inject or tail inject after execve succeeded. self.pending_syscall = None; - // TODO: Update thread ID? Need to write a test checking this. 
+ // TODO: Update PID? Need to write a test checking this. + + // Step the tracee to get the SIGTRAP that immediately follows the + // PTRACE_EVENT_EXEC. We can't call `tracee_preinit` until after this + // because when it tries to step the tracee, it'll get this SIGTRAP + // signal instead. + let (task, event) = task + .step(None)? + .wait_for_signal(Signal::SIGTRAP) + .await? + .assume_stopped(); + assert_eq!(event, Event::Signal(Signal::SIGTRAP)); let task = self.tracee_preinit(task).await?; @@ -1529,15 +1560,16 @@ impl TracedTask { args.arg5 as Reg, )); - // instruction at PRIVATE_PAGE_OFFSET, see `populate_mmap_page`. - // 7000_0000: 0f 05 syscall - // 7000_0002: 0f 0b ud2 - *regs.ip_mut() = cp::PRIVATE_PAGE_OFFSET; + // Jump to our private page to run the syscall instruction there. See + // `populate_mmap_page` for details. + *regs.ip_mut() = cp::PRIVATE_PAGE_OFFSET as Reg; task.setregs(regs)?; + // Step to run the syscall instruction. let wait = task.step(None)?.next_state().await?; + // Get the result of the syscall to return to the caller. self.from_task_state(wait, Some(oldregs)).await } @@ -1570,8 +1602,8 @@ impl TracedTask { // SIGCHLD) before single step finishes (in that case rip == // 0x7000_0000u64). debug_assert!( - regs.ip() == cp::PRIVATE_PAGE_OFFSET + 0x2 - || regs.ip() == cp::PRIVATE_PAGE_OFFSET + regs.ip() as usize == cp::PRIVATE_PAGE_OFFSET + cp::SYSCALL_INSTR_SIZE + || regs.ip() as usize == cp::PRIVATE_PAGE_OFFSET ); // interrupted by signal, return -ERESTARTSYS so that tracee can do a // restart_syscall. 
@@ -2060,7 +2092,7 @@ impl Guest for TracedTask { self.assume_stopped() } - async fn regs(&mut self) -> user_regs_struct { + async fn regs(&mut self) -> libc::user_regs_struct { let task = self.assume_stopped(); match task.getregs() { diff --git a/reverie-ptrace/src/tracer.rs b/reverie-ptrace/src/tracer.rs index a76f782..c9ebf84 100644 --- a/reverie-ptrace/src/tracer.rs +++ b/reverie-ptrace/src/tracer.rs @@ -360,11 +360,9 @@ fn seccomp_filter(events: &Subscription) -> seccomp::Filter { .syscall(Sysno::restart_syscall, Action::Allow) .syscall(Sysno::rt_sigreturn, Action::Allow) // Allow untraced syscalls through without tracing them. - // NOTE: 2 is the length of a syscall instruction (0x0f 0x05) and we - // want to allow the ud2 instruction immediately following it. .ip_range( - cp::TRAMPOLINE_BASE + 2, - cp::TRAMPOLINE_BASE + 3, + (cp::TRAMPOLINE_BASE + cp::SYSCALL_INSTR_SIZE) as u64, + (cp::TRAMPOLINE_BASE + cp::SYSCALL_INSTR_SIZE + cp::UD_INSTR_SIZE) as u64, Action::Allow, ) .build()