Fix initial syscall injection

Summary: This fixes the initial syscall injection on aarch64. It also slightly simplifies the instructions that are written at the current instruction pointer so that they fit into a single word. That is, instead of injecting `INT3; SYSCALL; INT3`, we inject `SYSCALL; INT3`.

Reviewed By: VladimirMakaev

Differential Revision: D40867427

fbshipit-source-id: 2afa96f2270e16284523a17b09da00529893c20e
This commit is contained in:
Jason White 2022-11-01 10:11:35 -07:00 committed by Facebook GitHub Bot
parent 2346c73d2c
commit 76adc9174f
5 changed files with 126 additions and 82 deletions

View file

@ -82,7 +82,7 @@ use syscalls::Errno;
use syscalls::Sysno;
/// Builder for creating seccomp filters.
#[derive(Clone)]
#[derive(Debug, Clone)]
pub struct FilterBuilder {
/// The target architecture.
target_arch: TargetArch,

View file

@ -7,20 +7,27 @@
*/
/// A page that is reserved by Reverie in every guest process.
pub const PRIVATE_PAGE_OFFSET: u64 = 0x7000_0000;
pub const PRIVATE_PAGE_OFFSET: usize = 0x7000_0000;
/// trampoline data from private pages
pub const TRAMPOLINE_BASE: u64 = PRIVATE_PAGE_OFFSET;
pub const TRAMPOLINE_BASE: usize = PRIVATE_PAGE_OFFSET;
pub const TRAMPOLINE_SIZE: usize = 0x1000;
/// total private page size
pub const PRIVATE_PAGE_SIZE: usize = TRAMPOLINE_SIZE;
/// The size of a breakpoint instruction. On x86_64, this is just 0xcc, which is
/// one byte.
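/// The size of the `ud2` instruction on x86_64.
/// NOTE(review): `ud2` encodes as two bytes (`0f 0b`), but this constant is
/// 1 -- confirm that the value is intentional.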
#[cfg(target_arch = "x86_64")]
pub const BREAKPOINT_SIZE: usize = 1;
pub const UD_INSTR_SIZE: usize = 1;
/// The size of a breakpoint instruction. On aarch64, this is 4 bytes.
/// The size of the `udf` instruction on aarch64.
#[cfg(target_arch = "aarch64")]
pub const BREAKPOINT_SIZE: usize = 4;
pub const UD_INSTR_SIZE: usize = 4;
/// The size in bytes of the syscall instruction on x86_64 (`syscall`, encoded
/// as `0f 05`).
#[cfg(target_arch = "x86_64")]
pub const SYSCALL_INSTR_SIZE: usize = 2;
/// The size in bytes of the syscall instruction on aarch64 (`svc 0`, a single
/// 4-byte instruction).
#[cfg(target_arch = "aarch64")]
pub const SYSCALL_INSTR_SIZE: usize = 4;

View file

@ -19,40 +19,47 @@ use super::consts::*;
/// the byte code can be confirmed by running objcopy
/// x86_64-linux-gnu-objcopy -I binary /tmp/1.bin -O elf64-x86-64 -B i386:x86-64 /tmp/1.elf
/// then objdump -d 1.elf must match the instructions listed below.
pub fn populate_mmap_page(pid: Pid, page_address: u64) -> nix::Result<()> {
/* For x86_64 architecture, we use the following syscall sequences:
* 0: 0f 05 syscall // untraced syscall
* 2: 0f 0b ud2
* 4: 0f 05 syscall // traced syscall
* 6: 0f 0b ud2
*/
pub fn populate_mmap_page(pid: Pid, page_address: usize) -> nix::Result<()> {
#[cfg(target_arch = "x86_64")]
let mut syscall_stubs: Vec<u8> = vec![0x0f, 0x05, 0x0f, 0x0b, 0x0f, 0x05, 0x0f, 0x0b];
/* For aarch64 architecture, we use the following syscall sequences:
* 0: d4 00 00 01 svc 0 // Equivalent to the syscall instruction
* 4: 00 00 de ad udf #0xdead // Equivalent to the ud2 instruction
* 8: d4 00 00 01 svc 0
* c: 00 00 de ad udf #0xdead
*/
let mut syscall_stubs: Vec<u8> = vec![
0x0f, 0x05, // syscall (untraced)
0x0f, 0x0b, // ud2
0x0f, 0x05, // syscall (traced)
0x0f, 0x0b, // ud2
];
#[cfg(target_arch = "aarch64")]
let mut syscall_stubs: Vec<u8> = vec![
0xd4, 0x00, 0x00, 0x01, 0x00, 0x00, 0xde, 0xad, 0xd4, 0x00, 0x00, 0x01, 0x00, 0x00, 0xde,
0xad,
0x01, 0x00, 0x00, 0xd4, // svc 0 (untraced syscall)
0xad, 0xde, 0x00, 0x00, // udf 0xdead
0x01, 0x00, 0x00, 0xd4, // svc 0 (traced syscall)
0xad, 0xde, 0x00, 0x00, // udf 0xdead
];
// Fill syscall_stubs with a software interrupt (or debug breakpoint) instruction until it reaches the trampoline size
// Fill syscall_stubs with a software interrupt (or debug breakpoint)
// instruction until it reaches the trampoline size. If things are working
// correctly, we will never execute beyond our syscall stub, so this is more
// of a safeguard to make debugging easier if things go horribly wrong.
// On x86_64, the opcode for the int3 (breakpoint) instruction is 0xcc.
#[cfg(target_arch = "x86_64")]
const SOFTWARE_INTERUPT: u8 = 0xcc; // int3 instruction opcode for x86_64
const SOFTWARE_INTERUPT: u8 = 0xcc;
// For aarch64, we should use BRK 1 but as it is not a single-byte
// instruction, we'll use a sequence of 0x00 (same as a sequence of udf #0x0
// instructions)
#[cfg(target_arch = "aarch64")]
const SOFTWARE_INTERUPT: u8 = 0x00; // For aarch64, we should use BRK 1 but as it is not a single-byte instruction, we'll use a sequence of 0x00 (same as a sequence of udf #0x0 instructions)
const SOFTWARE_INTERUPT: u8 = 0x00;
syscall_stubs.resize_with(TRAMPOLINE_SIZE, || SOFTWARE_INTERUPT);
let local_iov = &[IoSlice::new(syscall_stubs.as_slice())];
let remote_iov = &[RemoteIoVec {
base: page_address as usize,
base: page_address,
len: TRAMPOLINE_SIZE,
}];
// initialize the whole page with int3 to prevent unintended
// execution in our injected page.
// Initialize the whole page with int3 to prevent unintended execution in
// our injected page.
uio::process_vm_writev(pid, local_iov, remote_iov)?;
Ok(())
}

View file

@ -26,7 +26,6 @@ use futures::future::Either;
use futures::future::Future;
use futures::future::FutureExt;
use futures::future::TryFutureExt;
use libc::user_regs_struct;
use nix::sys::mman::ProtFlags;
use nix::sys::signal::Signal;
use reverie::syscalls::Addr;
@ -642,20 +641,24 @@ impl<L: Tool + 'static> TracedTask<L> {
/// Postcondition: the guest registers and code memory are restored to their original state,
/// including RIP, but the vdso page and special shared page are modified accordingly.
pub async fn tracee_preinit(&mut self, task: Stopped) -> Result<Stopped, TraceError> {
type SavedInstructions = [u8; 8];
/// Helper function for tracee_preinit that does the core work.
async fn setup_special_mmap_page(task: Stopped) -> Result<Stopped, TraceError> {
// NB: This point in the code assumes that a specific instruction sequence "INT3;
// SYSCALL; INT3", has been patched into the guest, and that RIP points to the syscall.
// (I.e. we're already past the first breakpoint.)
let mut regs = task.getregs()?;
let mut saved_regs = regs;
async fn setup_special_mmap_page(
task: Stopped,
saved_regs: &libc::user_regs_struct,
) -> Result<Stopped, TraceError> {
// NOTE: This point in the code assumes that a specific instruction
// sequence "SYSCALL; INT3", has been patched into the guest, and
// that RIP points to the syscall.
let mut regs = saved_regs.clone();
let page_addr = cp::PRIVATE_PAGE_OFFSET;
*regs.syscall_mut() = Sysno::mmap as Reg;
*regs.orig_syscall_mut() = regs.syscall();
regs.set_args((
page_addr,
page_addr as Reg,
cp::PRIVATE_PAGE_SIZE as Reg,
(libc::PROT_READ | libc::PROT_WRITE | libc::PROT_EXEC) as Reg,
(libc::MAP_PRIVATE | libc::MAP_FIXED | libc::MAP_ANONYMOUS) as Reg,
@ -683,7 +686,9 @@ impl<L: Tool + 'static> TracedTask<L> {
running = task.resume(sig)?;
}
Event::Seccomp => {
// Injected mmap trapped.
// Injected mmap trapped. We may not necessarily
// intercept a seccomp event here if the tool hasn't
// subscribed to the mmap syscall.
running = task.resume(None)?;
}
unknown => {
@ -694,7 +699,7 @@ impl<L: Tool + 'static> TracedTask<L> {
// Make sure we got our desired address.
assert_eq!(
Errno::from_ret(task.getregs()?.ret() as usize)? as u64,
Errno::from_ret(task.getregs()?.ret() as usize)?,
page_addr,
"Could not mmap address {}",
page_addr
@ -702,59 +707,75 @@ impl<L: Tool + 'static> TracedTask<L> {
cp::populate_mmap_page(task.pid().into(), page_addr).map_err(|err| err)?;
*saved_regs.ip_mut() -= cp::BREAKPOINT_SIZE as Reg;
task.setregs(saved_regs)?;
// Restore our saved registers, including our instruction pointer.
task.setregs(*saved_regs)?;
Ok(task)
}
/// Put the guest into the weird state where it has an
/// "SYSCALL;INT3" patched into the code wherever RIP happens to be
/// pointing. It leaves RIP pointing at the syscall instruction. This
/// pointing. It leaves RIP pointing at the syscall instruction. This
/// allows forcible injection of syscalls into the guest.
async fn establish_injection_state(
mut task: Stopped,
) -> Result<(Stopped, user_regs_struct, u64), TraceError> {
// A syscall instruction flanked by INT3 breakpoints (1+2+1 bytes):
let bp_syscall_bp: u64 = 0xcc050fcc;
) -> Result<(Stopped, libc::user_regs_struct, SavedInstructions), TraceError> {
#[cfg(target_arch = "x86_64")]
const SYSCALL_BP: SavedInstructions = [
0x0f, 0x05, // syscall
0xcc, // int3
0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // padding
];
#[cfg(target_arch = "aarch64")]
const SYSCALL_BP: SavedInstructions = [
0x01, 0x00, 0x00, 0xd4, // svc 0
0x20, 0x00, 0x20, 0xd4, // brk 1
];
// Save the original registers so we can restore them later.
let regs = task.getregs()?;
// Saved instruction memory
let ip = AddrMut::from_raw(regs.ip() as usize).unwrap();
let saved: u64 = task.read_value(ip)?;
// Patch the tracee at the current instruction pointer.
task.write_value(ip, &((saved & !(0xffffffff_u64)) | bp_syscall_bp))?;
let saved: SavedInstructions = task.read_value(ip)?;
// Patch the tracee at the current instruction pointer.
//
// NOTE: `process_vm_writev` cannot write to write-protected pages,
// but `PTRACE_POKEDATA` can! Thus, we need to make sure we only
// write one word-sized chunk at a time. Luckily, the instructions
// we want to inject fit inside of just one 64-bit word.
task.write_value(ip.cast(), &SYSCALL_BP)?;
// When resumed, the tracee will hit the first breakpoint. Then we
// wait for it to reach that breakpoint and trap/stop.
let (task, event) = task
.resume(None)?
.wait_for_signal(Signal::SIGTRAP)
.await?
.assume_stopped();
assert_eq!(event, Event::Signal(Signal::SIGTRAP));
Ok((task, regs, saved))
}
/// Undo the effects of `establish_injection_state` and put the program
/// code memory back to normal.
/// code memory and instruction pointer back to normal.
fn remove_injection_state(
mut task: Stopped,
regs: user_regs_struct,
saved: u64,
) -> Result<Stopped, TraceError> {
// Restore what we dirtied:
task.write_value(AddrMut::from_raw(regs.ip() as usize).unwrap(), &saved)?;
task: &mut Stopped,
regs: libc::user_regs_struct,
saved: SavedInstructions,
) -> Result<(), TraceError> {
// NOTE: Again, because `process_vm_writev` cannot write to
// write-protected pages, we must write in word-sized chunks with
// PTRACE_POKEDATA.
let ip = AddrMut::from_raw(regs.ip() as usize).unwrap();
task.write_value(ip, &saved)?;
task.setregs(regs)?;
Ok(task)
Ok(())
}
let (task, regs, saved) = establish_injection_state(task).await?;
let task = setup_special_mmap_page(task).await?;
let (task, regs, prev_state) = establish_injection_state(task).await?;
let mut task = setup_special_mmap_page(task, &regs).await?;
// Restore registers after adding our temporary injection state.
remove_injection_state(&mut task, regs, prev_state)?;
vdso::vdso_patch(self).await.expect("unable to patch vdso");
let mprotect = Mprotect::new()
.with_addr(AddrMut::from_raw(cp::TRAMPOLINE_BASE as usize))
.with_addr(AddrMut::from_raw(cp::TRAMPOLINE_BASE))
.with_len(cp::TRAMPOLINE_SIZE)
.with_protection(ProtFlags::PROT_READ | ProtFlags::PROT_EXEC);
self.inject(mprotect).await?;
@ -765,8 +786,7 @@ impl<L: Tool + 'static> TracedTask<L> {
tracing::warn!("unable to intercept cpuid");
}
// Registers are restored from establish_injection_state.
remove_injection_state(task, regs, saved)
Ok(task)
}
#[cfg(target_arch = "x86_64")]
@ -987,7 +1007,18 @@ impl<L: Tool + 'static> TracedTask<L> {
// inject or tail inject after execve succeeded.
self.pending_syscall = None;
// TODO: Update thread ID? Need to write a test checking this.
// TODO: Update PID? Need to write a test checking this.
// Step the tracee to get the SIGTRAP that immediately follows the
// PTRACE_EVENT_EXEC. We can't call `tracee_preinit` until after this
// because when it tries to step the tracee, it'll get this SIGTRAP
// signal instead.
let (task, event) = task
.step(None)?
.wait_for_signal(Signal::SIGTRAP)
.await?
.assume_stopped();
assert_eq!(event, Event::Signal(Signal::SIGTRAP));
let task = self.tracee_preinit(task).await?;
@ -1529,15 +1560,16 @@ impl<L: Tool + 'static> TracedTask<L> {
args.arg5 as Reg,
));
// instruction at PRIVATE_PAGE_OFFSET, see `populate_mmap_page`.
// 7000_0000: 0f 05 syscall
// 7000_0002: 0f 0b ud2
*regs.ip_mut() = cp::PRIVATE_PAGE_OFFSET;
// Jump to our private page to run the syscall instruction there. See
// `populate_mmap_page` for details.
*regs.ip_mut() = cp::PRIVATE_PAGE_OFFSET as Reg;
task.setregs(regs)?;
// Step to run the syscall instruction.
let wait = task.step(None)?.next_state().await?;
// Get the result of the syscall to return to the caller.
self.from_task_state(wait, Some(oldregs)).await
}
@ -1570,8 +1602,8 @@ impl<L: Tool + 'static> TracedTask<L> {
// SIGCHLD) before single step finishes (in that case rip ==
// 0x7000_0000u64).
debug_assert!(
regs.ip() == cp::PRIVATE_PAGE_OFFSET + 0x2
|| regs.ip() == cp::PRIVATE_PAGE_OFFSET
regs.ip() as usize == cp::PRIVATE_PAGE_OFFSET + cp::SYSCALL_INSTR_SIZE
|| regs.ip() as usize == cp::PRIVATE_PAGE_OFFSET
);
// interrupted by signal, return -ERESTARTSYS so that tracee can do a
// restart_syscall.
@ -2060,7 +2092,7 @@ impl<L: Tool + 'static> Guest<L> for TracedTask<L> {
self.assume_stopped()
}
async fn regs(&mut self) -> user_regs_struct {
async fn regs(&mut self) -> libc::user_regs_struct {
let task = self.assume_stopped();
match task.getregs() {

View file

@ -360,11 +360,9 @@ fn seccomp_filter(events: &Subscription) -> seccomp::Filter {
.syscall(Sysno::restart_syscall, Action::Allow)
.syscall(Sysno::rt_sigreturn, Action::Allow)
// Allow untraced syscalls through without tracing them.
// NOTE: 2 is the length of a syscall instruction (0x0f 0x05) and we
// want to allow the ud2 instruction immediately following it.
.ip_range(
cp::TRAMPOLINE_BASE + 2,
cp::TRAMPOLINE_BASE + 3,
(cp::TRAMPOLINE_BASE + cp::SYSCALL_INSTR_SIZE) as u64,
(cp::TRAMPOLINE_BASE + cp::SYSCALL_INSTR_SIZE + cp::UD_INSTR_SIZE) as u64,
Action::Allow,
)
.build()