crosvm: Put block device process in a minijail

Run with the new seccomp filter and drop all capabilities.  In addition, enter
new user, mount, network, and IPC namespaces.  Leave the mount namespace empty
after pivot-rooting to an empty directory.

Change-Id: Iee583cf260ede8ca13f005836684eb80c2c3ac3e
Signed-off-by: Dylan Reid <dgreid@chromium.org>
Reviewed-on: https://chromium-review.googlesource.com/515603
This commit is contained in:
Dylan Reid 2017-05-12 16:15:53 -07:00 committed by chrome-bot
parent f2164a18bf
commit 61edbbff53
4 changed files with 96 additions and 8 deletions

View file

@ -8,12 +8,14 @@ lto = true
panic = 'abort'
[dependencies]
io_jail = { path = "io_jail" }
kvm = { path = "kvm" }
sys_util = { path = "sys_util" }
x86_64 = { path = "x86_64" }
kernel_loader = { path = "kernel_loader" }
libc = "0.2.21"
byteorder = "1"
syscall_defines = { path = "syscall_defines" }
[dependencies.clap]
version = "*"

20
block_device.policy Normal file
View file

@ -0,0 +1,20 @@
close: 1
exit_group: 1
futex: 1
lseek: 1
# Disallow mmap with PROT_EXEC set. The syntax here doesn't allow bit
# negation, thus the manually negated mask constant.
mmap: arg2 in 0xfffffffb
mprotect: arg2 in 0xfffffffb
munmap: 1
read: 1
recvfrom: 1
sched_getaffinity: 1
set_robust_list: 1
sigaltstack: 1
# Disallow clones other than new threads.
clone: arg0 & 0x00010000
write: 1
eventfd2: 1
dup: 1
poll: 1

View file

@ -9,11 +9,13 @@ use std::io::{Error, Result};
use std::os::unix::net::UnixDatagram;
use std::time::Duration;
use libc::fork;
use libc;
use libc::pid_t;
use byteorder::{NativeEndian, ByteOrder};
use hw::BusDevice;
use syscall_defines::linux::LinuxSyscall::SYS_clone;
const SOCKET_TIMEOUT_MS: u64 = 2000;
const MSG_SIZE: usize = 24;
@ -78,6 +80,20 @@ fn child_proc(sock: UnixDatagram, device: &mut BusDevice) -> ! {
process::exit(0);
}
/// Clones the current process into new user and PID namespaces.
///
/// Issues the raw `clone` system call with
/// `CLONE_NEWUSER | CLONE_NEWPID | SIGCHLD` and a null child stack.
///
/// Returns `Ok(pid)` with the value reported by the system call (the caller
/// treats `0` as "running in the child"), or the last OS error if the call
/// returned a negative value.
///
/// # Safety
///
/// Forking is unsafe; this function must be unsafe as there is no way to
/// guarantee safety without more context about the state of the program.
unsafe fn do_clone() -> Result<pid_t> {
    // `syscall` is variadic and the kernel reads every argument as a full
    // machine word, so pass word-sized values explicitly instead of relying on
    // how 32-bit ints happen to be widened by the calling convention.
    let flags =
        (libc::CLONE_NEWUSER | libc::CLONE_NEWPID | libc::SIGCHLD) as libc::c_ulong;
    // A null stack pointer asks the kernel to reuse the caller's stack.
    let child_stack: libc::c_long = 0;

    // `c_long` is the syscall-number type on every target, unlike `i64`.
    let pid = libc::syscall(SYS_clone as libc::c_long, flags, child_stack);
    if pid < 0 {
        Err(Error::last_os_error())
    } else {
        // A process id always fits in `pid_t`; the wider type is only an
        // artifact of `syscall`'s generic return value.
        Ok(pid as pid_t)
    }
}
/// Wraps an inner `hw::BusDevice` that is run inside a child process via fork.
///
/// Because forks are very unfriendly to destructors and all memory mappings and file descriptors
@ -91,15 +107,19 @@ impl ProxyDevice {
///
/// The forked process will automatically be terminated when this is dropped, so be sure to keep
/// a reference.
pub fn new<D: BusDevice>(mut device: D) -> Result<ProxyDevice> {
/// `post_clone_cb` - Called in the child process right after forking; it is passed the
/// child end of the socket pair, which must be kept open.
pub fn new<D: BusDevice, F>(mut device: D, post_clone_cb: F) -> Result<ProxyDevice>
where F: FnOnce(&UnixDatagram) {
let (child_sock, parent_sock) = UnixDatagram::pair()?;
let ret = unsafe { fork() };
// Forking a new process is unsafe, we must ensure no resources required
// by the other side are freed after the two processes start.
let ret = unsafe { do_clone()? };
if ret == 0 {
post_clone_cb(&child_sock);
// ! Never returns
child_proc(child_sock, &mut device);
} else if ret == -1 {
return Err(Error::last_os_error());
}
let mut buf = [0; MSG_SIZE];

View file

@ -6,22 +6,27 @@
extern crate clap;
extern crate libc;
extern crate io_jail;
extern crate kvm;
extern crate x86_64;
extern crate kernel_loader;
extern crate byteorder;
#[macro_use] extern crate sys_util;
extern crate syscall_defines;
use std::ffi::{CString, CStr};
use std::fmt;
use std::fs::File;
use std::io::{stdin, stdout};
use std::os::unix::io::{AsRawFd, RawFd};
use std::path::Path;
use std::string::String;
use std::sync::{Arc, Mutex, Barrier};
use std::thread::{spawn, JoinHandle};
use clap::{Arg, App, SubCommand};
use io_jail::Minijail;
use kvm::*;
use sys_util::{GuestAddress, GuestMemory, EventFd, Terminal, Poller, Pollable,
register_signal_handler, Killable};
@ -36,6 +41,8 @@ enum Error {
Socket(std::io::Error),
Disk(std::io::Error),
BlockDeviceNew(sys_util::Error),
BlockDeviceJail(io_jail::Error),
BlockDevicePivotRoot(io_jail::Error),
Cmdline(kernel_cmdline::Error),
ProxyDeviceCreation(std::io::Error),
RegisterIoevent(sys_util::Error),
@ -73,6 +80,10 @@ impl fmt::Display for Error {
&Error::Socket(ref e) => write!(f, "failed to create socket: {}", e),
&Error::Disk(ref e) => write!(f, "failed to load disk image: {}", e),
&Error::BlockDeviceNew(ref e) => write!(f, "failed to create block device: {:?}", e),
&Error::BlockDeviceJail(ref e) => write!(f, "failed to jail block device: {:?}", e),
&Error::BlockDevicePivotRoot(ref e) => {
write!(f, "failed to pivot root block device: {:?}", e)
}
&Error::Cmdline(ref e) => write!(f, "the given kernel command line was invalid: {}", e),
&Error::ProxyDeviceCreation(ref e) => write!(f, "failed to create proxy device: {}", e),
&Error::RegisterIoevent(ref e) => write!(f, "error registering ioevent: {:?}", e),
@ -110,6 +121,26 @@ const KERNEL_START_OFFSET: usize = 0x200000;
const CMDLINE_OFFSET: usize = 0x20000;
const CMDLINE_MAX_SIZE: usize = KERNEL_START_OFFSET - CMDLINE_OFFSET;
fn create_block_device_jail() -> Result<Minijail> {
// All child jails run in a new user namespace without any users mapped,
// they run as nobody unless otherwise configured.
let mut j = Minijail::new().map_err(|e| Error::BlockDeviceJail(e))?;
// Don't need any capabilities.
j.use_caps(0);
// Create a new mount namespace with an empty root FS.
j.namespace_vfs();
j.enter_pivot_root(Path::new("/run/asdf"))
.map_err(|e| Error::BlockDevicePivotRoot(e))?;
// Run in an empty network namespace.
j.namespace_net();
// Apply the block device seccomp policy.
j.no_new_privs();
j.parse_seccomp_filters(Path::new("block_device.policy"))
.map_err(|e| Error::BlockDeviceJail(e))?;
j.use_seccomp_filter();
Ok(j)
}
fn run_config(cfg: Config) -> Result<()> {
let socket = if let Some(ref socket_path) = cfg.socket_path {
Some(ControlSocketRecv::new(socket_path)
@ -136,7 +167,11 @@ fn run_config(cfg: Config) -> Result<()> {
let mut irq: u32 = 5;
if let Some(ref disk_path) = cfg.disk_path {
// List of FDs to keep open in the child after it forks.
let mut keep_fds: Vec<RawFd> = Vec::new();
let disk_image = File::open(disk_path).map_err(|e| Error::Disk(e))?;
keep_fds.push(disk_image.as_raw_fd());
let block_box = Box::new(hw::virtio::Block::new(disk_image)
.map_err(|e| Error::BlockDeviceNew(e))?);
@ -144,16 +179,27 @@ fn run_config(cfg: Config) -> Result<()> {
for (i, queue_evt) in block_mmio.queue_evts().iter().enumerate() {
let io_addr = IoeventAddress::Mmio(mmio_base + hw::virtio::NOITFY_REG_OFFSET as u64);
vm_requests.push(VmRequest::RegisterIoevent(queue_evt.try_clone()?, io_addr, i as u32));
keep_fds.push(queue_evt.as_raw_fd());
}
if let Some(interrupt_evt) = block_mmio.interrupt_evt() {
vm_requests.push(VmRequest::RegisterIrqfd(interrupt_evt.try_clone()?, irq));
keep_fds.push(interrupt_evt.as_raw_fd());
}
if cfg.multiprocess {
bus.insert(Arc::new(Mutex::new(hw::ProxyDevice::new(block_mmio).unwrap())),
mmio_base,
mmio_len)
let jail = create_block_device_jail()?;
let proxy_dev = hw::ProxyDevice::new(block_mmio, move |keep_pipe| {
keep_fds.push(keep_pipe.as_raw_fd());
// Need to panic here as there isn't a way to recover from a
// partly-jailed process.
unsafe {
// This is OK as we have whitelisted all the FDs we need open.
jail.enter(Some(&keep_fds)).unwrap();
}
})
.map_err(|e| Error::ProxyDeviceCreation(e))?;
bus.insert(Arc::new(Mutex::new(proxy_dev)), mmio_base, mmio_len)
.unwrap();
} else {
bus.insert(Arc::new(Mutex::new(block_mmio)), mmio_base, mmio_len)