device: vhost_user: Enable seccomp filter vhost-user-fs

Vhost-user-fs currently lacks seccomp filter support, which raises
security concerns about putting it into real usage. This change applies
the virtio-fs device's seccomp policy filter to vhost-user-fs when the
sandbox is enabled.

When the specified socket path does not exist for a vhost-user device, the
device calls socketpair to create a socket. To support this syscall, a
rule allowing socketpair is added to vhost_user.policy.

Also, this CL adds a disable-sandbox option for vhost-user-fs-device. The
option is false by default, in which case vhost-user-fs enters new
mnt/user/pid/net namespaces. If this option is true, the
vhost-user-fs device only creates a new mount namespace.

BUG=b:355159487
TEST=run manual tests
TEST=run e2e test in chromium:5746575

Change-Id: I6c18386f690af7b0d2e1550c0b3881d444280a8b
Reviewed-on: https://chromium-review.googlesource.com/c/crosvm/crosvm/+/5741356
Reviewed-by: Keiichi Watanabe <keiichiw@chromium.org>
Commit-Queue: Yuan Yao <yuanyaogoog@chromium.org>
This commit is contained in:
Yuan Yao 2024-07-26 07:27:31 +00:00 committed by crosvm LUCI
parent 437d6612e0
commit 54e5b6b204
7 changed files with 98 additions and 47 deletions

View file

@ -232,4 +232,11 @@ pub struct Options {
/// gid of the device process in the new user namespace created by minijail. /// gid of the device process in the new user namespace created by minijail.
/// Default: 0. /// Default: 0.
gid: u32, gid: u32,
#[argh(switch)]
/// disable-sandbox controls whether vhost-user-fs device uses minijail sandbox.
/// By default it is false, and the vhost-user-fs device enters new mnt/user/pid/net
/// namespaces. If this option is true, the vhost-user-fs device only creates
/// a new mount namespace and runs without a seccomp filter.
/// Default: false.
disable_sandbox: bool,
} }

View file

@ -10,6 +10,8 @@ use anyhow::Context;
use base::linux::max_open_files; use base::linux::max_open_files;
use base::RawDescriptor; use base::RawDescriptor;
use cros_async::Executor; use cros_async::Executor;
use jail::create_base_minijail;
use jail::set_embedded_bpf_program;
use minijail::Minijail; use minijail::Minijail;
use crate::virtio::vhost::user::device::fs::FsBackend; use crate::virtio::vhost::user::device::fs::FsBackend;
@ -37,39 +39,47 @@ fn jail_and_fork(
gid: u32, gid: u32,
uid_map: Option<String>, uid_map: Option<String>,
gid_map: Option<String>, gid_map: Option<String>,
disable_sandbox: bool,
) -> anyhow::Result<i32> { ) -> anyhow::Result<i32> {
// Create new minijail sandbox
let mut j = Minijail::new()?;
j.namespace_pids();
j.namespace_user();
j.namespace_user_disable_setgroups();
if uid != 0 {
j.change_uid(uid);
}
if gid != 0 {
j.change_gid(gid);
}
j.uidmap(&uid_map.unwrap_or_else(default_uidmap))?;
j.gidmap(&gid_map.unwrap_or_else(default_gidmap))?;
j.run_as_init();
j.namespace_vfs();
j.namespace_net();
j.no_new_privs();
// Only pivot_root if we are not re-using the current root directory.
if dir_path != Path::new("/") {
// It's safe to call `namespace_vfs` multiple times.
j.namespace_vfs();
j.enter_pivot_root(&dir_path)?;
}
j.set_remount_mode(libc::MS_SLAVE);
let limit = max_open_files().context("failed to get max open files")?; let limit = max_open_files().context("failed to get max open files")?;
j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit)?; // Create new minijail sandbox
// vvu locks around 512k memory. Just give 1M. let jail = if disable_sandbox {
j.set_rlimit(libc::RLIMIT_MEMLOCK as i32, 1 << 20, 1 << 20)?; create_base_minijail(dir_path.as_path(), limit)?
} else {
let mut j: Minijail = Minijail::new()?;
j.namespace_pids();
j.namespace_user();
j.namespace_user_disable_setgroups();
if uid != 0 {
j.change_uid(uid);
}
if gid != 0 {
j.change_gid(gid);
}
j.uidmap(&uid_map.unwrap_or_else(default_uidmap))?;
j.gidmap(&gid_map.unwrap_or_else(default_gidmap))?;
j.run_as_init();
j.namespace_vfs();
j.namespace_net();
j.no_new_privs();
// Only pivot_root if we are not re-using the current root directory.
if dir_path != Path::new("/") {
// It's safe to call `namespace_vfs` multiple times.
j.namespace_vfs();
j.enter_pivot_root(&dir_path)?;
}
j.set_remount_mode(libc::MS_SLAVE);
j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit)?;
// vvu locks around 512k memory. Just give 1M.
j.set_rlimit(libc::RLIMIT_MEMLOCK as i32, 1 << 20, 1 << 20)?;
#[cfg(not(feature = "seccomp_trace"))]
set_embedded_bpf_program(&mut j, "fs_device_vhost_user")?;
j.use_seccomp_filter();
j
};
// Make sure there are no duplicates in keep_rds // Make sure there are no duplicates in keep_rds
keep_rds.sort_unstable(); keep_rds.sort_unstable();
@ -77,7 +87,7 @@ fn jail_and_fork(
// fork on the jail here // fork on the jail here
// SAFETY: trivially safe // SAFETY: trivially safe
let pid = unsafe { j.fork(Some(&keep_rds))? }; let pid = unsafe { jail.fork(Some(&keep_rds))? };
if pid > 0 { if pid > 0 {
// Current FS driver jail does not use seccomp and jail_and_fork() does not have other // Current FS driver jail does not use seccomp and jail_and_fork() does not have other
@ -113,6 +123,7 @@ pub fn start_device(opts: Options) -> anyhow::Result<()> {
opts.gid, opts.gid,
opts.uid_map, opts.uid_map,
opts.gid_map, opts.gid_map,
opts.disable_sandbox,
)?; )?;
// Parent, nothing to do but wait and then exit // Parent, nothing to do but wait and then exit

View file

@ -0,0 +1,7 @@
# Copyright 2024 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
@include /usr/share/policy/crosvm/vhost_user.policy
@include /usr/share/policy/crosvm/fs_device.policy

View file

@ -0,0 +1,14 @@
# Copyright 2024 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Policy file for the vhost-user transport over a socket.
# FIONBIO: for setting non-blocking mode over the socket.
# TCGETS/TCSETS: used on FD 0, probably for serial.
# b/239779171: try moving this to the serial device once we can extend ioctls across policy files.
ioctl: arg1 == FIONBIO || arg1 == TCGETS || arg1 == TCSETS
# For accepting a client connection over the socket.
accept4: 1
# For creating a socket if the specified socket path does not exist
socketpair: arg0 == AF_UNIX

View file

@ -0,0 +1,7 @@
# Copyright 2024 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
@include /usr/share/policy/crosvm/vhost_user.policy
@include /usr/share/policy/crosvm/fs_device.policy

View file

@ -10,3 +10,5 @@
ioctl: arg1 == FIONBIO || arg1 == TCGETS || arg1 == TCSETS ioctl: arg1 == FIONBIO || arg1 == TCGETS || arg1 == TCSETS
# For accepting a client connection over the socket. # For accepting a client connection over the socket.
accept4: 1 accept4: 1
# For creating a socket if the specified socket path does not exist
socketpair: arg0 == AF_UNIX

View file

@ -28,7 +28,6 @@ use zerocopy::AsBytes;
use crate::config::JailConfig; use crate::config::JailConfig;
#[cfg(not(feature = "seccomp_trace"))]
static EMBEDDED_BPFS: Lazy<std::collections::HashMap<&str, Vec<u8>>> = static EMBEDDED_BPFS: Lazy<std::collections::HashMap<&str, Vec<u8>>> =
Lazy::new(|| include!(concat!(env!("OUT_DIR"), "/bpf_includes.in"))); Lazy::new(|| include!(concat!(env!("OUT_DIR"), "/bpf_includes.in")));
@ -288,20 +287,7 @@ pub fn create_sandbox_minijail(
})?; })?;
} }
} else { } else {
let bpf_program = EMBEDDED_BPFS set_embedded_bpf_program(&mut jail, config.seccomp_policy_name)?;
.get(&config.seccomp_policy_name)
.with_context(|| {
format!(
"failed to find embedded seccomp policy: {}",
&config.seccomp_policy_name
)
})?;
jail.parse_seccomp_bytes(bpf_program).with_context(|| {
format!(
"failed to parse embedded seccomp policy: {}",
&config.seccomp_policy_name
)
})?;
} }
jail.use_seccomp_filter(); jail.use_seccomp_filter();
@ -482,3 +468,20 @@ fn add_current_user_to_jail(jail: &mut Minijail) -> Result<()> {
} }
Ok(()) Ok(())
} }
/// Installs an embedded seccomp BPF program into `jail`.
///
/// The program is looked up by `seccomp_policy_name` in `EMBEDDED_BPFS` and
/// handed to the jail via `parse_seccomp_bytes`.
///
/// # Errors
///
/// Returns an error when no embedded program is registered under
/// `seccomp_policy_name`, or when the jail fails to parse the program bytes.
pub fn set_embedded_bpf_program(jail: &mut Minijail, seccomp_policy_name: &str) -> Result<()> {
    // Look the policy up first so a missing entry is reported before the jail is touched.
    let lookup = EMBEDDED_BPFS.get(seccomp_policy_name);
    let policy_bytes = lookup.with_context(|| {
        format!(
            "failed to find embedded seccomp policy: {}",
            seccomp_policy_name
        )
    })?;

    let parsed = jail.parse_seccomp_bytes(policy_bytes);
    parsed.with_context(|| {
        format!(
            "failed to parse embedded seccomp policy: {}",
            seccomp_policy_name
        )
    })?;

    Ok(())
}