From 54e5b6b204ae7c620c9d0b2c42a37456b9c37ae0 Mon Sep 17 00:00:00 2001 From: Yuan Yao Date: Fri, 26 Jul 2024 07:27:31 +0000 Subject: [PATCH] device: vhost_user: Enable seccomp filter vhost-user-fs Vhost-user-fs currently lacks seccomp filter support, which cause security concerns to put into real usage. This change introduces virtio-fs device's seccomp policy filter to vhost-user-fs when sandbox is enabled. When specified path of socket does not exist for vhost-user device, the vhost-user device will call socketpair to create a socket. To support the syscall, the rule allowing socketpair is added to vhost_user.policy. Also, this CL adds disable-sandbox option for vhost-user-fs-device. The option is set to false by default, the vhost-user-fs will enter new mnt/user/pid/net namespace. If the this option is true, the vhost-user-fs device only create a new mount namespace. BUG=b:355159487 TEST=run manual tests TEST=run e2e test in chromium:5746575 Change-Id: I6c18386f690af7b0d2e1550c0b3881d444280a8b Reviewed-on: https://chromium-review.googlesource.com/c/crosvm/crosvm/+/5741356 Reviewed-by: Keiichi Watanabe Commit-Queue: Yuan Yao --- devices/src/virtio/vhost/user/device/fs.rs | 7 ++ .../virtio/vhost/user/device/fs/sys/linux.rs | 75 +++++++++++-------- .../aarch64/fs_device_vhost_user.policy | 7 ++ jail/seccomp/aarch64/vhost_user.policy | 14 ++++ .../x86_64/fs_device_vhost_user.policy | 7 ++ jail/seccomp/x86_64/vhost_user.policy | 2 + jail/src/helpers.rs | 33 ++++---- 7 files changed, 98 insertions(+), 47 deletions(-) create mode 100644 jail/seccomp/aarch64/fs_device_vhost_user.policy create mode 100644 jail/seccomp/aarch64/vhost_user.policy create mode 100644 jail/seccomp/x86_64/fs_device_vhost_user.policy diff --git a/devices/src/virtio/vhost/user/device/fs.rs b/devices/src/virtio/vhost/user/device/fs.rs index ba610305f3..08492b899b 100644 --- a/devices/src/virtio/vhost/user/device/fs.rs +++ b/devices/src/virtio/vhost/user/device/fs.rs @@ -232,4 +232,11 @@ pub struct Options { /// gid of the device process in the new user namespace created by minijail. /// Default: 0. gid: u32, + #[argh(switch)] + /// disable-sandbox controls whether vhost-user-fs device uses minijail sandbox. + /// By default, it is false, the vhost-user-fs will enter new mnt/user/pid/net + /// namespace. If the this option is true, the vhost-user-fs device only create + /// a new mount namespace and run without seccomp filter. + /// Default: false. + disable_sandbox: bool, } diff --git a/devices/src/virtio/vhost/user/device/fs/sys/linux.rs b/devices/src/virtio/vhost/user/device/fs/sys/linux.rs index f32726f994..fdec6f9bee 100644 --- a/devices/src/virtio/vhost/user/device/fs/sys/linux.rs +++ b/devices/src/virtio/vhost/user/device/fs/sys/linux.rs @@ -10,6 +10,8 @@ use anyhow::Context; use base::linux::max_open_files; use base::RawDescriptor; use cros_async::Executor; +use jail::create_base_minijail; +use jail::set_embedded_bpf_program; use minijail::Minijail; use crate::virtio::vhost::user::device::fs::FsBackend; @@ -37,39 +39,47 @@ fn jail_and_fork( gid: u32, uid_map: Option, gid_map: Option, + disable_sandbox: bool, ) -> anyhow::Result { - // Create new minijail sandbox - let mut j = Minijail::new()?; - - j.namespace_pids(); - j.namespace_user(); - j.namespace_user_disable_setgroups(); - if uid != 0 { - j.change_uid(uid); - } - if gid != 0 { - j.change_gid(gid); - } - j.uidmap(&uid_map.unwrap_or_else(default_uidmap))?; - j.gidmap(&gid_map.unwrap_or_else(default_gidmap))?; - j.run_as_init(); - - j.namespace_vfs(); - j.namespace_net(); - j.no_new_privs(); - - // Only pivot_root if we are not re-using the current root directory. - if dir_path != Path::new("/") { - // It's safe to call `namespace_vfs` multiple times. - j.namespace_vfs(); - j.enter_pivot_root(&dir_path)?; - } - j.set_remount_mode(libc::MS_SLAVE); - let limit = max_open_files().context("failed to get max open files")?; - j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit)?; - // vvu locks around 512k memory. Just give 1M. - j.set_rlimit(libc::RLIMIT_MEMLOCK as i32, 1 << 20, 1 << 20)?; + // Create new minijail sandbox + let jail = if disable_sandbox { + create_base_minijail(dir_path.as_path(), limit)? + } else { + let mut j: Minijail = Minijail::new()?; + j.namespace_pids(); + j.namespace_user(); + j.namespace_user_disable_setgroups(); + if uid != 0 { + j.change_uid(uid); + } + if gid != 0 { + j.change_gid(gid); + } + j.uidmap(&uid_map.unwrap_or_else(default_uidmap))?; + j.gidmap(&gid_map.unwrap_or_else(default_gidmap))?; + j.run_as_init(); + + j.namespace_vfs(); + j.namespace_net(); + j.no_new_privs(); + + // Only pivot_root if we are not re-using the current root directory. + if dir_path != Path::new("/") { + // It's safe to call `namespace_vfs` multiple times. + j.namespace_vfs(); + j.enter_pivot_root(&dir_path)?; + } + j.set_remount_mode(libc::MS_SLAVE); + + j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit)?; + // vvu locks around 512k memory. Just give 1M. + j.set_rlimit(libc::RLIMIT_MEMLOCK as i32, 1 << 20, 1 << 20)?; + #[cfg(not(feature = "seccomp_trace"))] + set_embedded_bpf_program(&mut j, "fs_device_vhost_user")?; + j.use_seccomp_filter(); + j + }; // Make sure there are no duplicates in keep_rds keep_rds.sort_unstable(); @@ -77,7 +87,7 @@ fn jail_and_fork( // fork on the jail here // SAFETY: trivially safe - let pid = unsafe { j.fork(Some(&keep_rds))? }; + let pid = unsafe { jail.fork(Some(&keep_rds))? }; if pid > 0 { // Current FS driver jail does not use seccomp and jail_and_fork() does not have other @@ -113,6 +123,7 @@ pub fn start_device(opts: Options) -> anyhow::Result<()> { opts.gid, opts.uid_map, opts.gid_map, + opts.disable_sandbox, )?; // Parent, nothing to do but wait and then exit diff --git a/jail/seccomp/aarch64/fs_device_vhost_user.policy b/jail/seccomp/aarch64/fs_device_vhost_user.policy new file mode 100644 index 0000000000..61b48a2d4f --- /dev/null +++ b/jail/seccomp/aarch64/fs_device_vhost_user.policy @@ -0,0 +1,7 @@ +# Copyright 2024 The ChromiumOS Authors +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +@include /usr/share/policy/crosvm/vhost_user.policy + +@include /usr/share/policy/crosvm/fs_device.policy diff --git a/jail/seccomp/aarch64/vhost_user.policy b/jail/seccomp/aarch64/vhost_user.policy new file mode 100644 index 0000000000..d9e32b6e2f --- /dev/null +++ b/jail/seccomp/aarch64/vhost_user.policy @@ -0,0 +1,14 @@ +# Copyright 2024 The ChromiumOS Authors +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +# Policy file for the vhost-user transport over a socket. + +# FIONBIO: for setting non-blocking mode over the socket. +# TCGETS/TCSETS: used on FD 0, probably for serial. +# b/239779171: try moving this to the serial device once we can extend ioctls across policy files. +ioctl: arg1 == FIONBIO || arg1 == TCGETS || arg1 == TCSETS +# For accepting a client connection over the socket. +accept4: 1 +# For creating a socket if the specified socket path does not exits +socketpair: arg0 == AF_UNIX diff --git a/jail/seccomp/x86_64/fs_device_vhost_user.policy b/jail/seccomp/x86_64/fs_device_vhost_user.policy new file mode 100644 index 0000000000..61b48a2d4f --- /dev/null +++ b/jail/seccomp/x86_64/fs_device_vhost_user.policy @@ -0,0 +1,7 @@ +# Copyright 2024 The ChromiumOS Authors +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +@include /usr/share/policy/crosvm/vhost_user.policy + +@include /usr/share/policy/crosvm/fs_device.policy diff --git a/jail/seccomp/x86_64/vhost_user.policy b/jail/seccomp/x86_64/vhost_user.policy index 3a20f98e37..4b271950fe 100644 --- a/jail/seccomp/x86_64/vhost_user.policy +++ b/jail/seccomp/x86_64/vhost_user.policy @@ -10,3 +10,5 @@ ioctl: arg1 == FIONBIO || arg1 == TCGETS || arg1 == TCSETS # For accepting a client connection over the socket. accept4: 1 +# For creating a socket if the specified socket path does not exits +socketpair: arg0 == AF_UNIX diff --git a/jail/src/helpers.rs b/jail/src/helpers.rs index e5c7f885d0..20ccfb7889 100644 --- a/jail/src/helpers.rs +++ b/jail/src/helpers.rs @@ -28,7 +28,6 @@ use zerocopy::AsBytes; use crate::config::JailConfig; -#[cfg(not(feature = "seccomp_trace"))] static EMBEDDED_BPFS: Lazy>> = Lazy::new(|| include!(concat!(env!("OUT_DIR"), "/bpf_includes.in"))); @@ -288,20 +287,7 @@ pub fn create_sandbox_minijail( })?; } } else { - let bpf_program = EMBEDDED_BPFS - .get(&config.seccomp_policy_name) - .with_context(|| { - format!( - "failed to find embedded seccomp policy: {}", - &config.seccomp_policy_name - ) - })?; - jail.parse_seccomp_bytes(bpf_program).with_context(|| { - format!( - "failed to parse embedded seccomp policy: {}", - &config.seccomp_policy_name - ) - })?; + set_embedded_bpf_program(&mut jail, config.seccomp_policy_name)?; } jail.use_seccomp_filter(); @@ -482,3 +468,20 @@ fn add_current_user_to_jail(jail: &mut Minijail) -> Result<()> { } Ok(()) } + +/// Set the seccomp policy for a jail from embedded bpfs +pub fn set_embedded_bpf_program(jail: &mut Minijail, seccomp_policy_name: &str) -> Result<()> { + let bpf_program = EMBEDDED_BPFS.get(seccomp_policy_name).with_context(|| { + format!( + "failed to find embedded seccomp policy: {}", + seccomp_policy_name + ) + })?; + jail.parse_seccomp_bytes(bpf_program).with_context(|| { + format!( + "failed to parse embedded seccomp policy: {}", + seccomp_policy_name + ) + })?; + Ok(()) +}