device: vhost_user: Enable seccomp filter vhost-user-fs

Vhost-user-fs currently lacks seccomp filter support, which raises security concerns about putting it into real usage. This change applies the virtio-fs device's seccomp policy filter to vhost-user-fs when the sandbox is enabled. When the specified socket path does not exist for a vhost-user device, the vhost-user device will call socketpair to create a socket. To support this syscall, a rule allowing socketpair is added to vhost_user.policy. Also, this CL adds a disable-sandbox option for vhost-user-fs-device. The option is false by default, in which case vhost-user-fs will enter new mnt/user/pid/net namespaces. If this option is true, the vhost-user-fs device only creates a new mount namespace. BUG=b:355159487 TEST=run manual tests TEST=run e2e test in chromium:5746575 Change-Id: I6c18386f690af7b0d2e1550c0b3881d444280a8b Reviewed-on: https://chromium-review.googlesource.com/c/crosvm/crosvm/+/5741356 Reviewed-by: Keiichi Watanabe <keiichiw@chromium.org> Commit-Queue: Yuan Yao <yuanyaogoog@chromium.org>
2024-11-28 17:44:10 +00:00 · 2024-07-26 07:27:31 +00:00 · 2024-07-26 07:27:31 +00:00 · 54e5b6b204
commit 54e5b6b204
parent 437d6612e0
7 changed files with 98 additions and 47 deletions
--- a/devices/src/virtio/vhost/user/device/fs.rs
+++ b/devices/src/virtio/vhost/user/device/fs.rs
@ -232,4 +232,11 @@ pub struct Options {
    /// gid of the device process in the new user namespace created by minijail.
    /// Default: 0.
    gid: u32,
    #[argh(switch)]
    /// disable-sandbox controls whether vhost-user-fs device uses minijail sandbox.
    /// By default, it is false, the vhost-user-fs will enter new mnt/user/pid/net
    /// namespace. If this option is true, the vhost-user-fs device only creates
    /// a new mount namespace and runs without a seccomp filter.
    /// Default: false.
    disable_sandbox: bool,
 }
--- a/devices/src/virtio/vhost/user/device/fs/sys/linux.rs
+++ b/devices/src/virtio/vhost/user/device/fs/sys/linux.rs
@ -10,6 +10,8 @@ use anyhow::Context;
 use base::linux::max_open_files;
 use base::RawDescriptor;
 use cros_async::Executor;
 use jail::create_base_minijail;
 use jail::set_embedded_bpf_program;
 use minijail::Minijail;
 use crate::virtio::vhost::user::device::fs::FsBackend;
@ -37,39 +39,47 @@ fn jail_and_fork(
    gid: u32,
    uid_map: Option<String>,
    gid_map: Option<String>,
    disable_sandbox: bool,
 ) -> anyhow::Result<i32> {
    // Create new minijail sandbox
    let mut j = Minijail::new()?;
    j.namespace_pids();
    j.namespace_user();
    j.namespace_user_disable_setgroups();
    if uid != 0 {
        j.change_uid(uid);
    }
    if gid != 0 {
        j.change_gid(gid);
    }
    j.uidmap(&uid_map.unwrap_or_else(default_uidmap))?;
    j.gidmap(&gid_map.unwrap_or_else(default_gidmap))?;
    j.run_as_init();
    j.namespace_vfs();
    j.namespace_net();
    j.no_new_privs();
    // Only pivot_root if we are not re-using the current root directory.
    if dir_path != Path::new("/") {
        // It's safe to call `namespace_vfs` multiple times.
        j.namespace_vfs();
        j.enter_pivot_root(&dir_path)?;
    }
    j.set_remount_mode(libc::MS_SLAVE);
    let limit = max_open_files().context("failed to get max open files")?;
-    j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit)?;
+    // Create new minijail sandbox
-    // vvu locks around 512k memory. Just give 1M.
+    let jail = if disable_sandbox {
-    j.set_rlimit(libc::RLIMIT_MEMLOCK as i32, 1 << 20, 1 << 20)?;
+        create_base_minijail(dir_path.as_path(), limit)?
    } else {
        let mut j: Minijail = Minijail::new()?;
        j.namespace_pids();
        j.namespace_user();
        j.namespace_user_disable_setgroups();
        if uid != 0 {
            j.change_uid(uid);
        }
        if gid != 0 {
            j.change_gid(gid);
        }
        j.uidmap(&uid_map.unwrap_or_else(default_uidmap))?;
        j.gidmap(&gid_map.unwrap_or_else(default_gidmap))?;
        j.run_as_init();
        j.namespace_vfs();
        j.namespace_net();
        j.no_new_privs();
        // Only pivot_root if we are not re-using the current root directory.
        if dir_path != Path::new("/") {
            // It's safe to call `namespace_vfs` multiple times.
            j.namespace_vfs();
            j.enter_pivot_root(&dir_path)?;
        }
        j.set_remount_mode(libc::MS_SLAVE);
        j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit)?;
        // vvu locks around 512k memory. Just give 1M.
        j.set_rlimit(libc::RLIMIT_MEMLOCK as i32, 1 << 20, 1 << 20)?;
        #[cfg(not(feature = "seccomp_trace"))]
        set_embedded_bpf_program(&mut j, "fs_device_vhost_user")?;
        j.use_seccomp_filter();
        j
    };
    // Make sure there are no duplicates in keep_rds
    keep_rds.sort_unstable();
@ -77,7 +87,7 @@ fn jail_and_fork(
    // fork on the jail here
    // SAFETY: trivially safe
-    let pid = unsafe { j.fork(Some(&keep_rds))? };
+    let pid = unsafe { jail.fork(Some(&keep_rds))? };
    if pid > 0 {
        // Current FS driver jail does not use seccomp and jail_and_fork() does not have other
@ -113,6 +123,7 @@ pub fn start_device(opts: Options) -> anyhow::Result<()> {
        opts.gid,
        opts.uid_map,
        opts.gid_map,
        opts.disable_sandbox,
    )?;
    // Parent, nothing to do but wait and then exit
--- a/jail/seccomp/aarch64/fs_device_vhost_user.policy
+++ b/jail/seccomp/aarch64/fs_device_vhost_user.policy
@ -0,0 +1,7 @@
 # Copyright 2024 The ChromiumOS Authors
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
@include /usr/share/policy/crosvm/vhost_user.policy
@include /usr/share/policy/crosvm/fs_device.policy
--- a/jail/seccomp/aarch64/vhost_user.policy
+++ b/jail/seccomp/aarch64/vhost_user.policy
@ -0,0 +1,14 @@
 # Copyright 2024 The ChromiumOS Authors
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 # Policy file for the vhost-user transport over a socket.
 # FIONBIO: for setting non-blocking mode over the socket.
 # TCGETS/TCSETS: used on FD 0, probably for serial.
 # b/239779171: try moving this to the serial device once we can extend ioctls across policy files.
 ioctl: arg1 == FIONBIO || arg1 == TCGETS || arg1 == TCSETS
 # For accepting a client connection over the socket.
 accept4: 1
 # For creating a socket if the specified socket path does not exist
 socketpair: arg0 == AF_UNIX
--- a/jail/seccomp/x86_64/fs_device_vhost_user.policy
+++ b/jail/seccomp/x86_64/fs_device_vhost_user.policy
@ -0,0 +1,7 @@
 # Copyright 2024 The ChromiumOS Authors
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
@include /usr/share/policy/crosvm/vhost_user.policy
@include /usr/share/policy/crosvm/fs_device.policy
--- a/jail/seccomp/x86_64/vhost_user.policy
+++ b/jail/seccomp/x86_64/vhost_user.policy
@ -10,3 +10,5 @@
 ioctl: arg1 == FIONBIO || arg1 == TCGETS || arg1 == TCSETS
 # For accepting a client connection over the socket.
 accept4: 1
 # For creating a socket if the specified socket path does not exist
 socketpair: arg0 == AF_UNIX
--- a/jail/src/helpers.rs
+++ b/jail/src/helpers.rs
@ -28,7 +28,6 @@ use zerocopy::AsBytes;
 use crate::config::JailConfig;
 #[cfg(not(feature = "seccomp_trace"))]
 static EMBEDDED_BPFS: Lazy<std::collections::HashMap<&str, Vec<u8>>> =
    Lazy::new(|| include!(concat!(env!("OUT_DIR"), "/bpf_includes.in")));
@ -288,20 +287,7 @@ pub fn create_sandbox_minijail(
                })?;
        }
    } else {
-        let bpf_program = EMBEDDED_BPFS
+        set_embedded_bpf_program(&mut jail, config.seccomp_policy_name)?;
            .get(&config.seccomp_policy_name)
            .with_context(|| {
                format!(
                    "failed to find embedded seccomp policy: {}",
                    &config.seccomp_policy_name
                )
            })?;
        jail.parse_seccomp_bytes(bpf_program).with_context(|| {
            format!(
                "failed to parse embedded seccomp policy: {}",
                &config.seccomp_policy_name
            )
        })?;
    }
    jail.use_seccomp_filter();
@ -482,3 +468,20 @@ fn add_current_user_to_jail(jail: &mut Minijail) -> Result<()> {
    }
    Ok(())
 }
 /// Load the embedded BPF seccomp program named `seccomp_policy_name` into `jail`.
 ///
 /// The program bytes are looked up by name in `EMBEDDED_BPFS` and passed to
 /// `parse_seccomp_bytes` on the jail. This function does not call
 /// `use_seccomp_filter`; callers do that separately.
 ///
 /// # Errors
 ///
 /// Returns an error if no embedded policy is registered under
 /// `seccomp_policy_name`, or if `parse_seccomp_bytes` rejects the program.
 pub fn set_embedded_bpf_program(jail: &mut Minijail, seccomp_policy_name: &str) -> Result<()> {
    // Context messages are built lazily, so nothing is formatted on the success path.
    let lookup_failed = || {
        format!(
            "failed to find embedded seccomp policy: {}",
            seccomp_policy_name
        )
    };
    let parse_failed = || {
        format!(
            "failed to parse embedded seccomp policy: {}",
            seccomp_policy_name
        )
    };

    let policy_bytes = EMBEDDED_BPFS
        .get(seccomp_policy_name)
        .with_context(lookup_failed)?;
    jail.parse_seccomp_bytes(policy_bytes)
        .with_context(parse_failed)?;
    Ok(())
 }