devices: fs: Allow running virito-fs without root

We usually requires the root permission or sandboxing to run
virtiofs to do bind-mount and pivot_root inside
of a mount namespace.

This CL allows non-root user wherein "root_dir" defaults to the
path provided via "--shared-dir". This override is currently
enabled when "fs_runtime_ugid_map" feature is enabled.

Bug=b:340940950
Test: Launch Terminal App with virtiofs enabled.
Verify virtiofs mounts on guest and run basic I/O tests.
Change-Id: Icc0065cb11dd919f473e5d3ee994e7f8d679ecd0
Signed-off-by: Akilesh Kailash <akailash@google.com>
Reviewed-on: https://chromium-review.googlesource.com/c/crosvm/crosvm/+/5945445
Reviewed-by: Keiichi Watanabe <keiichiw@chromium.org>
This commit is contained in:
Keiichi Watanabe 2024-10-21 03:47:36 +00:00 committed by crosvm LUCI
parent 4d70c7310d
commit f67ee911d9
6 changed files with 304 additions and 29 deletions

View file

@ -98,7 +98,7 @@ pub struct PermissionData {
/// umask to be set at runtime for the files in the path.
pub umask: libc::mode_t,
/// This is the relative path from the root of the shared directory.
/// This is the absolute path from the root of the shared directory.
pub perm_path: String,
}
@ -338,7 +338,7 @@ pub struct Config {
// host-uid: UID to be set for all the files in the path in the host.
// host-gid: GID to be set for all the files in the path in the host.
// umask: umask to be set at runtime for the files in the path.
// path: This is the relative path from the root of the shared directory.
// path: This is the absolute path from the root of the shared directory.
//
// This follows similar format to ARCVM IOCTL "FS_IOC_SETPERMISSION"
#[cfg(feature = "fs_runtime_ugid_map")]

View file

@ -9,6 +9,8 @@ use std::collections::btree_map;
use std::collections::BTreeMap;
use std::ffi::CStr;
use std::ffi::CString;
#[cfg(feature = "fs_runtime_ugid_map")]
use std::ffi::OsStr;
use std::fs::File;
use std::io;
use std::mem;
@ -16,6 +18,10 @@ use std::mem::size_of;
use std::mem::MaybeUninit;
use std::os::raw::c_int;
use std::os::raw::c_long;
#[cfg(feature = "fs_runtime_ugid_map")]
use std::os::unix::ffi::OsStrExt;
#[cfg(feature = "fs_runtime_ugid_map")]
use std::path::Path;
use std::ptr;
use std::ptr::addr_of;
use std::ptr::addr_of_mut;
@ -93,7 +99,6 @@ use crate::virtio::fs::multikey::MultikeyBTreeMap;
use crate::virtio::fs::read_dir::ReadDir;
const EMPTY_CSTR: &[u8] = b"\0";
const ROOT_CSTR: &[u8] = b"/\0";
const PROC_CSTR: &[u8] = b"/proc\0";
const UNLABELED_CSTR: &[u8] = b"unlabeled\0";
@ -730,6 +735,16 @@ pub struct PassthroughFs {
xattr_paths: RwLock<Vec<XattrData>>,
cfg: Config,
// Set the root directory when pivot root isn't enabled for jailed process.
//
// virtio-fs typically uses mount namespaces and pivot_root for file system isolation,
// making the jailed process's root directory "/".
//
// However, Android's security model prevents crosvm from having the necessary SYS_ADMIN
// capability for mount namespaces and pivot_root. This lack of isolation means that
// root_dir defaults to the path provided via "--shared-dir".
root_dir: String,
}
impl std::fmt::Debug for PassthroughFs {
@ -813,6 +828,7 @@ impl PassthroughFs {
#[cfg(feature = "arc_quota")]
xattr_paths: RwLock::new(Vec::new()),
cfg,
root_dir: "/".to_string(),
};
#[cfg(feature = "fs_runtime_ugid_map")]
@ -837,6 +853,21 @@ impl PassthroughFs {
}
}
#[cfg(feature = "fs_runtime_ugid_map")]
pub fn set_root_dir(&mut self, shared_dir: String) -> io::Result<()> {
let canonicalized_root = match std::fs::canonicalize(shared_dir) {
Ok(path) => path,
Err(e) => {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
format!("Failed to canonicalize root_dir: {}", e),
));
}
};
self.root_dir = canonicalized_root.to_string_lossy().to_string();
Ok(())
}
pub fn cfg(&self) -> &Config {
&self.cfg
}
@ -1704,9 +1735,12 @@ impl PassthroughFs {
#[cfg(feature = "fs_runtime_ugid_map")]
impl PassthroughFs {
/// Set permission according to path
fn set_ugid_permission(&self, st: &mut libc::stat64, path: &str) {
let is_root_path = path.is_empty();
fn find_and_set_ugid_permission(
&self,
st: &mut libc::stat64,
path: &str,
is_root_path: bool,
) -> bool {
for perm_data in self
.permission_paths
.read()
@ -1718,12 +1752,36 @@ impl PassthroughFs {
&& perm_data.perm_path != "/"
&& perm_data.need_set_permission(path))
{
st.st_uid = perm_data.guest_uid;
st.st_gid = perm_data.guest_gid;
st.st_mode = (st.st_mode & libc::S_IFMT) | (0o777 & !perm_data.umask);
return;
self.set_permission_from_data(st, perm_data);
return true;
}
}
false
}
fn set_permission_from_data(&self, st: &mut libc::stat64, perm_data: &PermissionData) {
st.st_uid = perm_data.guest_uid;
st.st_gid = perm_data.guest_gid;
st.st_mode = (st.st_mode & libc::S_IFMT) | (0o777 & !perm_data.umask);
}
/// Set permission according to path
fn set_ugid_permission(&self, st: &mut libc::stat64, path: &str) {
let is_root_path = path.is_empty();
if self.find_and_set_ugid_permission(st, path, is_root_path) {
return;
}
if let Some(perm_data) = self
.permission_paths
.read()
.expect("acquire permission_paths read lock")
.iter()
.find(|pd| pd.perm_path == "/")
{
self.set_permission_from_data(st, perm_data);
}
}
/// Set host uid/gid to configured value according to path
@ -1735,6 +1793,25 @@ impl PassthroughFs {
);
let is_root_path = path.is_empty();
if self.find_ugid_creds_for_path(&path, is_root_path).is_some() {
return self.find_ugid_creds_for_path(&path, is_root_path).unwrap();
}
if let Some(perm_data) = self
.permission_paths
.read()
.expect("acquire permission_paths read lock")
.iter()
.find(|pd| pd.perm_path == "/")
{
return (perm_data.host_uid, perm_data.host_gid);
}
(ctx.uid, ctx.gid)
}
fn find_ugid_creds_for_path(&self, path: &str, is_root_path: bool) -> Option<(u32, u32)> {
for perm_data in self
.permission_paths
.read()
@ -1744,13 +1821,12 @@ impl PassthroughFs {
if (is_root_path && perm_data.perm_path == "/")
|| (!is_root_path
&& perm_data.perm_path != "/"
&& perm_data.need_set_permission(&path))
&& perm_data.need_set_permission(path))
{
return (perm_data.host_uid, perm_data.host_gid);
return Some((perm_data.host_uid, perm_data.host_gid));
}
}
(ctx.uid, ctx.gid)
None
}
}
@ -2053,9 +2129,10 @@ impl FileSystem for PassthroughFs {
type DirIter = ReadDir<Box<[u8]>>;
fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
// SAFETY: this is a constant value that is a nul-terminated string without interior
// nul bytes.
let root = unsafe { CStr::from_bytes_with_nul_unchecked(ROOT_CSTR) };
let mut root_str_with_null: Vec<u8> = self.root_dir.clone().into_bytes();
root_str_with_null.push(0u8);
// SAFETY: this is a nul-terminated string without interior nul bytes.
let root = unsafe { CStr::from_bytes_with_nul_unchecked(&root_str_with_null) };
let flags = libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
// SAFETY: this doesn't modify any memory and we check the return value.
@ -2916,6 +2993,17 @@ impl FileSystem for PassthroughFs {
})?;
buf.resize(res as usize, 0);
#[cfg(feature = "fs_runtime_ugid_map")]
{
let link_target = Path::new(OsStr::from_bytes(&buf[..res as usize]));
if !link_target.starts_with(&self.root_dir) {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"Symbolic link points outside of root_dir",
));
}
}
Ok(buf)
}

View file

@ -47,7 +47,13 @@ struct FsBackend {
}
impl FsBackend {
pub fn new(tag: &str, cfg: Option<Config>) -> anyhow::Result<Self> {
#[allow(unused_variables)]
pub fn new(
tag: &str,
shared_dir: &str,
skip_pivot_root: bool,
cfg: Option<Config>,
) -> anyhow::Result<Self> {
if tag.len() > FS_MAX_TAG_LEN {
bail!(
"fs tag is too long: {} (max supported: {})",
@ -60,7 +66,12 @@ impl FsBackend {
| 1 << VHOST_USER_F_PROTOCOL_FEATURES;
// Use default passthroughfs config
let fs = PassthroughFs::new(tag, cfg.unwrap_or_default())?;
#[allow(unused_mut)]
let mut fs = PassthroughFs::new(tag, cfg.unwrap_or_default())?;
#[cfg(feature = "fs_runtime_ugid_map")]
if skip_pivot_root {
fs.set_root_dir(shared_dir.to_string())?;
}
let mut keep_rds: Vec<RawDescriptor> = [0, 1, 2].to_vec();
keep_rds.append(&mut fs.keep_rds());
@ -221,4 +232,17 @@ pub struct Options {
/// a new mount namespace and run without seccomp filter.
/// Default: false.
disable_sandbox: bool,
#[argh(option, arg_name = "skip_pivot_root", default = "false")]
/// disable pivot_root when process is jailed.
///
/// virtio-fs typically uses mount namespaces and pivot_root for file system isolation,
/// making the jailed process's root directory "/".
///
/// Android's security model restricts crosvm's access to certain system capabilities,
/// specifically those related to managing mount namespaces and using pivot_root.
/// These capabilities are typically associated with the SYS_ADMIN capability.
/// To maintain a secure environment, Android relies on mechanisms like SELinux to
/// enforce isolation and control access to directories.
#[allow(dead_code)]
skip_pivot_root: bool,
}

View file

@ -13,6 +13,7 @@ use base::AsRawDescriptors;
use base::RawDescriptor;
use cros_async::Executor;
use jail::create_base_minijail;
use jail::create_base_minijail_without_pivot_root;
use jail::set_embedded_bpf_program;
use minijail::Minijail;
@ -41,13 +42,18 @@ fn jail_and_fork(
uid_map: Option<String>,
gid_map: Option<String>,
disable_sandbox: bool,
pivot_root: bool,
) -> anyhow::Result<i32> {
let limit = max_open_files()
.context("failed to get max open files")?
.rlim_max;
// Create new minijail sandbox
let jail = if disable_sandbox {
create_base_minijail(dir_path.as_path(), limit)?
if pivot_root {
create_base_minijail(dir_path.as_path(), limit)
} else {
create_base_minijail_without_pivot_root(dir_path.as_path(), limit)
}?
} else {
let mut j: Minijail = Minijail::new()?;
j.namespace_pids();
@ -108,15 +114,29 @@ fn jail_and_fork(
/// Starts a vhost-user fs device.
/// Returns an error if the given `args` is invalid or the device fails to run.
pub fn start_device(opts: Options) -> anyhow::Result<()> {
#[allow(unused_mut)]
pub fn start_device(mut opts: Options) -> anyhow::Result<()> {
#[allow(unused_mut)]
let mut is_pivot_root_required = true;
#[cfg(feature = "fs_runtime_ugid_map")]
if let Some(ref cfg) = opts.cfg {
if !cfg.ugid_map.is_empty() && !opts.disable_sandbox {
bail!("uid_gid_map can only be set with disable sandbox option");
if let Some(ref mut cfg) = opts.cfg {
if !cfg.ugid_map.is_empty() && (!opts.disable_sandbox || !opts.skip_pivot_root) {
bail!("uid_gid_map can only be set with disable sandbox and skip_pivot_root option");
}
if opts.skip_pivot_root {
is_pivot_root_required = false;
}
}
let ex = Executor::new().context("Failed to create executor")?;
let fs_device = FsBackend::new(&opts.tag, opts.cfg)?;
let fs_device = FsBackend::new(
&opts.tag,
opts.shared_dir
.to_str()
.expect("Failed to convert opts.shared_dir to str()"),
opts.skip_pivot_root,
opts.cfg,
)?;
let mut keep_rds = fs_device.keep_rds.clone();
keep_rds.append(&mut ex.as_raw_descriptors());
@ -128,7 +148,6 @@ pub fn start_device(opts: Options) -> anyhow::Result<()> {
base::syslog::push_descriptors(&mut keep_rds);
cros_tracing::push_descriptors!(&mut keep_rds);
metrics::push_descriptors(&mut keep_rds);
let pid = jail_and_fork(
keep_rds,
opts.shared_dir,
@ -137,6 +156,7 @@ pub fn start_device(opts: Options) -> anyhow::Result<()> {
opts.uid_map,
opts.gid_map,
opts.disable_sandbox,
is_pivot_root_required,
)?;
// Parent, nothing to do but wait and then exit

View file

@ -213,3 +213,112 @@ fn vhost_user_fs_mount_rw() {
mount_rw(vm, tag, temp_dir);
}
fn copy_file_validate_ugid_mapping(
mut vm: TestVm,
tag: &str,
dir: TempDir,
mapped_uid: u32,
mapped_gid: u32,
) {
use std::os::linux::fs::MetadataExt;
const ORIGINAL_FILE_NAME: &str = "original.txt";
const NEW_FILE_NAME: &str = "new.txt";
const TEST_DATA: &str = "Hello world!";
let orig_file = dir.path().join(ORIGINAL_FILE_NAME);
std::fs::write(orig_file, TEST_DATA).unwrap();
vm.exec_in_guest(&format!(
"mount -t virtiofs {tag} /mnt && cp /mnt/{} /mnt/{} && sync",
ORIGINAL_FILE_NAME, NEW_FILE_NAME,
))
.unwrap();
let output = vm
.exec_in_guest(&format!("stat /mnt/{}", ORIGINAL_FILE_NAME,))
.unwrap();
assert!(output.stdout.contains(&format!("Uid: ({}/", mapped_uid)));
assert!(output.stdout.contains(&format!("Gid: ({}/", mapped_gid)));
let new_file = dir.path().join(NEW_FILE_NAME);
let output_stat = std::fs::metadata(new_file.clone());
assert_eq!(
output_stat
.as_ref()
.expect("stat of new_file failed")
.st_uid(),
base::geteuid()
);
assert_eq!(
output_stat
.as_ref()
.expect("stat of new_file failed")
.st_gid(),
base::getegid()
);
let contents = std::fs::read(new_file).unwrap();
assert_eq!(TEST_DATA.as_bytes(), &contents);
}
pub fn create_ugid_map_config(
socket: &Path,
shared_dir: &Path,
tag: &str,
mapped_uid: u32,
mapped_gid: u32,
) -> VuConfig {
let socket_path = socket.to_str().unwrap();
let shared_dir_path = shared_dir.to_str().unwrap();
let uid = base::geteuid();
let gid = base::getegid();
let ugid_map_value = format!("{} {} {} {} 7 /", mapped_uid, mapped_gid, uid, gid,);
let cfg_arg = format!("writeback=true,ugid_map='{}'", ugid_map_value);
println!("socket={socket_path}, tag={tag}, shared_dir={shared_dir_path}");
VuConfig::new(CmdType::Device, "vhost-user-fs").extra_args(vec![
"fs".to_string(),
format!("--socket-path={socket_path}"),
format!("--shared-dir={shared_dir_path}"),
format!("--tag={tag}"),
format!("--cfg={cfg_arg}"),
format!("--disable-sandbox"),
format!("--skip-pivot-root=true"),
])
}
/// Tests file copy with disabled sandbox
///
/// 1. Create `original.txt` on a temporal directory.
/// 3: Setup ugid_map for vhost-user-fs backend
/// 2. Start a VM with a virtiofs device for the temporal directory.
/// 3. Copy `original.txt` to `new.txt` in the guest.
/// 4. Check that `new.txt` is created in the host.
/// 5: Verify the UID/GID of the files both in the guest and the host.
#[test]
fn vhost_user_fs_without_sandbox_and_pivot_root() {
let socket = NamedTempFile::new().unwrap();
let temp_dir = tempfile::tempdir().unwrap();
let config = Config::new();
let tag = "android";
let mapped_uid = 123456;
let mapped_gid = 12345;
let vu_config =
create_ugid_map_config(socket.path(), temp_dir.path(), tag, mapped_uid, mapped_gid);
let _vu_device = VhostUserBackend::new(vu_config).unwrap();
let config = config.with_vhost_user_fs(socket.path(), tag);
let vm = TestVm::new(config).unwrap();
copy_file_validate_ugid_mapping(vm, tag, temp_dir, mapped_uid, mapped_gid);
}

View file

@ -120,13 +120,11 @@ pub fn create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail
bail!("{:?} is not absolute path", root);
}
// All child jails run in a new user namespace without any users mapped, they run as nobody
// unless otherwise configured.
let mut jail = Minijail::new().context("failed to jail device")?;
// Only pivot_root if we are not re-using the current root directory.
if root != Path::new("/") {
// It's safe to call `namespace_vfs` multiple times.
// Run in a new mount namespace.
jail.namespace_vfs();
jail.enter_pivot_root(root)
.context("failed to pivot root device")?;
@ -138,6 +136,42 @@ pub fn create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail
Ok(jail)
}
/// Creates a [Minijail] instance which just invokes a jail process and sets
/// `max_open_files` using `RLIMIT_NOFILE`. This is helpful with crosvm process
/// runs as a non-root user without SYS_ADMIN capabilities.
///
/// Unlike `create_base_minijail`, this function doesn't call `pivot_root`
/// and `mount namespace`. So, it runs as a non-root user without
/// SYS_ADMIN capabilities.
///
/// Note that since there is no file system isolation provided by this function,
/// caller of this function should enforce other security mechanisum such as selinux
/// on the host to protect directories.
///
/// # Arguments
///
/// * `root` - The root path to checked before the process is jailed
/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
#[allow(clippy::unnecessary_cast)]
pub fn create_base_minijail_without_pivot_root(
root: &Path,
max_open_files: u64,
) -> Result<Minijail> {
// Validate new root directory. Path::is_dir() also checks the existence.
if !root.is_dir() {
bail!("{:?} is not a directory, cannot create jail", root);
}
if !root.is_absolute() {
bail!("{:?} is not absolute path", root);
}
let mut jail = Minijail::new().context("failed to jail device")?;
jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
.context("error setting max open files")?;
Ok(jail)
}
/// Creates a [Minijail] instance which creates a sandbox.
///
/// # Arguments