diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index 3129e6a432..91df4773ff 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -8,14 +8,14 @@ The principle characteristics of crosvm are:
 - Written in Rust for security and safety
 
 A typical session of crosvm starts in `main.rs` where command line parsing is done to build up a
-`Config` structure. The `Config` is used by `run_config` in `linux.rs` to setup and execute a VM.
-Broken down into rough steps:
+`Config` structure. The `Config` is used by `run_config` in `linux/mod.rs` to set up and execute a
+VM. Broken down into rough steps:
 
 1. Load the linux kernel from an ELF file.
 1. Create a handful of control sockets used by the virtual devices.
 1. Invoke the architecture specific VM builder `Arch::build_vm` (located in `x86_64/src/lib.rs` or
    `aarch64/src/lib.rs`).
-1. `Arch::build_vm` will itself invoke the provided `create_devices` function from `linux.rs`
+1. `Arch::build_vm` will itself invoke the provided `create_devices` function from `linux/mod.rs`
 1. `create_devices` creates every PCI device, including the virtio devices, that were configured in
    `Config`, along with matching [minijail] configs for each.
 1. `Arch::generate_pci_root`, using a list of every PCI device with optional `Minijail`, will
@@ -35,12 +35,12 @@ invalid.
 
 ## Sandboxing Policy
 
-Every sandbox is made with [minijail] and starts with `create_base_minijail` in `linux.rs` which set
-some very restrictive settings. Linux namespaces and seccomp filters are used extensively. Each
-seccomp policy can be found under `seccomp/{arch}/{device}.policy` and should start by
-`@include`-ing the `common_device.policy`. With the exception of architecture specific devices (such
-as `Pl030` on ARM or `I8042` on x86_64), every device will need a different policy for each
-supported architecture.
+Every sandbox is made with [minijail] and starts with `create_base_minijail` in
+`linux/jail_helpers.rs`, which sets some very restrictive settings. Linux namespaces and seccomp
+filters are used extensively. Each seccomp policy can be found under
+`seccomp/{arch}/{device}.policy` and should start by `@include`-ing the `common_device.policy`. With
+the exception of architecture specific devices (such as `Pl030` on ARM or `I8042` on x86_64), every
+device will need a different policy for each supported architecture.
 
 ## The VM Control Sockets
 
diff --git a/docs/book/src/appendix/minijail.md b/docs/book/src/appendix/minijail.md
index 0408261fe7..9a48906558 100644
--- a/docs/book/src/appendix/minijail.md
+++ b/docs/book/src/appendix/minijail.md
@@ -8,8 +8,8 @@ The fact that minijail was written, maintained, and continuously tested by a pro
 team more than makes up for its being written in an memory unsafe language.
 
 The exact configuration of the sandbox varies by device, but they are mostly alike. See
-`create_base_minijail` from `linux.rs`. The set of security constraints explicitly used in crosvm
-are:
+`create_base_minijail` from `linux/jail_helpers.rs`. The set of security constraints explicitly used
+in crosvm are:
 
 - PID Namespace
   - Runs as init
diff --git a/src/linux/device_helpers.rs b/src/linux/device_helpers.rs
new file mode 100644
index 0000000000..7c80e02135
--- /dev/null
+++ b/src/linux/device_helpers.rs
@@ -0,0 +1,1147 @@
+// Copyright 2017 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
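+
+//! Helpers for constructing the individual virtio, vhost-user, and VFIO devices together with
+//! their per-device minijail sandboxes during VM setup (see "Sandboxing Policy" in
+//! ARCHITECTURE.md).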
+ +use std::collections::BTreeMap; +use std::convert::TryFrom; +use std::fs::{File, OpenOptions}; +use std::net::Ipv4Addr; +use std::os::unix::net::UnixListener; +use std::os::unix::{io::FromRawFd, net::UnixStream, prelude::OpenOptionsExt}; +use std::path::{Path, PathBuf}; +use std::str; +use std::sync::Arc; + +use anyhow::{anyhow, bail, Context, Result}; +use base::*; +use devices::serial_device::SerialParameters; +use devices::vfio::{VfioCommonSetup, VfioCommonTrait}; +#[cfg(feature = "audio_cras")] +use devices::virtio::snd::cras_backend::Parameters as CrasSndParameters; +use devices::virtio::vhost::user::proxy::VirtioVhostUser; +#[cfg(feature = "audio")] +use devices::virtio::vhost::user::vmm::Snd as VhostUserSnd; +use devices::virtio::vhost::user::vmm::{ + Block as VhostUserBlock, Console as VhostUserConsole, Fs as VhostUserFs, + Mac80211Hwsim as VhostUserMac80211Hwsim, Net as VhostUserNet, Vsock as VhostUserVsock, + Wl as VhostUserWl, +}; +#[cfg(any(feature = "video-decoder", feature = "video-encoder"))] +use devices::virtio::VideoBackendType; +use devices::virtio::{self, Console, VirtioDevice}; +use devices::IommuDevType; +use devices::{self, PciDevice, VfioContainer, VfioDevice, VfioPciDevice, VfioPlatformDevice}; +use hypervisor::Vm; +use minijail::{self, Minijail}; +use net_util::{MacAddress, Tap}; +use resources::{Alloc, MmioType, SystemAllocator}; +use sync::Mutex; +use vm_memory::GuestAddress; + +use crate::{ + Config, DiskOption, TouchDeviceOption, VhostUserFsOption, VhostUserOption, VhostUserWlOption, + VhostVsockDeviceParameter, +}; +use arch::{self, VirtioDeviceStub}; + +use super::jail_helpers::*; + +pub enum TaggedControlTube { + Fs(Tube), + Vm(Tube), + VmMemory(Tube), + VmIrq(Tube), + VmMsync(Tube), +} + +impl AsRef for TaggedControlTube { + fn as_ref(&self) -> &Tube { + use self::TaggedControlTube::*; + match &self { + Fs(tube) | Vm(tube) | VmMemory(tube) | VmIrq(tube) | VmMsync(tube) => tube, + } + } +} + +impl AsRawDescriptor for TaggedControlTube { + fn as_raw_descriptor(&self) -> RawDescriptor { + self.as_ref().as_raw_descriptor() + } +} + +pub trait IntoUnixStream { + fn into_unix_stream(self) -> Result; +} + +impl<'a> IntoUnixStream for &'a Path { + fn into_unix_stream(self) -> Result { + if let Some(fd) = safe_descriptor_from_path(self).context("failed to open event device")? { + Ok(fd.into()) + } else { + UnixStream::connect(self).context("failed to open event device") + } + } +} + +impl<'a> IntoUnixStream for &'a PathBuf { + fn into_unix_stream(self) -> Result { + self.as_path().into_unix_stream() + } +} + +impl IntoUnixStream for UnixStream { + fn into_unix_stream(self) -> Result { + Ok(self) + } +} + +pub type DeviceResult = Result; + +pub fn create_block_device( + cfg: &Config, + disk: &DiskOption, + disk_device_tube: Tube, +) -> DeviceResult { + let raw_image: File = open_file(&disk.path, disk.read_only, disk.o_direct) + .with_context(|| format!("failed to load disk image {}", disk.path.display()))?; + // Lock the disk image to prevent other crosvm instances from using it. + let lock_op = if disk.read_only { + FlockOperation::LockShared + } else { + FlockOperation::LockExclusive + }; + flock(&raw_image, lock_op, true).context("failed to lock disk image")?; + + info!("Trying to attach block device: {}", disk.path.display()); + let dev = if disk::async_ok(&raw_image).context("failed to check disk async_ok")? 
{ + let async_file = disk::create_async_disk_file(raw_image) + .context("failed to create async virtual disk")?; + Box::new( + virtio::BlockAsync::new( + virtio::base_features(cfg.protected_vm), + async_file, + disk.read_only, + disk.sparse, + disk.block_size, + disk.id, + Some(disk_device_tube), + ) + .context("failed to create block device")?, + ) as Box + } else { + let disk_file = disk::create_disk_file(raw_image, disk::MAX_NESTING_DEPTH) + .context("failed to create virtual disk")?; + Box::new( + virtio::Block::new( + virtio::base_features(cfg.protected_vm), + disk_file, + disk.read_only, + disk.sparse, + disk.block_size, + disk.id, + Some(disk_device_tube), + ) + .context("failed to create block device")?, + ) as Box + }; + + Ok(VirtioDeviceStub { + dev, + jail: simple_jail(cfg, "block_device")?, + }) +} + +pub fn create_vhost_user_block_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { + let dev = VhostUserBlock::new(virtio::base_features(cfg.protected_vm), &opt.socket) + .context("failed to set up vhost-user block device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. + jail: None, + }) +} + +pub fn create_vhost_user_console_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { + let dev = VhostUserConsole::new(virtio::base_features(cfg.protected_vm), &opt.socket) + .context("failed to set up vhost-user console device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. + jail: None, + }) +} + +pub fn create_vhost_user_fs_device(cfg: &Config, option: &VhostUserFsOption) -> DeviceResult { + let dev = VhostUserFs::new( + virtio::base_features(cfg.protected_vm), + &option.socket, + &option.tag, + ) + .context("failed to set up vhost-user fs device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. + jail: None, + }) +} + +pub fn create_vhost_user_mac80211_hwsim_device( + cfg: &Config, + opt: &VhostUserOption, +) -> DeviceResult { + let dev = VhostUserMac80211Hwsim::new(virtio::base_features(cfg.protected_vm), &opt.socket) + .context("failed to set up vhost-user mac80211_hwsim device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. + jail: None, + }) +} + +#[cfg(feature = "audio")] +pub fn create_vhost_user_snd_device(cfg: &Config, option: &VhostUserOption) -> DeviceResult { + let dev = VhostUserSnd::new(virtio::base_features(cfg.protected_vm), &option.socket) + .context("failed to set up vhost-user snd device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. 
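+        // The vhost-user backend process is expected to apply its own sandboxing.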
+ jail: None, + }) +} + +pub fn create_vvu_proxy_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { + let listener = UnixListener::bind(&opt.socket).map_err(|e| { + error!("failed to bind listener for vvu proxy device: {}", e); + e + })?; + + let dev = VirtioVhostUser::new(virtio::base_features(cfg.protected_vm), listener) + .context("failed to create VVU proxy device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "vvu_proxy_device")?, + }) +} + +pub fn create_rng_device(cfg: &Config) -> DeviceResult { + let dev = virtio::Rng::new(virtio::base_features(cfg.protected_vm)) + .context("failed to set up rng")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "rng_device")?, + }) +} + +#[cfg(feature = "audio_cras")] +pub fn create_cras_snd_device(cfg: &Config, cras_snd: CrasSndParameters) -> DeviceResult { + let dev = virtio::snd::cras_backend::VirtioSndCras::new( + virtio::base_features(cfg.protected_vm), + cras_snd, + ) + .context("failed to create cras sound device")?; + + let jail = match simple_jail(&cfg, "cras_snd_device")? { + Some(mut jail) => { + // Create a tmpfs in the device's root directory for cras_snd_device. + // The size is 20*1024, or 20 KB. + jail.mount_with_data( + Path::new("none"), + Path::new("/"), + "tmpfs", + (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, + "size=20480", + )?; + + let run_cras_path = Path::new("/run/cras"); + jail.mount_bind(run_cras_path, run_cras_path, true)?; + + add_current_user_to_jail(&mut jail)?; + + Some(jail) + } + None => None, + }; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail, + }) +} + +#[cfg(feature = "tpm")] +pub fn create_tpm_device(cfg: &Config) -> DeviceResult { + use std::ffi::CString; + use std::fs; + use std::process; + + let tpm_storage: PathBuf; + let mut tpm_jail = simple_jail(cfg, "tpm_device")?; + + match &mut tpm_jail { + Some(jail) => { + // Create a tmpfs in the device's root directory for tpm + // simulator storage. The size is 20*1024, or 20 KB. + jail.mount_with_data( + Path::new("none"), + Path::new("/"), + "tmpfs", + (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, + "size=20480", + )?; + + let crosvm_ids = add_current_user_to_jail(jail)?; + + let pid = process::id(); + let tpm_pid_dir = format!("/run/vm/tpm.{}", pid); + tpm_storage = Path::new(&tpm_pid_dir).to_owned(); + fs::create_dir_all(&tpm_storage).with_context(|| { + format!("failed to create tpm storage dir {}", tpm_storage.display()) + })?; + let tpm_pid_dir_c = CString::new(tpm_pid_dir).expect("no nul bytes"); + chown(&tpm_pid_dir_c, crosvm_ids.uid, crosvm_ids.gid) + .context("failed to chown tpm storage")?; + + jail.mount_bind(&tpm_storage, &tpm_storage, true)?; + } + None => { + // Path used inside cros_sdk which does not have /run/vm. 
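+            // Without a jail, the simulator state is kept directly at this host path.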
+ tpm_storage = Path::new("/tmp/tpm-simulator").to_owned(); + } + } + + let dev = virtio::Tpm::new(tpm_storage); + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: tpm_jail, + }) +} + +pub fn create_single_touch_device( + cfg: &Config, + single_touch_spec: &TouchDeviceOption, + idx: u32, +) -> DeviceResult { + let socket = single_touch_spec + .get_path() + .into_unix_stream() + .map_err(|e| { + error!("failed configuring virtio single touch: {:?}", e); + e + })?; + + let (width, height) = single_touch_spec.get_size(); + let dev = virtio::new_single_touch( + idx, + socket, + width, + height, + virtio::base_features(cfg.protected_vm), + ) + .context("failed to set up input device")?; + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "input_device")?, + }) +} + +pub fn create_multi_touch_device( + cfg: &Config, + multi_touch_spec: &TouchDeviceOption, + idx: u32, +) -> DeviceResult { + let socket = multi_touch_spec + .get_path() + .into_unix_stream() + .map_err(|e| { + error!("failed configuring virtio multi touch: {:?}", e); + e + })?; + + let (width, height) = multi_touch_spec.get_size(); + let dev = virtio::new_multi_touch( + idx, + socket, + width, + height, + virtio::base_features(cfg.protected_vm), + ) + .context("failed to set up input device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "input_device")?, + }) +} + +pub fn create_trackpad_device( + cfg: &Config, + trackpad_spec: &TouchDeviceOption, + idx: u32, +) -> DeviceResult { + let socket = trackpad_spec.get_path().into_unix_stream().map_err(|e| { + error!("failed configuring virtio trackpad: {:#}", e); + e + })?; + + let (width, height) = trackpad_spec.get_size(); + let dev = virtio::new_trackpad( + idx, + socket, + width, + height, + virtio::base_features(cfg.protected_vm), + ) + .context("failed to set up input device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "input_device")?, + }) +} + +pub fn create_mouse_device( + cfg: &Config, + mouse_socket: T, + idx: u32, +) -> DeviceResult { + let socket = mouse_socket.into_unix_stream().map_err(|e| { + error!("failed configuring virtio mouse: {:#}", e); + e + })?; + + let dev = virtio::new_mouse(idx, socket, virtio::base_features(cfg.protected_vm)) + .context("failed to set up input device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "input_device")?, + }) +} + +pub fn create_keyboard_device( + cfg: &Config, + keyboard_socket: T, + idx: u32, +) -> DeviceResult { + let socket = keyboard_socket.into_unix_stream().map_err(|e| { + error!("failed configuring virtio keyboard: {:#}", e); + e + })?; + + let dev = virtio::new_keyboard(idx, socket, virtio::base_features(cfg.protected_vm)) + .context("failed to set up input device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "input_device")?, + }) +} + +pub fn create_switches_device( + cfg: &Config, + switches_socket: T, + idx: u32, +) -> DeviceResult { + let socket = switches_socket.into_unix_stream().map_err(|e| { + error!("failed configuring virtio switches: {:#}", e); + e + })?; + + let dev = virtio::new_switches(idx, socket, virtio::base_features(cfg.protected_vm)) + .context("failed to set up input device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "input_device")?, + }) +} + +pub fn create_vinput_device(cfg: &Config, dev_path: &Path) -> DeviceResult { + let dev_file = OpenOptions::new() + .read(true) + .write(true) + .open(dev_path) + .with_context(|| 
format!("failed to open vinput device {}", dev_path.display()))?; + + let dev = virtio::new_evdev(dev_file, virtio::base_features(cfg.protected_vm)) + .context("failed to set up input device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "input_device")?, + }) +} + +pub fn create_balloon_device( + cfg: &Config, + tube: Tube, + inflate_tube: Option, + init_balloon_size: u64, +) -> DeviceResult { + let dev = virtio::Balloon::new( + virtio::base_features(cfg.protected_vm), + tube, + inflate_tube, + init_balloon_size, + ) + .context("failed to create balloon")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "balloon_device")?, + }) +} + +/// Generic method for creating a network device. `create_device` is a closure that takes the virtio +/// features and number of queue pairs as parameters, and is responsible for creating the device +/// itself. +pub fn create_net_device(cfg: &Config, policy: &str, create_device: F) -> DeviceResult +where + F: Fn(u64, u16) -> Result, + T: VirtioDevice + 'static, +{ + let mut vq_pairs = cfg.net_vq_pairs.unwrap_or(1); + let vcpu_count = cfg.vcpu_count.unwrap_or(1); + if vcpu_count < vq_pairs as usize { + warn!("the number of net vq pairs must not exceed the vcpu count, falling back to single queue mode"); + vq_pairs = 1; + } + let features = virtio::base_features(cfg.protected_vm); + + let dev = create_device(features, vq_pairs)?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev) as Box, + jail: simple_jail(cfg, policy)?, + }) +} + +/// Returns a network device created from a new TAP interface configured with `host_ip`, `netmask`, +/// and `mac_address`. +pub fn create_net_device_from_config( + cfg: &Config, + host_ip: Ipv4Addr, + netmask: Ipv4Addr, + mac_address: MacAddress, +) -> DeviceResult { + let policy = if cfg.vhost_net { + "vhost_net_device" + } else { + "net_device" + }; + + if cfg.vhost_net { + create_net_device(cfg, policy, |features, _vq_pairs| { + virtio::vhost::Net::>::new( + &cfg.vhost_net_device_path, + features, + host_ip, + netmask, + mac_address, + ) + .context("failed to set up vhost networking") + }) + } else { + create_net_device(cfg, policy, |features, vq_pairs| { + virtio::Net::::new(features, host_ip, netmask, mac_address, vq_pairs) + .context("failed to create virtio network device") + }) + } +} + +/// Returns a network device from a file descriptor to a configured TAP interface. +pub fn create_tap_net_device_from_fd(cfg: &Config, tap_fd: RawDescriptor) -> DeviceResult { + create_net_device(cfg, "net_device", |features, vq_pairs| { + // Safe because we ensure that we get a unique handle to the fd. + let tap = unsafe { + Tap::from_raw_descriptor( + validate_raw_descriptor(tap_fd).context("failed to validate tap descriptor")?, + ) + .context("failed to create tap device")? + }; + + virtio::Net::from(features, tap, vq_pairs).context("failed to create tap net device") + }) +} + +/// Returns a network device created by opening the persistent, configured TAP interface `tap_name`. 
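+///
+/// Unlike `create_tap_net_device_from_fd`, this opens the interface by name instead of receiving
+/// an already-open descriptor from the caller.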
+pub fn create_tap_net_device_from_name(cfg: &Config, tap_name: &[u8]) -> DeviceResult { + create_net_device(cfg, "net_device", |features, vq_pairs| { + virtio::Net::::new_from_name(features, tap_name, vq_pairs) + .context("failed to create configured virtio network device") + }) +} + +pub fn create_vhost_user_net_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { + let dev = VhostUserNet::new(virtio::base_features(cfg.protected_vm), &opt.socket) + .context("failed to set up vhost-user net device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. + jail: None, + }) +} + +pub fn create_vhost_user_vsock_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { + let dev = VhostUserVsock::new(virtio::base_features(cfg.protected_vm), &opt.socket) + .context("failed to set up vhost-user vsock device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. + jail: None, + }) +} + +pub fn create_vhost_user_wl_device(cfg: &Config, opt: &VhostUserWlOption) -> DeviceResult { + // The crosvm wl device expects us to connect the tube before it will accept a vhost-user + // connection. + let dev = VhostUserWl::new(virtio::base_features(cfg.protected_vm), &opt.socket) + .context("failed to set up vhost-user wl device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. + jail: None, + }) +} + +pub fn create_wayland_device( + cfg: &Config, + control_tube: Tube, + resource_bridge: Option, +) -> DeviceResult { + let wayland_socket_dirs = cfg + .wayland_socket_paths + .iter() + .map(|(_name, path)| path.parent()) + .collect::>>() + .ok_or_else(|| anyhow!("wayland socket path has no parent or file name"))?; + + let features = virtio::base_features(cfg.protected_vm); + let dev = virtio::Wl::new( + features, + cfg.wayland_socket_paths.clone(), + control_tube, + resource_bridge, + ) + .context("failed to create wayland device")?; + + let jail = match simple_jail(cfg, "wl_device")? { + Some(mut jail) => { + // Create a tmpfs in the device's root directory so that we can bind mount the wayland + // socket directory into it. The size=67108864 is size=64*1024*1024 or size=64MB. + jail.mount_with_data( + Path::new("none"), + Path::new("/"), + "tmpfs", + (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, + "size=67108864", + )?; + + // Bind mount the wayland socket's directory into jail's root. This is necessary since + // each new wayland context must open() the socket. If the wayland socket is ever + // destroyed and remade in the same host directory, new connections will be possible + // without restarting the wayland device. + for dir in &wayland_socket_dirs { + jail.mount_bind(dir, dir, true)?; + } + add_current_user_to_jail(&mut jail)?; + + Some(jail) + } + None => None, + }; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail, + }) +} + +#[cfg(any(feature = "video-decoder", feature = "video-encoder"))] +pub fn create_video_device( + backend: VideoBackendType, + cfg: &Config, + typ: devices::virtio::VideoDeviceType, + resource_bridge: Tube, +) -> DeviceResult { + let jail = match simple_jail(cfg, "video_device")? 
{ + Some(mut jail) => { + match typ { + #[cfg(feature = "video-decoder")] + devices::virtio::VideoDeviceType::Decoder => add_current_user_to_jail(&mut jail)?, + #[cfg(feature = "video-encoder")] + devices::virtio::VideoDeviceType::Encoder => add_current_user_to_jail(&mut jail)?, + }; + + // Create a tmpfs in the device's root directory so that we can bind mount files. + jail.mount_with_data( + Path::new("none"), + Path::new("/"), + "tmpfs", + (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, + "size=67108864", + )?; + + #[cfg(feature = "libvda")] + // Render node for libvda. + if backend == VideoBackendType::Libvda || backend == VideoBackendType::LibvdaVd { + // follow the implementation at: + // https://chromium.googlesource.com/chromiumos/platform/minigbm/+/c06cc9cccb3cf3c7f9d2aec706c27c34cd6162a0/cros_gralloc/cros_gralloc_driver.cc#90 + const DRM_NUM_NODES: u32 = 63; + const DRM_RENDER_NODE_START: u32 = 128; + for offset in 0..DRM_NUM_NODES { + let path_str = format!("/dev/dri/renderD{}", DRM_RENDER_NODE_START + offset); + let dev_dri_path = Path::new(&path_str); + if !dev_dri_path.exists() { + break; + } + jail.mount_bind(dev_dri_path, dev_dri_path, false)?; + } + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + // Device nodes used by libdrm through minigbm in libvda on AMD devices. + let sys_dev_char_path = Path::new("/sys/dev/char"); + jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?; + let sys_devices_path = Path::new("/sys/devices"); + jail.mount_bind(sys_devices_path, sys_devices_path, false)?; + + // Required for loading dri libraries loaded by minigbm on AMD devices. + jail_mount_bind_if_exists(&mut jail, &["/usr/lib64"])?; + } + + // Device nodes required by libchrome which establishes Mojo connection in libvda. + let dev_urandom_path = Path::new("/dev/urandom"); + jail.mount_bind(dev_urandom_path, dev_urandom_path, false)?; + let system_bus_socket_path = Path::new("/run/dbus/system_bus_socket"); + jail.mount_bind(system_bus_socket_path, system_bus_socket_path, true)?; + + Some(jail) + } + None => None, + }; + + Ok(VirtioDeviceStub { + dev: Box::new(devices::virtio::VideoDevice::new( + virtio::base_features(cfg.protected_vm), + typ, + backend, + Some(resource_bridge), + )), + jail, + }) +} + +#[cfg(any(feature = "video-decoder", feature = "video-encoder"))] +pub fn register_video_device( + backend: VideoBackendType, + devs: &mut Vec, + video_tube: Tube, + cfg: &Config, + typ: devices::virtio::VideoDeviceType, +) -> Result<()> { + devs.push(create_video_device(backend, cfg, typ, video_tube)?); + Ok(()) +} + +pub fn create_vhost_vsock_device(cfg: &Config, cid: u64) -> DeviceResult { + let features = virtio::base_features(cfg.protected_vm); + + let device_file = match cfg + .vhost_vsock_device + .as_ref() + .unwrap_or(&VhostVsockDeviceParameter::default()) + { + VhostVsockDeviceParameter::Fd(fd) => { + let fd = validate_raw_descriptor(*fd) + .context("failed to validate fd for virtual socker device")?; + // Safe because the `fd` is actually owned by this process and + // we have a unique handle to it. 
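+            // (`validate_raw_descriptor` above has already rejected obviously invalid
+            // descriptors.)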
+ unsafe { File::from_raw_fd(fd) } + } + VhostVsockDeviceParameter::Path(path) => OpenOptions::new() + .read(true) + .write(true) + .custom_flags(libc::O_CLOEXEC | libc::O_NONBLOCK) + .open(path) + .context("failed to open virtual socket device")?, + }; + + let dev = virtio::vhost::Vsock::new(device_file, features, cid) + .context("failed to set up virtual socket device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "vhost_vsock_device")?, + }) +} + +pub fn create_fs_device( + cfg: &Config, + uid_map: &str, + gid_map: &str, + src: &Path, + tag: &str, + fs_cfg: virtio::fs::passthrough::Config, + device_tube: Tube, +) -> DeviceResult { + let max_open_files = + base::get_max_open_files().context("failed to get max number of open files")?; + let j = if cfg.sandbox { + let seccomp_policy = cfg.seccomp_policy_dir.join("fs_device"); + let config = SandboxConfig { + limit_caps: false, + uid_map: Some(uid_map), + gid_map: Some(gid_map), + log_failures: cfg.seccomp_log_failures, + seccomp_policy: &seccomp_policy, + // We want bind mounts from the parent namespaces to propagate into the fs device's + // namespace. + remount_mode: Some(libc::MS_SLAVE), + }; + create_base_minijail(src, Some(max_open_files), Some(&config))? + } else { + create_base_minijail(src, Some(max_open_files), None)? + }; + + let features = virtio::base_features(cfg.protected_vm); + // TODO(chirantan): Use more than one worker once the kernel driver has been fixed to not panic + // when num_queues > 1. + let dev = virtio::fs::Fs::new(features, tag, 1, fs_cfg, device_tube) + .context("failed to create fs device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: Some(j), + }) +} + +pub fn create_9p_device( + cfg: &Config, + uid_map: &str, + gid_map: &str, + src: &Path, + tag: &str, + mut p9_cfg: p9::Config, +) -> DeviceResult { + let max_open_files = + base::get_max_open_files().context("failed to get max number of open files")?; + let (jail, root) = if cfg.sandbox { + let seccomp_policy = cfg.seccomp_policy_dir.join("9p_device"); + let config = SandboxConfig { + limit_caps: false, + uid_map: Some(uid_map), + gid_map: Some(gid_map), + log_failures: cfg.seccomp_log_failures, + seccomp_policy: &seccomp_policy, + // We want bind mounts from the parent namespaces to propagate into the 9p server's + // namespace. + remount_mode: Some(libc::MS_SLAVE), + }; + + let jail = create_base_minijail(src, Some(max_open_files), Some(&config))?; + + // The shared directory becomes the root of the device's file system. + let root = Path::new("/"); + (Some(jail), root) + } else { + // There's no mount namespace so we tell the server to treat the source directory as the + // root. 
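+        // Paths served over 9p are then resolved relative to `src` on the host.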
+ (None, src) + }; + + let features = virtio::base_features(cfg.protected_vm); + p9_cfg.root = root.into(); + let dev = virtio::P9::new(features, tag, p9_cfg).context("failed to create 9p device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail, + }) +} + +pub fn create_pmem_device( + cfg: &Config, + vm: &mut impl Vm, + resources: &mut SystemAllocator, + disk: &DiskOption, + index: usize, + pmem_device_tube: Tube, +) -> DeviceResult { + let fd = open_file(&disk.path, disk.read_only, false /*O_DIRECT*/) + .with_context(|| format!("failed to load disk image {}", disk.path.display()))?; + + let (disk_size, arena_size) = { + let metadata = std::fs::metadata(&disk.path).with_context(|| { + format!("failed to get disk image {} metadata", disk.path.display()) + })?; + let disk_len = metadata.len(); + // Linux requires pmem region sizes to be 2 MiB aligned. Linux will fill any partial page + // at the end of an mmap'd file and won't write back beyond the actual file length, but if + // we just align the size of the file to 2 MiB then access beyond the last page of the + // mapped file will generate SIGBUS. So use a memory mapping arena that will provide + // padding up to 2 MiB. + let alignment = 2 * 1024 * 1024; + let align_adjust = if disk_len % alignment != 0 { + alignment - (disk_len % alignment) + } else { + 0 + }; + ( + disk_len, + disk_len + .checked_add(align_adjust) + .ok_or_else(|| anyhow!("pmem device image too big"))?, + ) + }; + + let protection = { + if disk.read_only { + Protection::read() + } else { + Protection::read_write() + } + }; + + let arena = { + // Conversion from u64 to usize may fail on 32bit system. + let arena_size = usize::try_from(arena_size).context("pmem device image too big")?; + let disk_size = usize::try_from(disk_size).context("pmem device image too big")?; + + let mut arena = + MemoryMappingArena::new(arena_size).context("failed to reserve pmem memory")?; + arena + .add_fd_offset_protection(0, disk_size, &fd, 0, protection) + .context("failed to reserve pmem memory")?; + + // If the disk is not a multiple of the page size, the OS will fill the remaining part + // of the page with zeroes. However, the anonymous mapping added below must start on a + // page boundary, so round up the size before calculating the offset of the anon region. + let disk_size = round_up_to_page_size(disk_size); + + if arena_size > disk_size { + // Add an anonymous region with the same protection as the disk mapping if the arena + // size was aligned. + arena + .add_anon_protection(disk_size, arena_size - disk_size, protection) + .context("failed to reserve pmem padding")?; + } + arena + }; + + let mapping_address = resources + .mmio_allocator(MmioType::High) + .reverse_allocate_with_align( + arena_size, + Alloc::PmemDevice(index), + format!("pmem_disk_image_{}", index), + // Linux kernel requires pmem namespaces to be 128 MiB aligned. 
+ 128 * 1024 * 1024, /* 128 MiB */ + ) + .context("failed to allocate memory for pmem device")?; + + let slot = vm + .add_memory_region( + GuestAddress(mapping_address), + Box::new(arena), + /* read_only = */ disk.read_only, + /* log_dirty_pages = */ false, + ) + .context("failed to add pmem device memory")?; + + let dev = virtio::Pmem::new( + virtio::base_features(cfg.protected_vm), + fd, + GuestAddress(mapping_address), + slot, + arena_size, + Some(pmem_device_tube), + ) + .context("failed to create pmem device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev) as Box, + jail: simple_jail(cfg, "pmem_device")?, + }) +} + +pub fn create_iommu_device( + cfg: &Config, + phys_max_addr: u64, + endpoints: BTreeMap>>, +) -> DeviceResult { + let dev = virtio::Iommu::new( + virtio::base_features(cfg.protected_vm), + endpoints, + phys_max_addr, + ) + .context("failed to create IOMMU device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "iommu_device")?, + }) +} + +pub fn create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult { + let mut keep_rds = Vec::new(); + let evt = Event::new().context("failed to create event")?; + let dev = param + .create_serial_device::(cfg.protected_vm, &evt, &mut keep_rds) + .context("failed to create console device")?; + + let jail = match simple_jail(cfg, "serial")? { + Some(mut jail) => { + // Create a tmpfs in the device's root directory so that we can bind mount the + // log socket directory into it. + // The size=67108864 is size=64*1024*1024 or size=64MB. + jail.mount_with_data( + Path::new("none"), + Path::new("/"), + "tmpfs", + (libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_NOSUID) as usize, + "size=67108864", + )?; + add_current_user_to_jail(&mut jail)?; + let res = param.add_bind_mounts(&mut jail); + if res.is_err() { + error!("failed to add bind mounts for console device"); + } + Some(jail) + } + None => None, + }; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail, // TODO(dverkamp): use a separate policy for console? 
+ }) +} + +#[cfg(feature = "audio")] +pub fn create_sound_device(path: &Path, cfg: &Config) -> DeviceResult { + let dev = virtio::new_sound(path, virtio::base_features(cfg.protected_vm)) + .context("failed to create sound device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "vios_audio_device")?, + }) +} + +pub fn create_vfio_device( + cfg: &Config, + vm: &impl Vm, + resources: &mut SystemAllocator, + control_tubes: &mut Vec, + vfio_path: &Path, + bus_num: Option, + iommu_endpoints: &mut BTreeMap>>, + coiommu_endpoints: Option<&mut Vec>, + iommu_dev: IommuDevType, +) -> DeviceResult<(Box, Option)> { + let vfio_container = VfioCommonSetup::vfio_get_container(iommu_dev, Some(vfio_path)) + .context("failed to get vfio container")?; + + // create MSI, MSI-X, and Mem request sockets for each vfio device + let (vfio_host_tube_msi, vfio_device_tube_msi) = + Tube::pair().context("failed to create tube")?; + control_tubes.push(TaggedControlTube::VmIrq(vfio_host_tube_msi)); + + let (vfio_host_tube_msix, vfio_device_tube_msix) = + Tube::pair().context("failed to create tube")?; + control_tubes.push(TaggedControlTube::VmIrq(vfio_host_tube_msix)); + + let (vfio_host_tube_mem, vfio_device_tube_mem) = + Tube::pair().context("failed to create tube")?; + control_tubes.push(TaggedControlTube::VmMemory(vfio_host_tube_mem)); + + let hotplug = bus_num.is_some(); + let vfio_device_tube_vm = if hotplug { + let (vfio_host_tube_vm, device_tube_vm) = Tube::pair().context("failed to create tube")?; + control_tubes.push(TaggedControlTube::Vm(vfio_host_tube_vm)); + Some(device_tube_vm) + } else { + None + }; + + let vfio_device = VfioDevice::new_passthrough( + &vfio_path, + vm, + vfio_container.clone(), + iommu_dev != IommuDevType::NoIommu, + ) + .context("failed to create vfio device")?; + let mut vfio_pci_device = Box::new(VfioPciDevice::new( + vfio_device, + bus_num, + vfio_device_tube_msi, + vfio_device_tube_msix, + vfio_device_tube_mem, + vfio_device_tube_vm, + )); + // early reservation for pass-through PCI devices. 
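+    // The address allocated here is used below to register the device with virtio-iommu or
+    // CoIommu when either of those is enabled.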
+ let endpoint_addr = vfio_pci_device + .allocate_address(resources) + .context("failed to allocate resources early for vfio pci dev")?; + + match iommu_dev { + IommuDevType::NoIommu => {} + IommuDevType::VirtioIommu => { + iommu_endpoints.insert(endpoint_addr.to_u32(), vfio_container); + } + IommuDevType::CoIommu => { + if let Some(endpoints) = coiommu_endpoints { + endpoints.push(endpoint_addr.to_u32() as u16); + } else { + bail!("Missed coiommu_endpoints vector to store the endpoint addr"); + } + } + } + + if hotplug { + Ok((vfio_pci_device, None)) + } else { + Ok((vfio_pci_device, simple_jail(cfg, "vfio_device")?)) + } +} + +pub fn create_vfio_platform_device( + cfg: &Config, + vm: &impl Vm, + _resources: &mut SystemAllocator, + control_tubes: &mut Vec, + vfio_path: &Path, + _endpoints: &mut BTreeMap>>, + iommu_dev: IommuDevType, +) -> DeviceResult<(VfioPlatformDevice, Option)> { + let vfio_container = VfioCommonSetup::vfio_get_container(iommu_dev, Some(vfio_path)) + .context("Failed to create vfio device")?; + + let (vfio_host_tube_mem, vfio_device_tube_mem) = + Tube::pair().context("failed to create tube")?; + control_tubes.push(TaggedControlTube::VmMemory(vfio_host_tube_mem)); + + let vfio_device = VfioDevice::new_passthrough( + &vfio_path, + vm, + vfio_container, + iommu_dev != IommuDevType::NoIommu, + ) + .context("Failed to create vfio device")?; + let vfio_plat_dev = VfioPlatformDevice::new(vfio_device, vfio_device_tube_mem); + + Ok((vfio_plat_dev, simple_jail(cfg, "vfio_platform_device")?)) +} diff --git a/src/linux/gpu.rs b/src/linux/gpu.rs new file mode 100644 index 0000000000..38cfad2db2 --- /dev/null +++ b/src/linux/gpu.rs @@ -0,0 +1,331 @@ +// Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +//! GPU related things +//! depends on "gpu" feature +use std::collections::HashSet; +use std::env; + +use devices::virtio::vhost::user::vmm::Gpu as VhostUserGpu; +use devices::virtio::GpuRenderServerParameters; + +use super::*; + +pub fn create_vhost_user_gpu_device( + cfg: &Config, + opt: &VhostUserOption, + host_tube: Tube, + device_tube: Tube, +) -> DeviceResult { + // The crosvm gpu device expects us to connect the tube before it will accept a vhost-user + // connection. + let dev = VhostUserGpu::new( + virtio::base_features(cfg.protected_vm), + &opt.socket, + host_tube, + device_tube, + ) + .context("failed to set up vhost-user gpu device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. + jail: None, + }) +} + +pub fn gpu_jail(cfg: &Config, policy: &str) -> Result> { + match simple_jail(cfg, policy)? { + Some(mut jail) => { + // Create a tmpfs in the device's root directory so that we can bind mount the + // dri directory into it. The size=67108864 is size=64*1024*1024 or size=64MB. + jail.mount_with_data( + Path::new("none"), + Path::new("/"), + "tmpfs", + (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, + "size=67108864", + )?; + + // Device nodes required for DRM. 
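+            // These sysfs paths are used by libdrm/minigbm to look up information about the
+            // GPU device nodes bound above.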
+ let sys_dev_char_path = Path::new("/sys/dev/char"); + jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?; + let sys_devices_path = Path::new("/sys/devices"); + jail.mount_bind(sys_devices_path, sys_devices_path, false)?; + + let drm_dri_path = Path::new("/dev/dri"); + if drm_dri_path.exists() { + jail.mount_bind(drm_dri_path, drm_dri_path, false)?; + } + + // If the ARM specific devices exist on the host, bind mount them in. + let mali0_path = Path::new("/dev/mali0"); + if mali0_path.exists() { + jail.mount_bind(mali0_path, mali0_path, true)?; + } + + let pvr_sync_path = Path::new("/dev/pvr_sync"); + if pvr_sync_path.exists() { + jail.mount_bind(pvr_sync_path, pvr_sync_path, true)?; + } + + // If the udmabuf driver exists on the host, bind mount it in. + let udmabuf_path = Path::new("/dev/udmabuf"); + if udmabuf_path.exists() { + jail.mount_bind(udmabuf_path, udmabuf_path, true)?; + } + + // Libraries that are required when mesa drivers are dynamically loaded. + jail_mount_bind_if_exists( + &mut jail, + &[ + "/usr/lib", + "/usr/lib64", + "/lib", + "/lib64", + "/usr/share/drirc.d", + "/usr/share/glvnd", + "/usr/share/vulkan", + ], + )?; + + // pvr driver requires read access to /proc/self/task/*/comm. + let proc_path = Path::new("/proc"); + jail.mount( + proc_path, + proc_path, + "proc", + (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RDONLY) as usize, + )?; + + // To enable perfetto tracing, we need to give access to the perfetto service IPC + // endpoints. + let perfetto_path = Path::new("/run/perfetto"); + if perfetto_path.exists() { + jail.mount_bind(perfetto_path, perfetto_path, true)?; + } + + Ok(Some(jail)) + } + None => Ok(None), + } +} + +pub struct GpuCacheInfo<'a> { + directory: Option<&'a str>, + environment: Vec<(&'a str, &'a str)>, +} + +pub fn get_gpu_cache_info<'a>( + cache_dir: Option<&'a String>, + cache_size: Option<&'a String>, + sandbox: bool, +) -> GpuCacheInfo<'a> { + let mut dir = None; + let mut env = Vec::new(); + + if let Some(cache_dir) = cache_dir { + if !Path::new(cache_dir).exists() { + warn!("shader caching dir {} does not exist", cache_dir); + env.push(("MESA_GLSL_CACHE_DISABLE", "true")); + } else if cfg!(any(target_arch = "arm", target_arch = "aarch64")) && sandbox { + warn!("shader caching not yet supported on ARM with sandbox enabled"); + env.push(("MESA_GLSL_CACHE_DISABLE", "true")); + } else { + dir = Some(cache_dir.as_str()); + + env.push(("MESA_GLSL_CACHE_DISABLE", "false")); + env.push(("MESA_GLSL_CACHE_DIR", cache_dir.as_str())); + if let Some(cache_size) = cache_size { + env.push(("MESA_GLSL_CACHE_MAX_SIZE", cache_size.as_str())); + } + } + } + + GpuCacheInfo { + directory: dir, + environment: env, + } +} + +pub fn create_gpu_device( + cfg: &Config, + exit_evt: &Event, + gpu_device_tube: Tube, + resource_bridges: Vec, + wayland_socket_path: Option<&PathBuf>, + x_display: Option, + render_server_fd: Option, + event_devices: Vec, + map_request: Arc>>, +) -> DeviceResult { + let mut display_backends = vec![ + virtio::DisplayBackend::X(x_display), + virtio::DisplayBackend::Stub, + ]; + + let wayland_socket_dirs = cfg + .wayland_socket_paths + .iter() + .map(|(_name, path)| path.parent()) + .collect::>>() + .ok_or_else(|| anyhow!("wayland socket path has no parent or file name"))?; + + if let Some(socket_path) = wayland_socket_path { + display_backends.insert( + 0, + virtio::DisplayBackend::Wayland(Some(socket_path.to_owned())), + ); + } + + let dev = virtio::Gpu::new( + exit_evt.try_clone().context("failed to clone 
event")?, + Some(gpu_device_tube), + resource_bridges, + display_backends, + cfg.gpu_parameters.as_ref().unwrap(), + render_server_fd, + event_devices, + map_request, + cfg.sandbox, + virtio::base_features(cfg.protected_vm), + cfg.wayland_socket_paths.clone(), + ); + + let jail = match gpu_jail(cfg, "gpu_device")? { + Some(mut jail) => { + // Prepare GPU shader disk cache directory. + let (cache_dir, cache_size) = cfg + .gpu_parameters + .as_ref() + .map(|params| (params.cache_path.as_ref(), params.cache_size.as_ref())) + .unwrap(); + let cache_info = get_gpu_cache_info(cache_dir, cache_size, cfg.sandbox); + + if let Some(dir) = cache_info.directory { + jail.mount_bind(dir, dir, true)?; + } + for (key, val) in cache_info.environment { + env::set_var(key, val); + } + + // Bind mount the wayland socket's directory into jail's root. This is necessary since + // each new wayland context must open() the socket. If the wayland socket is ever + // destroyed and remade in the same host directory, new connections will be possible + // without restarting the wayland device. + for dir in &wayland_socket_dirs { + jail.mount_bind(dir, dir, true)?; + } + + add_current_user_to_jail(&mut jail)?; + + Some(jail) + } + None => None, + }; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail, + }) +} + +pub fn get_gpu_render_server_environment(cache_info: &GpuCacheInfo) -> Result> { + let mut env = Vec::new(); + + let mut cache_env_keys = HashSet::with_capacity(cache_info.environment.len()); + for (key, val) in cache_info.environment.iter() { + env.push(format!("{}={}", key, val)); + cache_env_keys.insert(*key); + } + + for (key_os, val_os) in env::vars_os() { + // minijail should accept OsStr rather than str... + let into_string_err = |_| anyhow!("invalid environment key/val"); + let key = key_os.into_string().map_err(into_string_err)?; + let val = val_os.into_string().map_err(into_string_err)?; + + if !cache_env_keys.contains(key.as_str()) { + env.push(format!("{}={}", key, val)); + } + } + + Ok(env) +} + +pub struct ScopedMinijail(pub Minijail); + +impl Drop for ScopedMinijail { + fn drop(&mut self) { + let _ = self.0.kill(); + } +} + +pub fn start_gpu_render_server( + cfg: &Config, + render_server_parameters: &GpuRenderServerParameters, +) -> Result<(Minijail, SafeDescriptor)> { + let (server_socket, client_socket) = + UnixSeqpacket::pair().context("failed to create render server socket")?; + + let mut env = None; + let jail = match gpu_jail(cfg, "gpu_render_server")? { + Some(mut jail) => { + let cache_info = get_gpu_cache_info( + render_server_parameters.cache_path.as_ref(), + render_server_parameters.cache_size.as_ref(), + cfg.sandbox, + ); + + if let Some(dir) = cache_info.directory { + jail.mount_bind(dir, dir, true)?; + } + + if !cache_info.environment.is_empty() { + env = Some(get_gpu_render_server_environment(&cache_info)?); + } + + // bind mount /dev/log for syslog + let log_path = Path::new("/dev/log"); + if log_path.exists() { + jail.mount_bind(log_path, log_path, true)?; + } + + // Run as root in the jail to keep capabilities after execve, which is needed for + // mounting to work. All capabilities will be dropped afterwards. 
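+            // `add_current_user_as_root_to_jail` maps the crosvm user to uid 0 inside the jail's
+            // user namespace.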
+ add_current_user_as_root_to_jail(&mut jail)?; + + jail + } + None => Minijail::new().context("failed to create jail")?, + }; + + let inheritable_fds = [ + server_socket.as_raw_descriptor(), + libc::STDOUT_FILENO, + libc::STDERR_FILENO, + ]; + + let cmd = &render_server_parameters.path; + let cmd_str = cmd + .to_str() + .ok_or_else(|| anyhow!("invalid render server path"))?; + let fd_str = server_socket.as_raw_descriptor().to_string(); + let args = [cmd_str, "--socket-fd", &fd_str]; + + let mut envp: Option> = None; + if let Some(ref env) = env { + envp = Some(env.iter().map(AsRef::as_ref).collect()); + } + + jail.run_command(minijail::Command::new_for_path( + cmd, + &inheritable_fds, + &args, + envp.as_deref(), + )?) + .context("failed to start gpu render server")?; + + Ok((jail, SafeDescriptor::from(client_socket))) +} diff --git a/src/linux/jail_helpers.rs b/src/linux/jail_helpers.rs new file mode 100644 index 0000000000..f74fcd297f --- /dev/null +++ b/src/linux/jail_helpers.rs @@ -0,0 +1,188 @@ +// Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +use std::path::{Path, PathBuf}; +use std::str; + +use libc::{self, c_ulong, gid_t, uid_t}; + +use anyhow::{bail, Context, Result}; +use base::*; +use minijail::{self, Minijail}; + +use crate::Config; + +pub(super) struct SandboxConfig<'a> { + pub(super) limit_caps: bool, + pub(super) log_failures: bool, + pub(super) seccomp_policy: &'a Path, + pub(super) uid_map: Option<&'a str>, + pub(super) gid_map: Option<&'a str>, + pub(super) remount_mode: Option, +} + +pub(super) fn create_base_minijail( + root: &Path, + r_limit: Option, + config: Option<&SandboxConfig>, +) -> Result { + // All child jails run in a new user namespace without any users mapped, + // they run as nobody unless otherwise configured. + let mut j = Minijail::new().context("failed to jail device")?; + + if let Some(config) = config { + j.namespace_pids(); + j.namespace_user(); + j.namespace_user_disable_setgroups(); + if config.limit_caps { + // Don't need any capabilities. + j.use_caps(0); + } + if let Some(uid_map) = config.uid_map { + j.uidmap(uid_map).context("error setting UID map")?; + } + if let Some(gid_map) = config.gid_map { + j.gidmap(gid_map).context("error setting GID map")?; + } + // Run in a new mount namespace. + j.namespace_vfs(); + + // Run in an empty network namespace. + j.namespace_net(); + + // Don't allow the device to gain new privileges. + j.no_new_privs(); + + // By default we'll prioritize using the pre-compiled .bpf over the .policy + // file (the .bpf is expected to be compiled using "trap" as the failure + // behavior instead of the default "kill" behavior). + // Refer to the code comment for the "seccomp-log-failures" + // command-line parameter for an explanation about why the |log_failures| + // flag forces the use of .policy files (and the build-time alternative to + // this run-time flag). + let bpf_policy_file = config.seccomp_policy.with_extension("bpf"); + if bpf_policy_file.exists() && !config.log_failures { + j.parse_seccomp_program(&bpf_policy_file) + .context("failed to parse precompiled seccomp policy")?; + } else { + // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, + // which will correctly kill the entire device process if a worker + // thread commits a seccomp violation. 
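+            // This branch falls back to compiling the textual .policy file at runtime below.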
+ j.set_seccomp_filter_tsync(); + if config.log_failures { + j.log_seccomp_filter_failures(); + } + j.parse_seccomp_filters(&config.seccomp_policy.with_extension("policy")) + .context("failed to parse seccomp policy")?; + } + j.use_seccomp_filter(); + // Don't do init setup. + j.run_as_init(); + // Set up requested remount mode instead of default MS_PRIVATE. + if let Some(mode) = config.remount_mode { + j.set_remount_mode(mode); + } + } + + // Only pivot_root if we are not re-using the current root directory. + if root != Path::new("/") { + // It's safe to call `namespace_vfs` multiple times. + j.namespace_vfs(); + j.enter_pivot_root(root) + .context("failed to pivot root device")?; + } + + // Most devices don't need to open many fds. + let limit = if let Some(r) = r_limit { r } else { 1024u64 }; + j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit) + .context("error setting max open files")?; + + Ok(j) +} + +pub(super) fn simple_jail(cfg: &Config, policy: &str) -> Result> { + if cfg.sandbox { + let pivot_root: &str = option_env!("DEFAULT_PIVOT_ROOT").unwrap_or("/var/empty"); + // A directory for a jailed device's pivot root. + let root_path = Path::new(pivot_root); + if !root_path.exists() { + bail!("{} doesn't exist, can't jail devices", pivot_root); + } + let policy_path: PathBuf = cfg.seccomp_policy_dir.join(policy); + let config = SandboxConfig { + limit_caps: true, + log_failures: cfg.seccomp_log_failures, + seccomp_policy: &policy_path, + uid_map: None, + gid_map: None, + remount_mode: None, + }; + Ok(Some(create_base_minijail(root_path, None, Some(&config))?)) + } else { + Ok(None) + } +} + +/// Mirror-mount all the directories in `dirs` into `jail` on a best-effort basis. +/// +/// This function will not return an error if any of the directories in `dirs` is missing. +#[cfg(any(feature = "gpu", feature = "video-decoder", feature = "video-encoder"))] +pub(super) fn jail_mount_bind_if_exists>( + jail: &mut Minijail, + dirs: &[P], +) -> Result<()> { + for dir in dirs { + let dir_path = Path::new(dir); + if dir_path.exists() { + jail.mount_bind(dir_path, dir_path, false)?; + } + } + + Ok(()) +} + +#[derive(Copy, Clone)] +#[cfg_attr(not(feature = "tpm"), allow(dead_code))] +pub(super) struct Ids { + pub(super) uid: uid_t, + pub(super) gid: gid_t, +} + +pub(super) fn add_current_user_as_root_to_jail(jail: &mut Minijail) -> Result { + let crosvm_uid = geteuid(); + let crosvm_gid = getegid(); + jail.uidmap(&format!("0 {0} 1", crosvm_uid)) + .context("error setting UID map")?; + jail.gidmap(&format!("0 {0} 1", crosvm_gid)) + .context("error setting GID map")?; + + Ok(Ids { + uid: crosvm_uid, + gid: crosvm_gid, + }) +} + +/// Set the uid/gid for the jailed process and give a basic id map. This is +/// required for bind mounts to work. +pub(super) fn add_current_user_to_jail(jail: &mut Minijail) -> Result { + let crosvm_uid = geteuid(); + let crosvm_gid = getegid(); + + jail.uidmap(&format!("{0} {0} 1", crosvm_uid)) + .context("error setting UID map")?; + jail.gidmap(&format!("{0} {0} 1", crosvm_gid)) + .context("error setting GID map")?; + + if crosvm_uid != 0 { + jail.change_uid(crosvm_uid); + } + if crosvm_gid != 0 { + jail.change_gid(crosvm_gid); + } + + Ok(Ids { + uid: crosvm_uid, + gid: crosvm_gid, + }) +} diff --git a/src/linux/mod.rs b/src/linux/mod.rs index 70506e53df..fdda4a42f1 100644 --- a/src/linux/mod.rs +++ b/src/linux/mod.rs @@ -3,70 +3,46 @@ // found in the LICENSE file. 
use std::cmp::{max, Reverse}; -use std::collections::{BTreeMap, HashSet}; -use std::convert::{TryFrom, TryInto}; -#[cfg(feature = "gpu")] -use std::env; +use std::collections::BTreeMap; +use std::convert::TryInto; use std::fs::{File, OpenOptions}; use std::io::prelude::*; use std::io::stdin; use std::iter; use std::mem; -use std::net::Ipv4Addr; -use std::os::unix::net::UnixListener; -use std::os::unix::{io::FromRawFd, net::UnixStream, prelude::OpenOptionsExt}; +use std::os::unix::{net::UnixStream, prelude::OpenOptionsExt}; use std::path::{Path, PathBuf}; use std::str; use std::sync::{mpsc, Arc, Barrier}; use std::time::Duration; use std::process; +#[cfg(all(target_arch = "x86_64", feature = "gdb"))] use std::thread; -use std::thread::JoinHandle; -use libc::{self, c_int, c_ulong, gid_t, uid_t}; +use libc; use acpi_tables::sdt::SDT; use anyhow::{anyhow, bail, Context, Result}; use base::net::{UnixSeqpacket, UnixSeqpacketListener, UnlinkUnixSeqpacketListener}; use base::*; -use devices::serial_device::{SerialHardware, SerialParameters}; +use devices::serial_device::SerialHardware; use devices::vfio::{VfioCommonSetup, VfioCommonTrait}; -#[cfg(feature = "audio_cras")] -use devices::virtio::snd::cras_backend::Parameters as CrasSndParameters; -use devices::virtio::vhost::user::proxy::VirtioVhostUser; -#[cfg(feature = "audio")] -use devices::virtio::vhost::user::vmm::Snd as VhostUserSnd; -use devices::virtio::vhost::user::vmm::{ - Block as VhostUserBlock, Console as VhostUserConsole, Fs as VhostUserFs, - Mac80211Hwsim as VhostUserMac80211Hwsim, Net as VhostUserNet, Vsock as VhostUserVsock, - Wl as VhostUserWl, -}; -#[cfg(any(feature = "video-decoder", feature = "video-encoder"))] -use devices::virtio::VideoBackendType; -use devices::virtio::{self, Console, VirtioDevice}; -#[cfg(feature = "gpu")] -use devices::virtio::{ - gpu::{GpuRenderServerParameters, DEFAULT_DISPLAY_HEIGHT, DEFAULT_DISPLAY_WIDTH}, - vhost::user::vmm::Gpu as VhostUserGpu, - EventDevice, -}; +use devices::virtio::{self, EventDevice}; #[cfg(feature = "audio")] use devices::Ac97Dev; use devices::{ - self, BusDeviceObj, HostHotPlugKey, HotPlugBus, IrqChip, IrqEventIndex, KvmKernelIrqChip, - PciAddress, PciBridge, PciDevice, PcieRootPort, StubPciDevice, VcpuRunState, VfioContainer, - VfioDevice, VfioPciDevice, VfioPlatformDevice, VirtioPciDevice, + self, BusDeviceObj, HostHotPlugKey, HotPlugBus, IrqEventIndex, KvmKernelIrqChip, PciAddress, + PciBridge, PciDevice, PcieRootPort, StubPciDevice, VfioContainer, VirtioPciDevice, }; use devices::{CoIommuDev, IommuDevType}; #[cfg(feature = "usb")] use devices::{HostBackendDeviceProvider, XhciController}; use hypervisor::kvm::{Kvm, KvmVcpu, KvmVm}; -use hypervisor::{HypervisorCap, ProtectionType, Vcpu, VcpuExit, VcpuRunHandle, Vm, VmCap}; +use hypervisor::{HypervisorCap, ProtectionType, Vm, VmCap}; use minijail::{self, Minijail}; -use net_util::{MacAddress, Tap}; -use resources::{Alloc, MmioType, SystemAllocator}; +use resources::{Alloc, SystemAllocator}; use rutabaga_gfx::RutabagaGralloc; use sync::Mutex; use vm_control::*; @@ -74,10 +50,7 @@ use vm_memory::{GuestAddress, GuestMemory, MemoryPolicy}; #[cfg(all(target_arch = "x86_64", feature = "gdb"))] use crate::gdb::{gdb_thread, GdbStub}; -use crate::{ - Config, DiskOption, Executable, SharedDir, SharedDirKind, TouchDeviceOption, VfioType, - VhostUserFsOption, VhostUserOption, VhostUserWlOption, VhostVsockDeviceParameter, -}; +use crate::{Config, Executable, SharedDir, SharedDirKind, VfioType, VhostUserOption}; use arch::{ self, LinuxArch, 
RunnableLinuxVm, VcpuAffinity, VirtioDeviceStub, VmComponents, VmImage, }; @@ -95,1418 +68,16 @@ use { x86_64::X8664arch as Arch, }; -enum TaggedControlTube { - Fs(Tube), - Vm(Tube), - VmMemory(Tube), - VmIrq(Tube), - VmMsync(Tube), -} - -impl AsRef for TaggedControlTube { - fn as_ref(&self) -> &Tube { - use self::TaggedControlTube::*; - match &self { - Fs(tube) | Vm(tube) | VmMemory(tube) | VmIrq(tube) | VmMsync(tube) => tube, - } - } -} - -impl AsRawDescriptor for TaggedControlTube { - fn as_raw_descriptor(&self) -> RawDescriptor { - self.as_ref().as_raw_descriptor() - } -} - -struct SandboxConfig<'a> { - limit_caps: bool, - log_failures: bool, - seccomp_policy: &'a Path, - uid_map: Option<&'a str>, - gid_map: Option<&'a str>, - remount_mode: Option, -} - -fn create_base_minijail( - root: &Path, - r_limit: Option, - config: Option<&SandboxConfig>, -) -> Result { - // All child jails run in a new user namespace without any users mapped, - // they run as nobody unless otherwise configured. - let mut j = Minijail::new().context("failed to jail device")?; - - if let Some(config) = config { - j.namespace_pids(); - j.namespace_user(); - j.namespace_user_disable_setgroups(); - if config.limit_caps { - // Don't need any capabilities. - j.use_caps(0); - } - if let Some(uid_map) = config.uid_map { - j.uidmap(uid_map).context("error setting UID map")?; - } - if let Some(gid_map) = config.gid_map { - j.gidmap(gid_map).context("error setting GID map")?; - } - // Run in a new mount namespace. - j.namespace_vfs(); - - // Run in an empty network namespace. - j.namespace_net(); - - // Don't allow the device to gain new privileges. - j.no_new_privs(); - - // By default we'll prioritize using the pre-compiled .bpf over the .policy - // file (the .bpf is expected to be compiled using "trap" as the failure - // behavior instead of the default "kill" behavior). - // Refer to the code comment for the "seccomp-log-failures" - // command-line parameter for an explanation about why the |log_failures| - // flag forces the use of .policy files (and the build-time alternative to - // this run-time flag). - let bpf_policy_file = config.seccomp_policy.with_extension("bpf"); - if bpf_policy_file.exists() && !config.log_failures { - j.parse_seccomp_program(&bpf_policy_file) - .context("failed to parse precompiled seccomp policy")?; - } else { - // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, - // which will correctly kill the entire device process if a worker - // thread commits a seccomp violation. - j.set_seccomp_filter_tsync(); - if config.log_failures { - j.log_seccomp_filter_failures(); - } - j.parse_seccomp_filters(&config.seccomp_policy.with_extension("policy")) - .context("failed to parse seccomp policy")?; - } - j.use_seccomp_filter(); - // Don't do init setup. - j.run_as_init(); - // Set up requested remount mode instead of default MS_PRIVATE. - if let Some(mode) = config.remount_mode { - j.set_remount_mode(mode); - } - } - - // Only pivot_root if we are not re-using the current root directory. - if root != Path::new("/") { - // It's safe to call `namespace_vfs` multiple times. - j.namespace_vfs(); - j.enter_pivot_root(root) - .context("failed to pivot root device")?; - } - - // Most devices don't need to open many fds. 
- let limit = if let Some(r) = r_limit { r } else { 1024u64 }; - j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit) - .context("error setting max open files")?; - - Ok(j) -} - -fn simple_jail(cfg: &Config, policy: &str) -> Result> { - if cfg.sandbox { - let pivot_root: &str = option_env!("DEFAULT_PIVOT_ROOT").unwrap_or("/var/empty"); - // A directory for a jailed device's pivot root. - let root_path = Path::new(pivot_root); - if !root_path.exists() { - bail!("{} doesn't exist, can't jail devices", pivot_root); - } - let policy_path: PathBuf = cfg.seccomp_policy_dir.join(policy); - let config = SandboxConfig { - limit_caps: true, - log_failures: cfg.seccomp_log_failures, - seccomp_policy: &policy_path, - uid_map: None, - gid_map: None, - remount_mode: None, - }; - Ok(Some(create_base_minijail(root_path, None, Some(&config))?)) - } else { - Ok(None) - } -} - -type DeviceResult = Result; - -fn create_block_device(cfg: &Config, disk: &DiskOption, disk_device_tube: Tube) -> DeviceResult { - let raw_image: File = open_file(&disk.path, disk.read_only, disk.o_direct) - .with_context(|| format!("failed to load disk image {}", disk.path.display()))?; - // Lock the disk image to prevent other crosvm instances from using it. - let lock_op = if disk.read_only { - FlockOperation::LockShared - } else { - FlockOperation::LockExclusive - }; - flock(&raw_image, lock_op, true).context("failed to lock disk image")?; - - info!("Trying to attach block device: {}", disk.path.display()); - let dev = if disk::async_ok(&raw_image).context("failed to check disk async_ok")? { - let async_file = disk::create_async_disk_file(raw_image) - .context("failed to create async virtual disk")?; - Box::new( - virtio::BlockAsync::new( - virtio::base_features(cfg.protected_vm), - async_file, - disk.read_only, - disk.sparse, - disk.block_size, - disk.id, - Some(disk_device_tube), - ) - .context("failed to create block device")?, - ) as Box - } else { - let disk_file = disk::create_disk_file(raw_image, disk::MAX_NESTING_DEPTH) - .context("failed to create virtual disk")?; - Box::new( - virtio::Block::new( - virtio::base_features(cfg.protected_vm), - disk_file, - disk.read_only, - disk.sparse, - disk.block_size, - disk.id, - Some(disk_device_tube), - ) - .context("failed to create block device")?, - ) as Box - }; - - Ok(VirtioDeviceStub { - dev, - jail: simple_jail(cfg, "block_device")?, - }) -} - -fn create_vhost_user_block_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { - let dev = VhostUserBlock::new(virtio::base_features(cfg.protected_vm), &opt.socket) - .context("failed to set up vhost-user block device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. - jail: None, - }) -} - -fn create_vhost_user_console_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { - let dev = VhostUserConsole::new(virtio::base_features(cfg.protected_vm), &opt.socket) - .context("failed to set up vhost-user console device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. 
- jail: None, - }) -} - -fn create_vhost_user_fs_device(cfg: &Config, option: &VhostUserFsOption) -> DeviceResult { - let dev = VhostUserFs::new( - virtio::base_features(cfg.protected_vm), - &option.socket, - &option.tag, - ) - .context("failed to set up vhost-user fs device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. - jail: None, - }) -} - -fn create_vhost_user_mac80211_hwsim_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { - let dev = VhostUserMac80211Hwsim::new(virtio::base_features(cfg.protected_vm), &opt.socket) - .context("failed to set up vhost-user mac80211_hwsim device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. - jail: None, - }) -} - -#[cfg(feature = "audio")] -fn create_vhost_user_snd_device(cfg: &Config, option: &VhostUserOption) -> DeviceResult { - let dev = VhostUserSnd::new(virtio::base_features(cfg.protected_vm), &option.socket) - .context("failed to set up vhost-user snd device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. - jail: None, - }) -} - -fn create_vvu_proxy_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { - let listener = UnixListener::bind(&opt.socket).map_err(|e| { - error!("failed to bind listener for vvu proxy device: {}", e); - e - })?; - - let dev = VirtioVhostUser::new(virtio::base_features(cfg.protected_vm), listener) - .context("failed to create VVU proxy device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "vvu_proxy_device")?, - }) -} - -fn create_rng_device(cfg: &Config) -> DeviceResult { - let dev = virtio::Rng::new(virtio::base_features(cfg.protected_vm)) - .context("failed to set up rng")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "rng_device")?, - }) -} - -#[cfg(feature = "audio_cras")] -fn create_cras_snd_device(cfg: &Config, cras_snd: CrasSndParameters) -> DeviceResult { - let dev = virtio::snd::cras_backend::VirtioSndCras::new( - virtio::base_features(cfg.protected_vm), - cras_snd, - ) - .context("failed to create cras sound device")?; - - let jail = match simple_jail(&cfg, "cras_snd_device")? { - Some(mut jail) => { - // Create a tmpfs in the device's root directory for cras_snd_device. - // The size is 20*1024, or 20 KB. - jail.mount_with_data( - Path::new("none"), - Path::new("/"), - "tmpfs", - (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, - "size=20480", - )?; - - let run_cras_path = Path::new("/run/cras"); - jail.mount_bind(run_cras_path, run_cras_path, true)?; - - add_current_user_to_jail(&mut jail)?; - - Some(jail) - } - None => None, - }; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail, - }) -} - -#[cfg(feature = "tpm")] -fn create_tpm_device(cfg: &Config) -> DeviceResult { - use std::ffi::CString; - use std::fs; - - let tpm_storage: PathBuf; - let mut tpm_jail = simple_jail(cfg, "tpm_device")?; - - match &mut tpm_jail { - Some(jail) => { - // Create a tmpfs in the device's root directory for tpm - // simulator storage. The size is 20*1024, or 20 KB. 
- jail.mount_with_data( - Path::new("none"), - Path::new("/"), - "tmpfs", - (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, - "size=20480", - )?; - - let crosvm_ids = add_current_user_to_jail(jail)?; - - let pid = process::id(); - let tpm_pid_dir = format!("/run/vm/tpm.{}", pid); - tpm_storage = Path::new(&tpm_pid_dir).to_owned(); - fs::create_dir_all(&tpm_storage).with_context(|| { - format!("failed to create tpm storage dir {}", tpm_storage.display()) - })?; - let tpm_pid_dir_c = CString::new(tpm_pid_dir).expect("no nul bytes"); - chown(&tpm_pid_dir_c, crosvm_ids.uid, crosvm_ids.gid) - .context("failed to chown tpm storage")?; - - jail.mount_bind(&tpm_storage, &tpm_storage, true)?; - } - None => { - // Path used inside cros_sdk which does not have /run/vm. - tpm_storage = Path::new("/tmp/tpm-simulator").to_owned(); - } - } - - let dev = virtio::Tpm::new(tpm_storage); - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: tpm_jail, - }) -} - -fn create_single_touch_device( - cfg: &Config, - single_touch_spec: &TouchDeviceOption, - idx: u32, -) -> DeviceResult { - let socket = single_touch_spec - .get_path() - .into_unix_stream() - .map_err(|e| { - error!("failed configuring virtio single touch: {:?}", e); - e - })?; - - let (width, height) = single_touch_spec.get_size(); - let dev = virtio::new_single_touch( - idx, - socket, - width, - height, - virtio::base_features(cfg.protected_vm), - ) - .context("failed to set up input device")?; - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "input_device")?, - }) -} - -fn create_multi_touch_device( - cfg: &Config, - multi_touch_spec: &TouchDeviceOption, - idx: u32, -) -> DeviceResult { - let socket = multi_touch_spec - .get_path() - .into_unix_stream() - .map_err(|e| { - error!("failed configuring virtio multi touch: {:?}", e); - e - })?; - - let (width, height) = multi_touch_spec.get_size(); - let dev = virtio::new_multi_touch( - idx, - socket, - width, - height, - virtio::base_features(cfg.protected_vm), - ) - .context("failed to set up input device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "input_device")?, - }) -} - -fn create_trackpad_device( - cfg: &Config, - trackpad_spec: &TouchDeviceOption, - idx: u32, -) -> DeviceResult { - let socket = trackpad_spec.get_path().into_unix_stream().map_err(|e| { - error!("failed configuring virtio trackpad: {:#}", e); - e - })?; - - let (width, height) = trackpad_spec.get_size(); - let dev = virtio::new_trackpad( - idx, - socket, - width, - height, - virtio::base_features(cfg.protected_vm), - ) - .context("failed to set up input device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "input_device")?, - }) -} - -fn create_mouse_device(cfg: &Config, mouse_socket: T, idx: u32) -> DeviceResult { - let socket = mouse_socket.into_unix_stream().map_err(|e| { - error!("failed configuring virtio mouse: {:#}", e); - e - })?; - - let dev = virtio::new_mouse(idx, socket, virtio::base_features(cfg.protected_vm)) - .context("failed to set up input device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "input_device")?, - }) -} - -fn create_keyboard_device( - cfg: &Config, - keyboard_socket: T, - idx: u32, -) -> DeviceResult { - let socket = keyboard_socket.into_unix_stream().map_err(|e| { - error!("failed configuring virtio keyboard: {:#}", e); - e - })?; - - let dev = virtio::new_keyboard(idx, socket, virtio::base_features(cfg.protected_vm)) - .context("failed to set up input device")?; - 
- Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "input_device")?, - }) -} - -fn create_switches_device( - cfg: &Config, - switches_socket: T, - idx: u32, -) -> DeviceResult { - let socket = switches_socket.into_unix_stream().map_err(|e| { - error!("failed configuring virtio switches: {:#}", e); - e - })?; - - let dev = virtio::new_switches(idx, socket, virtio::base_features(cfg.protected_vm)) - .context("failed to set up input device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "input_device")?, - }) -} - -fn create_vinput_device(cfg: &Config, dev_path: &Path) -> DeviceResult { - let dev_file = OpenOptions::new() - .read(true) - .write(true) - .open(dev_path) - .with_context(|| format!("failed to open vinput device {}", dev_path.display()))?; - - let dev = virtio::new_evdev(dev_file, virtio::base_features(cfg.protected_vm)) - .context("failed to set up input device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "input_device")?, - }) -} - -fn create_balloon_device( - cfg: &Config, - tube: Tube, - inflate_tube: Option, - init_balloon_size: u64, -) -> DeviceResult { - let dev = virtio::Balloon::new( - virtio::base_features(cfg.protected_vm), - tube, - inflate_tube, - init_balloon_size, - ) - .context("failed to create balloon")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "balloon_device")?, - }) -} - -/// Generic method for creating a network device. `create_device` is a closure that takes the virtio -/// features and number of queue pairs as parameters, and is responsible for creating the device -/// itself. -fn create_net_device(cfg: &Config, policy: &str, create_device: F) -> DeviceResult -where - F: Fn(u64, u16) -> Result, - T: VirtioDevice + 'static, -{ - let mut vq_pairs = cfg.net_vq_pairs.unwrap_or(1); - let vcpu_count = cfg.vcpu_count.unwrap_or(1); - if vcpu_count < vq_pairs as usize { - warn!("the number of net vq pairs must not exceed the vcpu count, falling back to single queue mode"); - vq_pairs = 1; - } - let features = virtio::base_features(cfg.protected_vm); - - let dev = create_device(features, vq_pairs)?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev) as Box, - jail: simple_jail(cfg, policy)?, - }) -} - -/// Returns a network device created from a new TAP interface configured with `host_ip`, `netmask`, -/// and `mac_address`. -fn create_net_device_from_config( - cfg: &Config, - host_ip: Ipv4Addr, - netmask: Ipv4Addr, - mac_address: MacAddress, -) -> DeviceResult { - let policy = if cfg.vhost_net { - "vhost_net_device" - } else { - "net_device" - }; - - if cfg.vhost_net { - create_net_device(cfg, policy, |features, _vq_pairs| { - virtio::vhost::Net::>::new( - &cfg.vhost_net_device_path, - features, - host_ip, - netmask, - mac_address, - ) - .context("failed to set up vhost networking") - }) - } else { - create_net_device(cfg, policy, |features, vq_pairs| { - virtio::Net::::new(features, host_ip, netmask, mac_address, vq_pairs) - .context("failed to create virtio network device") - }) - } -} - -/// Returns a network device from a file descriptor to a configured TAP interface. -fn create_tap_net_device_from_fd(cfg: &Config, tap_fd: RawDescriptor) -> DeviceResult { - create_net_device(cfg, "net_device", |features, vq_pairs| { - // Safe because we ensure that we get a unique handle to the fd. - let tap = unsafe { - Tap::from_raw_descriptor( - validate_raw_descriptor(tap_fd).context("failed to validate tap descriptor")?, - ) - .context("failed to create tap device")? 
- }; - - virtio::Net::from(features, tap, vq_pairs).context("failed to create tap net device") - }) -} - -/// Returns a network device created by opening the persistent, configured TAP interface `tap_name`. -fn create_tap_net_device_from_name(cfg: &Config, tap_name: &[u8]) -> DeviceResult { - create_net_device(cfg, "net_device", |features, vq_pairs| { - virtio::Net::::new_from_name(features, tap_name, vq_pairs) - .context("failed to create configured virtio network device") - }) -} - -fn create_vhost_user_net_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { - let dev = VhostUserNet::new(virtio::base_features(cfg.protected_vm), &opt.socket) - .context("failed to set up vhost-user net device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. - jail: None, - }) -} - -fn create_vhost_user_vsock_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { - let dev = VhostUserVsock::new(virtio::base_features(cfg.protected_vm), &opt.socket) - .context("failed to set up vhost-user vsock device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. - jail: None, - }) -} - -fn create_vhost_user_wl_device(cfg: &Config, opt: &VhostUserWlOption) -> DeviceResult { - // The crosvm wl device expects us to connect the tube before it will accept a vhost-user - // connection. - let dev = VhostUserWl::new(virtio::base_features(cfg.protected_vm), &opt.socket) - .context("failed to set up vhost-user wl device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. - jail: None, - }) -} +mod device_helpers; +use device_helpers::*; +mod jail_helpers; +use jail_helpers::*; +mod vcpu; #[cfg(feature = "gpu")] -fn create_vhost_user_gpu_device( - cfg: &Config, - opt: &VhostUserOption, - host_tube: Tube, - device_tube: Tube, -) -> DeviceResult { - // The crosvm gpu device expects us to connect the tube before it will accept a vhost-user - // connection. - let dev = VhostUserGpu::new( - virtio::base_features(cfg.protected_vm), - &opt.socket, - host_tube, - device_tube, - ) - .context("failed to set up vhost-user gpu device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. - jail: None, - }) -} - -/// Mirror-mount all the directories in `dirs` into `jail` on a best-effort basis. -/// -/// This function will not return an error if any of the directories in `dirs` is missing. -#[cfg(any(feature = "gpu", feature = "video-decoder", feature = "video-encoder"))] -fn jail_mount_bind_if_exists>( - jail: &mut Minijail, - dirs: &[P], -) -> Result<()> { - for dir in dirs { - let dir_path = Path::new(dir); - if dir_path.exists() { - jail.mount_bind(dir_path, dir_path, false)?; - } - } - - Ok(()) -} - +mod gpu; #[cfg(feature = "gpu")] -fn gpu_jail(cfg: &Config, policy: &str) -> Result> { - match simple_jail(cfg, policy)? { - Some(mut jail) => { - // Create a tmpfs in the device's root directory so that we can bind mount the - // dri directory into it. The size=67108864 is size=64*1024*1024 or size=64MB. - jail.mount_with_data( - Path::new("none"), - Path::new("/"), - "tmpfs", - (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, - "size=67108864", - )?; - - // Device nodes required for DRM. 
- let sys_dev_char_path = Path::new("/sys/dev/char"); - jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?; - let sys_devices_path = Path::new("/sys/devices"); - jail.mount_bind(sys_devices_path, sys_devices_path, false)?; - - let drm_dri_path = Path::new("/dev/dri"); - if drm_dri_path.exists() { - jail.mount_bind(drm_dri_path, drm_dri_path, false)?; - } - - // If the ARM specific devices exist on the host, bind mount them in. - let mali0_path = Path::new("/dev/mali0"); - if mali0_path.exists() { - jail.mount_bind(mali0_path, mali0_path, true)?; - } - - let pvr_sync_path = Path::new("/dev/pvr_sync"); - if pvr_sync_path.exists() { - jail.mount_bind(pvr_sync_path, pvr_sync_path, true)?; - } - - // If the udmabuf driver exists on the host, bind mount it in. - let udmabuf_path = Path::new("/dev/udmabuf"); - if udmabuf_path.exists() { - jail.mount_bind(udmabuf_path, udmabuf_path, true)?; - } - - // Libraries that are required when mesa drivers are dynamically loaded. - jail_mount_bind_if_exists( - &mut jail, - &[ - "/usr/lib", - "/usr/lib64", - "/lib", - "/lib64", - "/usr/share/drirc.d", - "/usr/share/glvnd", - "/usr/share/vulkan", - ], - )?; - - // pvr driver requires read access to /proc/self/task/*/comm. - let proc_path = Path::new("/proc"); - jail.mount( - proc_path, - proc_path, - "proc", - (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RDONLY) as usize, - )?; - - // To enable perfetto tracing, we need to give access to the perfetto service IPC - // endpoints. - let perfetto_path = Path::new("/run/perfetto"); - if perfetto_path.exists() { - jail.mount_bind(perfetto_path, perfetto_path, true)?; - } - - Ok(Some(jail)) - } - None => Ok(None), - } -} - -#[cfg(feature = "gpu")] -struct GpuCacheInfo<'a> { - directory: Option<&'a str>, - environment: Vec<(&'a str, &'a str)>, -} - -#[cfg(feature = "gpu")] -fn get_gpu_cache_info<'a>( - cache_dir: Option<&'a String>, - cache_size: Option<&'a String>, - sandbox: bool, -) -> GpuCacheInfo<'a> { - let mut dir = None; - let mut env = Vec::new(); - - if let Some(cache_dir) = cache_dir { - if !Path::new(cache_dir).exists() { - warn!("shader caching dir {} does not exist", cache_dir); - env.push(("MESA_GLSL_CACHE_DISABLE", "true")); - } else if cfg!(any(target_arch = "arm", target_arch = "aarch64")) && sandbox { - warn!("shader caching not yet supported on ARM with sandbox enabled"); - env.push(("MESA_GLSL_CACHE_DISABLE", "true")); - } else { - dir = Some(cache_dir.as_str()); - - env.push(("MESA_GLSL_CACHE_DISABLE", "false")); - env.push(("MESA_GLSL_CACHE_DIR", cache_dir.as_str())); - if let Some(cache_size) = cache_size { - env.push(("MESA_GLSL_CACHE_MAX_SIZE", cache_size.as_str())); - } - } - } - - GpuCacheInfo { - directory: dir, - environment: env, - } -} - -#[cfg(feature = "gpu")] -fn create_gpu_device( - cfg: &Config, - exit_evt: &Event, - gpu_device_tube: Tube, - resource_bridges: Vec, - wayland_socket_path: Option<&PathBuf>, - x_display: Option, - render_server_fd: Option, - event_devices: Vec, - map_request: Arc>>, -) -> DeviceResult { - let mut display_backends = vec![ - virtio::DisplayBackend::X(x_display), - virtio::DisplayBackend::Stub, - ]; - - let wayland_socket_dirs = cfg - .wayland_socket_paths - .iter() - .map(|(_name, path)| path.parent()) - .collect::>>() - .ok_or_else(|| anyhow!("wayland socket path has no parent or file name"))?; - - if let Some(socket_path) = wayland_socket_path { - display_backends.insert( - 0, - virtio::DisplayBackend::Wayland(Some(socket_path.to_owned())), - ); - } - - let dev = 
virtio::Gpu::new( - exit_evt.try_clone().context("failed to clone event")?, - Some(gpu_device_tube), - resource_bridges, - display_backends, - cfg.gpu_parameters.as_ref().unwrap(), - render_server_fd, - event_devices, - map_request, - cfg.sandbox, - virtio::base_features(cfg.protected_vm), - cfg.wayland_socket_paths.clone(), - ); - - let jail = match gpu_jail(cfg, "gpu_device")? { - Some(mut jail) => { - // Prepare GPU shader disk cache directory. - let (cache_dir, cache_size) = cfg - .gpu_parameters - .as_ref() - .map(|params| (params.cache_path.as_ref(), params.cache_size.as_ref())) - .unwrap(); - let cache_info = get_gpu_cache_info(cache_dir, cache_size, cfg.sandbox); - - if let Some(dir) = cache_info.directory { - jail.mount_bind(dir, dir, true)?; - } - for (key, val) in cache_info.environment { - env::set_var(key, val); - } - - // Bind mount the wayland socket's directory into jail's root. This is necessary since - // each new wayland context must open() the socket. If the wayland socket is ever - // destroyed and remade in the same host directory, new connections will be possible - // without restarting the wayland device. - for dir in &wayland_socket_dirs { - jail.mount_bind(dir, dir, true)?; - } - - add_current_user_to_jail(&mut jail)?; - - Some(jail) - } - None => None, - }; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail, - }) -} - -#[cfg(feature = "gpu")] -fn get_gpu_render_server_environment(cache_info: &GpuCacheInfo) -> Result> { - let mut env = Vec::new(); - - let mut cache_env_keys = HashSet::with_capacity(cache_info.environment.len()); - for (key, val) in cache_info.environment.iter() { - env.push(format!("{}={}", key, val)); - cache_env_keys.insert(*key); - } - - for (key_os, val_os) in env::vars_os() { - // minijail should accept OsStr rather than str... - let into_string_err = |_| anyhow!("invalid environment key/val"); - let key = key_os.into_string().map_err(into_string_err)?; - let val = val_os.into_string().map_err(into_string_err)?; - - if !cache_env_keys.contains(key.as_str()) { - env.push(format!("{}={}", key, val)); - } - } - - Ok(env) -} - -#[cfg(feature = "gpu")] -struct ScopedMinijail(Minijail); - -#[cfg(feature = "gpu")] -impl Drop for ScopedMinijail { - fn drop(&mut self) { - let _ = self.0.kill(); - } -} - -#[cfg(feature = "gpu")] -fn start_gpu_render_server( - cfg: &Config, - render_server_parameters: &GpuRenderServerParameters, -) -> Result<(Minijail, SafeDescriptor)> { - let (server_socket, client_socket) = - UnixSeqpacket::pair().context("failed to create render server socket")?; - - let mut env = None; - let jail = match gpu_jail(cfg, "gpu_render_server")? { - Some(mut jail) => { - let cache_info = get_gpu_cache_info( - render_server_parameters.cache_path.as_ref(), - render_server_parameters.cache_size.as_ref(), - cfg.sandbox, - ); - - if let Some(dir) = cache_info.directory { - jail.mount_bind(dir, dir, true)?; - } - - if !cache_info.environment.is_empty() { - env = Some(get_gpu_render_server_environment(&cache_info)?); - } - - // bind mount /dev/log for syslog - let log_path = Path::new("/dev/log"); - if log_path.exists() { - jail.mount_bind(log_path, log_path, true)?; - } - - // Run as root in the jail to keep capabilities after execve, which is needed for - // mounting to work. All capabilities will be dropped afterwards. 
- add_current_user_as_root_to_jail(&mut jail)?; - - jail - } - None => Minijail::new().context("failed to create jail")?, - }; - - let inheritable_fds = [ - server_socket.as_raw_descriptor(), - libc::STDOUT_FILENO, - libc::STDERR_FILENO, - ]; - - let cmd = &render_server_parameters.path; - let cmd_str = cmd - .to_str() - .ok_or_else(|| anyhow!("invalid render server path"))?; - let fd_str = server_socket.as_raw_descriptor().to_string(); - let args = [cmd_str, "--socket-fd", &fd_str]; - - let mut envp: Option> = None; - if let Some(ref env) = env { - envp = Some(env.iter().map(AsRef::as_ref).collect()); - } - - jail.run_command(minijail::Command::new_for_path( - cmd, - &inheritable_fds, - &args, - envp.as_deref(), - )?) - .context("failed to start gpu render server")?; - - Ok((jail, SafeDescriptor::from(client_socket))) -} - -fn create_wayland_device( - cfg: &Config, - control_tube: Tube, - resource_bridge: Option, -) -> DeviceResult { - let wayland_socket_dirs = cfg - .wayland_socket_paths - .iter() - .map(|(_name, path)| path.parent()) - .collect::>>() - .ok_or_else(|| anyhow!("wayland socket path has no parent or file name"))?; - - let features = virtio::base_features(cfg.protected_vm); - let dev = virtio::Wl::new( - features, - cfg.wayland_socket_paths.clone(), - control_tube, - resource_bridge, - ) - .context("failed to create wayland device")?; - - let jail = match simple_jail(cfg, "wl_device")? { - Some(mut jail) => { - // Create a tmpfs in the device's root directory so that we can bind mount the wayland - // socket directory into it. The size=67108864 is size=64*1024*1024 or size=64MB. - jail.mount_with_data( - Path::new("none"), - Path::new("/"), - "tmpfs", - (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, - "size=67108864", - )?; - - // Bind mount the wayland socket's directory into jail's root. This is necessary since - // each new wayland context must open() the socket. If the wayland socket is ever - // destroyed and remade in the same host directory, new connections will be possible - // without restarting the wayland device. - for dir in &wayland_socket_dirs { - jail.mount_bind(dir, dir, true)?; - } - add_current_user_to_jail(&mut jail)?; - - Some(jail) - } - None => None, - }; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail, - }) -} - -#[cfg(any(feature = "video-decoder", feature = "video-encoder"))] -fn create_video_device( - backend: VideoBackendType, - cfg: &Config, - typ: devices::virtio::VideoDeviceType, - resource_bridge: Tube, -) -> DeviceResult { - let jail = match simple_jail(cfg, "video_device")? { - Some(mut jail) => { - match typ { - #[cfg(feature = "video-decoder")] - devices::virtio::VideoDeviceType::Decoder => add_current_user_to_jail(&mut jail)?, - #[cfg(feature = "video-encoder")] - devices::virtio::VideoDeviceType::Encoder => add_current_user_to_jail(&mut jail)?, - }; - - // Create a tmpfs in the device's root directory so that we can bind mount files. - jail.mount_with_data( - Path::new("none"), - Path::new("/"), - "tmpfs", - (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, - "size=67108864", - )?; - - #[cfg(feature = "libvda")] - // Render node for libvda. 
- if backend == VideoBackendType::Libvda || backend == VideoBackendType::LibvdaVd { - // follow the implementation at: - // https://source.corp.google.com/chromeos_public/src/platform/minigbm/cros_gralloc/cros_gralloc_driver.cc;l=90;bpv=0;cl=c06cc9cccb3cf3c7f9d2aec706c27c34cd6162a0 - const DRM_NUM_NODES: u32 = 63; - const DRM_RENDER_NODE_START: u32 = 128; - for offset in 0..DRM_NUM_NODES { - let path_str = format!("/dev/dri/renderD{}", DRM_RENDER_NODE_START + offset); - let dev_dri_path = Path::new(&path_str); - if !dev_dri_path.exists() { - break; - } - jail.mount_bind(dev_dri_path, dev_dri_path, false)?; - } - } - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - // Device nodes used by libdrm through minigbm in libvda on AMD devices. - let sys_dev_char_path = Path::new("/sys/dev/char"); - jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?; - let sys_devices_path = Path::new("/sys/devices"); - jail.mount_bind(sys_devices_path, sys_devices_path, false)?; - - // Required for loading dri libraries loaded by minigbm on AMD devices. - jail_mount_bind_if_exists(&mut jail, &["/usr/lib64"])?; - } - - // Device nodes required by libchrome which establishes Mojo connection in libvda. - let dev_urandom_path = Path::new("/dev/urandom"); - jail.mount_bind(dev_urandom_path, dev_urandom_path, false)?; - let system_bus_socket_path = Path::new("/run/dbus/system_bus_socket"); - jail.mount_bind(system_bus_socket_path, system_bus_socket_path, true)?; - - Some(jail) - } - None => None, - }; - - Ok(VirtioDeviceStub { - dev: Box::new(devices::virtio::VideoDevice::new( - virtio::base_features(cfg.protected_vm), - typ, - backend, - Some(resource_bridge), - )), - jail, - }) -} - -#[cfg(any(feature = "video-decoder", feature = "video-encoder"))] -fn register_video_device( - backend: VideoBackendType, - devs: &mut Vec, - video_tube: Tube, - cfg: &Config, - typ: devices::virtio::VideoDeviceType, -) -> Result<()> { - devs.push(create_video_device(backend, cfg, typ, video_tube)?); - Ok(()) -} - -fn create_vhost_vsock_device(cfg: &Config, cid: u64) -> DeviceResult { - let features = virtio::base_features(cfg.protected_vm); - - let device_file = match cfg - .vhost_vsock_device - .as_ref() - .unwrap_or(&VhostVsockDeviceParameter::default()) - { - VhostVsockDeviceParameter::Fd(fd) => { - let fd = validate_raw_descriptor(*fd) - .context("failed to validate fd for virtual socker device")?; - // Safe because the `fd` is actually owned by this process and - // we have a unique handle to it. 
- unsafe { File::from_raw_fd(fd) } - } - VhostVsockDeviceParameter::Path(path) => OpenOptions::new() - .read(true) - .write(true) - .custom_flags(libc::O_CLOEXEC | libc::O_NONBLOCK) - .open(path) - .context("failed to open virtual socket device")?, - }; - - let dev = virtio::vhost::Vsock::new(device_file, features, cid) - .context("failed to set up virtual socket device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "vhost_vsock_device")?, - }) -} - -fn create_fs_device( - cfg: &Config, - uid_map: &str, - gid_map: &str, - src: &Path, - tag: &str, - fs_cfg: virtio::fs::passthrough::Config, - device_tube: Tube, -) -> DeviceResult { - let max_open_files = - base::get_max_open_files().context("failed to get max number of open files")?; - let j = if cfg.sandbox { - let seccomp_policy = cfg.seccomp_policy_dir.join("fs_device"); - let config = SandboxConfig { - limit_caps: false, - uid_map: Some(uid_map), - gid_map: Some(gid_map), - log_failures: cfg.seccomp_log_failures, - seccomp_policy: &seccomp_policy, - // We want bind mounts from the parent namespaces to propagate into the fs device's - // namespace. - remount_mode: Some(libc::MS_SLAVE), - }; - create_base_minijail(src, Some(max_open_files), Some(&config))? - } else { - create_base_minijail(src, Some(max_open_files), None)? - }; - - let features = virtio::base_features(cfg.protected_vm); - // TODO(chirantan): Use more than one worker once the kernel driver has been fixed to not panic - // when num_queues > 1. - let dev = virtio::fs::Fs::new(features, tag, 1, fs_cfg, device_tube) - .context("failed to create fs device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: Some(j), - }) -} - -fn create_9p_device( - cfg: &Config, - uid_map: &str, - gid_map: &str, - src: &Path, - tag: &str, - mut p9_cfg: p9::Config, -) -> DeviceResult { - let max_open_files = - base::get_max_open_files().context("failed to get max number of open files")?; - let (jail, root) = if cfg.sandbox { - let seccomp_policy = cfg.seccomp_policy_dir.join("9p_device"); - let config = SandboxConfig { - limit_caps: false, - uid_map: Some(uid_map), - gid_map: Some(gid_map), - log_failures: cfg.seccomp_log_failures, - seccomp_policy: &seccomp_policy, - // We want bind mounts from the parent namespaces to propagate into the 9p server's - // namespace. - remount_mode: Some(libc::MS_SLAVE), - }; - - let jail = create_base_minijail(src, Some(max_open_files), Some(&config))?; - - // The shared directory becomes the root of the device's file system. - let root = Path::new("/"); - (Some(jail), root) - } else { - // There's no mount namespace so we tell the server to treat the source directory as the - // root. 
- (None, src) - }; - - let features = virtio::base_features(cfg.protected_vm); - p9_cfg.root = root.into(); - let dev = virtio::P9::new(features, tag, p9_cfg).context("failed to create 9p device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail, - }) -} - -fn create_pmem_device( - cfg: &Config, - vm: &mut impl Vm, - resources: &mut SystemAllocator, - disk: &DiskOption, - index: usize, - pmem_device_tube: Tube, -) -> DeviceResult { - let fd = open_file(&disk.path, disk.read_only, false /*O_DIRECT*/) - .with_context(|| format!("failed to load disk image {}", disk.path.display()))?; - - let (disk_size, arena_size) = { - let metadata = std::fs::metadata(&disk.path).with_context(|| { - format!("failed to get disk image {} metadata", disk.path.display()) - })?; - let disk_len = metadata.len(); - // Linux requires pmem region sizes to be 2 MiB aligned. Linux will fill any partial page - // at the end of an mmap'd file and won't write back beyond the actual file length, but if - // we just align the size of the file to 2 MiB then access beyond the last page of the - // mapped file will generate SIGBUS. So use a memory mapping arena that will provide - // padding up to 2 MiB. - let alignment = 2 * 1024 * 1024; - let align_adjust = if disk_len % alignment != 0 { - alignment - (disk_len % alignment) - } else { - 0 - }; - ( - disk_len, - disk_len - .checked_add(align_adjust) - .ok_or_else(|| anyhow!("pmem device image too big"))?, - ) - }; - - let protection = { - if disk.read_only { - Protection::read() - } else { - Protection::read_write() - } - }; - - let arena = { - // Conversion from u64 to usize may fail on 32bit system. - let arena_size = usize::try_from(arena_size).context("pmem device image too big")?; - let disk_size = usize::try_from(disk_size).context("pmem device image too big")?; - - let mut arena = - MemoryMappingArena::new(arena_size).context("failed to reserve pmem memory")?; - arena - .add_fd_offset_protection(0, disk_size, &fd, 0, protection) - .context("failed to reserve pmem memory")?; - - // If the disk is not a multiple of the page size, the OS will fill the remaining part - // of the page with zeroes. However, the anonymous mapping added below must start on a - // page boundary, so round up the size before calculating the offset of the anon region. - let disk_size = round_up_to_page_size(disk_size); - - if arena_size > disk_size { - // Add an anonymous region with the same protection as the disk mapping if the arena - // size was aligned. - arena - .add_anon_protection(disk_size, arena_size - disk_size, protection) - .context("failed to reserve pmem padding")?; - } - arena - }; - - let mapping_address = resources - .mmio_allocator(MmioType::High) - .reverse_allocate_with_align( - arena_size, - Alloc::PmemDevice(index), - format!("pmem_disk_image_{}", index), - // Linux kernel requires pmem namespaces to be 128 MiB aligned. 
- 128 * 1024 * 1024, /* 128 MiB */ - ) - .context("failed to allocate memory for pmem device")?; - - let slot = vm - .add_memory_region( - GuestAddress(mapping_address), - Box::new(arena), - /* read_only = */ disk.read_only, - /* log_dirty_pages = */ false, - ) - .context("failed to add pmem device memory")?; - - let dev = virtio::Pmem::new( - virtio::base_features(cfg.protected_vm), - fd, - GuestAddress(mapping_address), - slot, - arena_size, - Some(pmem_device_tube), - ) - .context("failed to create pmem device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev) as Box, - jail: simple_jail(cfg, "pmem_device")?, - }) -} - -fn create_iommu_device( - cfg: &Config, - phys_max_addr: u64, - endpoints: BTreeMap>>, -) -> DeviceResult { - let dev = virtio::Iommu::new( - virtio::base_features(cfg.protected_vm), - endpoints, - phys_max_addr, - ) - .context("failed to create IOMMU device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "iommu_device")?, - }) -} - -fn create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult { - let mut keep_rds = Vec::new(); - let evt = Event::new().context("failed to create event")?; - let dev = param - .create_serial_device::(cfg.protected_vm, &evt, &mut keep_rds) - .context("failed to create console device")?; - - let jail = match simple_jail(cfg, "serial")? { - Some(mut jail) => { - // Create a tmpfs in the device's root directory so that we can bind mount the - // log socket directory into it. - // The size=67108864 is size=64*1024*1024 or size=64MB. - jail.mount_with_data( - Path::new("none"), - Path::new("/"), - "tmpfs", - (libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_NOSUID) as usize, - "size=67108864", - )?; - add_current_user_to_jail(&mut jail)?; - let res = param.add_bind_mounts(&mut jail); - if res.is_err() { - error!("failed to add bind mounts for console device"); - } - Some(jail) - } - None => None, - }; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail, // TODO(dverkamp): use a separate policy for console? - }) -} - -#[cfg(feature = "audio")] -fn create_sound_device(path: &Path, cfg: &Config) -> DeviceResult { - let dev = virtio::new_sound(path, virtio::base_features(cfg.protected_vm)) - .context("failed to create sound device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "vios_audio_device")?, - }) -} +use gpu::*; // gpu_device_tube is not used when GPU support is disabled. 
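The hunks above swap the inlined helper definitions for module declarations. Below is a minimal sketch of how the resulting `src/linux/mod.rs` ties the new submodules together; the `mod`/`use` lines are taken from this patch, while the per-module grouping comments are inferred from the code being moved and are illustrative rather than part of the patch itself.

    // src/linux/mod.rs (sketch of the new module wiring)
    mod device_helpers;   // TaggedControlTube, IntoUnixStream, the create_*_device helpers
    use device_helpers::*;

    mod jail_helpers;     // presumably SandboxConfig, create_base_minijail, simple_jail,
    use jail_helpers::*;  // and add_current_user_to_jail, all removed from linux.rs here

    mod vcpu;             // setup_vcpu_signal_handler, run_vcpu, kick_all_vcpus

    #[cfg(feature = "gpu")]
    mod gpu;              // presumably gpu_jail, create_gpu_device, start_gpu_render_server
    #[cfg(feature = "gpu")]
    use gpu::*;

Later hunks in this patch keep the vcpu helpers module-qualified at their call sites (for example `vcpu::setup_vcpu_signal_handler`, `vcpu::run_vcpu`, and `vcpu::kick_all_vcpus`), while the device, jail, and gpu helpers stay in scope through the glob imports.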
#[cfg_attr(not(feature = "gpu"), allow(unused_variables))] @@ -1710,8 +281,8 @@ fn create_virtio_devices( #[cfg(feature = "gpu")] { if let Some(gpu_parameters) = &cfg.gpu_parameters { - let mut gpu_display_w = DEFAULT_DISPLAY_WIDTH; - let mut gpu_display_h = DEFAULT_DISPLAY_HEIGHT; + let mut gpu_display_w = virtio::DEFAULT_DISPLAY_WIDTH; + let mut gpu_display_h = virtio::DEFAULT_DISPLAY_HEIGHT; if !gpu_parameters.displays.is_empty() { gpu_display_w = gpu_parameters.displays[0].width; gpu_display_h = gpu_parameters.displays[0].height; @@ -1858,111 +429,6 @@ fn create_virtio_devices( Ok(devs) } -fn create_vfio_device( - cfg: &Config, - vm: &impl Vm, - resources: &mut SystemAllocator, - control_tubes: &mut Vec, - vfio_path: &Path, - bus_num: Option, - iommu_endpoints: &mut BTreeMap>>, - coiommu_endpoints: Option<&mut Vec>, - iommu_dev: IommuDevType, -) -> DeviceResult<(Box, Option)> { - let vfio_container = VfioCommonSetup::vfio_get_container(iommu_dev, Some(vfio_path)) - .context("failed to get vfio container")?; - - // create MSI, MSI-X, and Mem request sockets for each vfio device - let (vfio_host_tube_msi, vfio_device_tube_msi) = - Tube::pair().context("failed to create tube")?; - control_tubes.push(TaggedControlTube::VmIrq(vfio_host_tube_msi)); - - let (vfio_host_tube_msix, vfio_device_tube_msix) = - Tube::pair().context("failed to create tube")?; - control_tubes.push(TaggedControlTube::VmIrq(vfio_host_tube_msix)); - - let (vfio_host_tube_mem, vfio_device_tube_mem) = - Tube::pair().context("failed to create tube")?; - control_tubes.push(TaggedControlTube::VmMemory(vfio_host_tube_mem)); - - let hotplug = bus_num.is_some(); - let vfio_device_tube_vm = if hotplug { - let (vfio_host_tube_vm, device_tube_vm) = Tube::pair().context("failed to create tube")?; - control_tubes.push(TaggedControlTube::Vm(vfio_host_tube_vm)); - Some(device_tube_vm) - } else { - None - }; - - let vfio_device = VfioDevice::new_passthrough( - &vfio_path, - vm, - vfio_container.clone(), - iommu_dev != IommuDevType::NoIommu, - ) - .context("failed to create vfio device")?; - let mut vfio_pci_device = Box::new(VfioPciDevice::new( - vfio_device, - bus_num, - vfio_device_tube_msi, - vfio_device_tube_msix, - vfio_device_tube_mem, - vfio_device_tube_vm, - )); - // early reservation for pass-through PCI devices. 
- let endpoint_addr = vfio_pci_device - .allocate_address(resources) - .context("failed to allocate resources early for vfio pci dev")?; - - match iommu_dev { - IommuDevType::NoIommu => {} - IommuDevType::VirtioIommu => { - iommu_endpoints.insert(endpoint_addr.to_u32(), vfio_container); - } - IommuDevType::CoIommu => { - if let Some(endpoints) = coiommu_endpoints { - endpoints.push(endpoint_addr.to_u32() as u16); - } else { - bail!("Missed coiommu_endpoints vector to store the endpoint addr"); - } - } - } - - if hotplug { - Ok((vfio_pci_device, None)) - } else { - Ok((vfio_pci_device, simple_jail(cfg, "vfio_device")?)) - } -} - -fn create_vfio_platform_device( - cfg: &Config, - vm: &impl Vm, - _resources: &mut SystemAllocator, - control_tubes: &mut Vec, - vfio_path: &Path, - _endpoints: &mut BTreeMap>>, - iommu_dev: IommuDevType, -) -> DeviceResult<(VfioPlatformDevice, Option)> { - let vfio_container = VfioCommonSetup::vfio_get_container(iommu_dev, Some(vfio_path)) - .context("Failed to create vfio device")?; - - let (vfio_host_tube_mem, vfio_device_tube_mem) = - Tube::pair().context("failed to create tube")?; - control_tubes.push(TaggedControlTube::VmMemory(vfio_host_tube_mem)); - - let vfio_device = VfioDevice::new_passthrough( - &vfio_path, - vm, - vfio_container, - iommu_dev != IommuDevType::NoIommu, - ) - .context("Failed to create vfio device")?; - let vfio_plat_dev = VfioPlatformDevice::new(vfio_device, vfio_device_tube_mem); - - Ok((vfio_plat_dev, simple_jail(cfg, "vfio_platform_device")?)) -} - fn create_devices( cfg: &Config, vm: &mut impl Vm, @@ -2195,636 +661,6 @@ fn create_file_backed_mappings( Ok(()) } -#[derive(Copy, Clone)] -#[cfg_attr(not(feature = "tpm"), allow(dead_code))] -struct Ids { - uid: uid_t, - gid: gid_t, -} - -// Set the uid/gid for the jailed process and give a basic id map. This is -// required for bind mounts to work. -fn add_current_user_to_jail(jail: &mut Minijail) -> Result { - let crosvm_uid = geteuid(); - let crosvm_gid = getegid(); - - jail.uidmap(&format!("{0} {0} 1", crosvm_uid)) - .context("error setting UID map")?; - jail.gidmap(&format!("{0} {0} 1", crosvm_gid)) - .context("error setting GID map")?; - - if crosvm_uid != 0 { - jail.change_uid(crosvm_uid); - } - if crosvm_gid != 0 { - jail.change_gid(crosvm_gid); - } - - Ok(Ids { - uid: crosvm_uid, - gid: crosvm_gid, - }) -} - -fn add_current_user_as_root_to_jail(jail: &mut Minijail) -> Result { - let crosvm_uid = geteuid(); - let crosvm_gid = getegid(); - jail.uidmap(&format!("0 {0} 1", crosvm_uid)) - .context("error setting UID map")?; - jail.gidmap(&format!("0 {0} 1", crosvm_gid)) - .context("error setting GID map")?; - - Ok(Ids { - uid: crosvm_uid, - gid: crosvm_gid, - }) -} - -trait IntoUnixStream { - fn into_unix_stream(self) -> Result; -} - -impl<'a> IntoUnixStream for &'a Path { - fn into_unix_stream(self) -> Result { - if let Some(fd) = safe_descriptor_from_path(self).context("failed to open event device")? { - Ok(fd.into()) - } else { - UnixStream::connect(self).context("failed to open event device") - } - } -} -impl<'a> IntoUnixStream for &'a PathBuf { - fn into_unix_stream(self) -> Result { - self.as_path().into_unix_stream() - } -} - -impl IntoUnixStream for UnixStream { - fn into_unix_stream(self) -> Result { - Ok(self) - } -} - -fn setup_vcpu_signal_handler(use_hypervisor_signals: bool) -> Result<()> { - if use_hypervisor_signals { - unsafe { - extern "C" fn handle_signal(_: c_int) {} - // Our signal handler does nothing and is trivially async signal safe. 
- register_rt_signal_handler(SIGRTMIN() + 0, handle_signal) - .context("error registering signal handler")?; - } - block_signal(SIGRTMIN() + 0).context("failed to block signal")?; - } else { - unsafe { - extern "C" fn handle_signal(_: c_int) { - T::set_local_immediate_exit(true); - } - register_rt_signal_handler(SIGRTMIN() + 0, handle_signal::) - .context("error registering signal handler")?; - } - } - Ok(()) -} - -// Sets up a vcpu and converts it into a runnable vcpu. -fn runnable_vcpu( - cpu_id: usize, - kvm_vcpu_id: usize, - vcpu: Option, - vm: impl VmArch, - irq_chip: &mut dyn IrqChipArch, - vcpu_count: usize, - run_rt: bool, - vcpu_affinity: Vec, - no_smt: bool, - has_bios: bool, - use_hypervisor_signals: bool, - enable_per_vm_core_scheduling: bool, - host_cpu_topology: bool, - vcpu_cgroup_tasks_file: Option, -) -> Result<(V, VcpuRunHandle)> -where - V: VcpuArch, -{ - let mut vcpu = match vcpu { - Some(v) => v, - None => { - // If vcpu is None, it means this arch/hypervisor requires create_vcpu to be called from - // the vcpu thread. - match vm - .create_vcpu(kvm_vcpu_id) - .context("failed to create vcpu")? - .downcast::() - { - Ok(v) => *v, - Err(_) => panic!("VM created wrong type of VCPU"), - } - } - }; - - irq_chip - .add_vcpu(cpu_id, &vcpu) - .context("failed to add vcpu to irq chip")?; - - if !vcpu_affinity.is_empty() { - if let Err(e) = set_cpu_affinity(vcpu_affinity) { - error!("Failed to set CPU affinity: {}", e); - } - } - - Arch::configure_vcpu( - &vm, - vm.get_hypervisor(), - irq_chip, - &mut vcpu, - cpu_id, - vcpu_count, - has_bios, - no_smt, - host_cpu_topology, - ) - .context("failed to configure vcpu")?; - - if !enable_per_vm_core_scheduling { - // Do per-vCPU core scheduling by setting a unique cookie to each vCPU. - if let Err(e) = enable_core_scheduling() { - error!("Failed to enable core scheduling: {}", e); - } - } - - // Move vcpu thread to cgroup - if let Some(mut f) = vcpu_cgroup_tasks_file { - f.write_all(base::gettid().to_string().as_bytes()) - .context("failed to write vcpu tid to cgroup tasks")?; - } - - if run_rt { - const DEFAULT_VCPU_RT_LEVEL: u16 = 6; - if let Err(e) = set_rt_prio_limit(u64::from(DEFAULT_VCPU_RT_LEVEL)) - .and_then(|_| set_rt_round_robin(i32::from(DEFAULT_VCPU_RT_LEVEL))) - { - warn!("Failed to set vcpu to real time: {}", e); - } - } - - if use_hypervisor_signals { - let mut v = get_blocked_signals().context("failed to retrieve signal mask for vcpu")?; - v.retain(|&x| x != SIGRTMIN() + 0); - vcpu.set_signal_mask(&v) - .context("failed to set the signal mask for vcpu")?; - } - - let vcpu_run_handle = vcpu - .take_run_handle(Some(SIGRTMIN() + 0)) - .context("failed to set thread id for vcpu")?; - - Ok((vcpu, vcpu_run_handle)) -} - -#[cfg(all(target_arch = "x86_64", feature = "gdb"))] -fn handle_debug_msg( - cpu_id: usize, - vcpu: &V, - guest_mem: &GuestMemory, - d: VcpuDebug, - reply_tube: &mpsc::Sender, -) -> Result<()> -where - V: VcpuArch + 'static, -{ - match d { - VcpuDebug::ReadRegs => { - let msg = VcpuDebugStatusMessage { - cpu: cpu_id as usize, - msg: VcpuDebugStatus::RegValues( - Arch::debug_read_registers(vcpu as &V) - .context("failed to handle a gdb ReadRegs command")?, - ), - }; - reply_tube - .send(msg) - .context("failed to send a debug status to GDB thread") - } - VcpuDebug::WriteRegs(regs) => { - Arch::debug_write_registers(vcpu as &V, ®s) - .context("failed to handle a gdb WriteRegs command")?; - reply_tube - .send(VcpuDebugStatusMessage { - cpu: cpu_id as usize, - msg: VcpuDebugStatus::CommandComplete, - }) - 
.context("failed to send a debug status to GDB thread") - } - VcpuDebug::ReadMem(vaddr, len) => { - let msg = VcpuDebugStatusMessage { - cpu: cpu_id as usize, - msg: VcpuDebugStatus::MemoryRegion( - Arch::debug_read_memory(vcpu as &V, guest_mem, vaddr, len) - .unwrap_or(Vec::new()), - ), - }; - reply_tube - .send(msg) - .context("failed to send a debug status to GDB thread") - } - VcpuDebug::WriteMem(vaddr, buf) => { - Arch::debug_write_memory(vcpu as &V, guest_mem, vaddr, &buf) - .context("failed to handle a gdb WriteMem command")?; - reply_tube - .send(VcpuDebugStatusMessage { - cpu: cpu_id as usize, - msg: VcpuDebugStatus::CommandComplete, - }) - .context("failed to send a debug status to GDB thread") - } - VcpuDebug::EnableSinglestep => { - Arch::debug_enable_singlestep(vcpu as &V) - .context("failed to handle a gdb EnableSingleStep command")?; - reply_tube - .send(VcpuDebugStatusMessage { - cpu: cpu_id as usize, - msg: VcpuDebugStatus::CommandComplete, - }) - .context("failed to send a debug status to GDB thread") - } - VcpuDebug::SetHwBreakPoint(addrs) => { - Arch::debug_set_hw_breakpoints(vcpu as &V, &addrs) - .context("failed to handle a gdb SetHwBreakPoint command")?; - reply_tube - .send(VcpuDebugStatusMessage { - cpu: cpu_id as usize, - msg: VcpuDebugStatus::CommandComplete, - }) - .context("failed to send a debug status to GDB thread") - } - } -} - -fn run_vcpu( - cpu_id: usize, - kvm_vcpu_id: usize, - vcpu: Option, - vm: impl VmArch + 'static, - mut irq_chip: Box, - vcpu_count: usize, - run_rt: bool, - vcpu_affinity: Vec, - delay_rt: bool, - no_smt: bool, - start_barrier: Arc, - has_bios: bool, - mut io_bus: devices::Bus, - mut mmio_bus: devices::Bus, - exit_evt: Event, - reset_evt: Event, - crash_evt: Event, - requires_pvclock_ctrl: bool, - from_main_tube: mpsc::Receiver, - use_hypervisor_signals: bool, - #[cfg(all(target_arch = "x86_64", feature = "gdb"))] to_gdb_tube: Option< - mpsc::Sender, - >, - enable_per_vm_core_scheduling: bool, - host_cpu_topology: bool, - vcpu_cgroup_tasks_file: Option, -) -> Result> -where - V: VcpuArch + 'static, -{ - thread::Builder::new() - .name(format!("crosvm_vcpu{}", cpu_id)) - .spawn(move || { - // The VCPU thread must trigger either `exit_evt` or `reset_event` in all paths. A - // `ScopedEvent`'s Drop implementation ensures that the `exit_evt` will be sent if - // anything happens before we get to writing the final event. 
- let scoped_exit_evt = ScopedEvent::from(exit_evt); - - #[cfg(all(target_arch = "x86_64", feature = "gdb"))] - let guest_mem = vm.get_memory().clone(); - let runnable_vcpu = runnable_vcpu( - cpu_id, - kvm_vcpu_id, - vcpu, - vm, - irq_chip.as_mut(), - vcpu_count, - run_rt && !delay_rt, - vcpu_affinity, - no_smt, - has_bios, - use_hypervisor_signals, - enable_per_vm_core_scheduling, - host_cpu_topology, - vcpu_cgroup_tasks_file, - ); - - start_barrier.wait(); - - let (vcpu, vcpu_run_handle) = match runnable_vcpu { - Ok(v) => v, - Err(e) => { - error!("failed to start vcpu {}: {:#}", cpu_id, e); - return; - } - }; - - #[allow(unused_mut)] - let mut run_mode = VmRunMode::Running; - #[cfg(all(target_arch = "x86_64", feature = "gdb"))] - if to_gdb_tube.is_some() { - // Wait until a GDB client attaches - run_mode = VmRunMode::Breakpoint; - } - - mmio_bus.set_access_id(cpu_id); - io_bus.set_access_id(cpu_id); - - let exit_reason = vcpu_loop( - run_mode, - cpu_id, - vcpu, - vcpu_run_handle, - irq_chip, - run_rt, - delay_rt, - io_bus, - mmio_bus, - requires_pvclock_ctrl, - from_main_tube, - use_hypervisor_signals, - #[cfg(all(target_arch = "x86_64", feature = "gdb"))] - to_gdb_tube, - #[cfg(all(target_arch = "x86_64", feature = "gdb"))] - guest_mem, - ); - - let exit_evt = scoped_exit_evt.into(); - let final_event = match exit_reason { - ExitState::Stop => exit_evt, - ExitState::Reset => reset_evt, - ExitState::Crash => crash_evt, - }; - if let Err(e) = final_event.write(1) { - error!( - "failed to send final event {:?} on vcpu {}: {}", - final_event, cpu_id, e - ) - } - }) - .context("failed to spawn VCPU thread") -} - -fn vcpu_loop( - mut run_mode: VmRunMode, - cpu_id: usize, - vcpu: V, - vcpu_run_handle: VcpuRunHandle, - irq_chip: Box, - run_rt: bool, - delay_rt: bool, - io_bus: devices::Bus, - mmio_bus: devices::Bus, - requires_pvclock_ctrl: bool, - from_main_tube: mpsc::Receiver, - use_hypervisor_signals: bool, - #[cfg(all(target_arch = "x86_64", feature = "gdb"))] to_gdb_tube: Option< - mpsc::Sender, - >, - #[cfg(all(target_arch = "x86_64", feature = "gdb"))] guest_mem: GuestMemory, -) -> ExitState -where - V: VcpuArch + 'static, -{ - let mut interrupted_by_signal = false; - - loop { - // Start by checking for messages to process and the run state of the CPU. - // An extra check here for Running so there isn't a need to call recv unless a - // message is likely to be ready because a signal was sent. - if interrupted_by_signal || run_mode != VmRunMode::Running { - 'state_loop: loop { - // Tries to get a pending message without blocking first. - let msg = match from_main_tube.try_recv() { - Ok(m) => m, - Err(mpsc::TryRecvError::Empty) if run_mode == VmRunMode::Running => { - // If the VM is running and no message is pending, the state won't - // change. - break 'state_loop; - } - Err(mpsc::TryRecvError::Empty) => { - // If the VM is not running, wait until a message is ready. - match from_main_tube.recv() { - Ok(m) => m, - Err(mpsc::RecvError) => { - error!("Failed to read from main tube in vcpu"); - return ExitState::Crash; - } - } - } - Err(mpsc::TryRecvError::Disconnected) => { - error!("Failed to read from main tube in vcpu"); - return ExitState::Crash; - } - }; - - // Collect all pending messages. 
- let mut messages = vec![msg]; - messages.append(&mut from_main_tube.try_iter().collect()); - - for msg in messages { - match msg { - VcpuControl::RunState(new_mode) => { - run_mode = new_mode; - match run_mode { - VmRunMode::Running => break 'state_loop, - VmRunMode::Suspending => { - // On KVM implementations that use a paravirtualized - // clock (e.g. x86), a flag must be set to indicate to - // the guest kernel that a vCPU was suspended. The guest - // kernel will use this flag to prevent the soft lockup - // detection from triggering when this vCPU resumes, - // which could happen days later in realtime. - if requires_pvclock_ctrl { - if let Err(e) = vcpu.pvclock_ctrl() { - error!( - "failed to tell hypervisor vcpu {} is suspending: {}", - cpu_id, e - ); - } - } - } - VmRunMode::Breakpoint => {} - VmRunMode::Exiting => return ExitState::Stop, - } - } - #[cfg(all(target_arch = "x86_64", feature = "gdb"))] - VcpuControl::Debug(d) => match &to_gdb_tube { - Some(ref ch) => { - if let Err(e) = handle_debug_msg(cpu_id, &vcpu, &guest_mem, d, ch) { - error!("Failed to handle gdb message: {}", e); - } - } - None => { - error!("VcpuControl::Debug received while GDB feature is disabled: {:?}", d); - } - }, - VcpuControl::MakeRT => { - if run_rt && delay_rt { - info!("Making vcpu {} RT\n", cpu_id); - const DEFAULT_VCPU_RT_LEVEL: u16 = 6; - if let Err(e) = set_rt_prio_limit(u64::from(DEFAULT_VCPU_RT_LEVEL)) - .and_then(|_| { - set_rt_round_robin(i32::from(DEFAULT_VCPU_RT_LEVEL)) - }) - { - warn!("Failed to set vcpu to real time: {}", e); - } - } - } - } - } - } - } - - interrupted_by_signal = false; - - // Vcpus may have run a HLT instruction, which puts them into a state other than - // VcpuRunState::Runnable. In that case, this call to wait_until_runnable blocks - // until either the irqchip receives an interrupt for this vcpu, or until the main - // thread kicks this vcpu as a result of some VmControl operation. In most IrqChip - // implementations HLT instructions do not make it to crosvm, and thus this is a - // no-op that always returns VcpuRunState::Runnable. - match irq_chip.wait_until_runnable(&vcpu) { - Ok(VcpuRunState::Runnable) => {} - Ok(VcpuRunState::Interrupted) => interrupted_by_signal = true, - Err(e) => error!( - "error waiting for vcpu {} to become runnable: {}", - cpu_id, e - ), - } - - if !interrupted_by_signal { - match vcpu.run(&vcpu_run_handle) { - Ok(VcpuExit::IoIn { port, mut size }) => { - let mut data = [0; 8]; - if size > data.len() { - error!( - "unsupported IoIn size of {} bytes at port {:#x}", - size, port - ); - size = data.len(); - } - io_bus.read(port as u64, &mut data[..size]); - if let Err(e) = vcpu.set_data(&data[..size]) { - error!( - "failed to set return data for IoIn at port {:#x}: {}", - port, e - ); - } - } - Ok(VcpuExit::IoOut { - port, - mut size, - data, - }) => { - if size > data.len() { - error!( - "unsupported IoOut size of {} bytes at port {:#x}", - size, port - ); - size = data.len(); - } - io_bus.write(port as u64, &data[..size]); - } - Ok(VcpuExit::MmioRead { address, size }) => { - let mut data = [0; 8]; - mmio_bus.read(address, &mut data[..size]); - // Setting data for mmio can not fail. 
- let _ = vcpu.set_data(&data[..size]); - } - Ok(VcpuExit::MmioWrite { - address, - size, - data, - }) => { - mmio_bus.write(address, &data[..size]); - } - Ok(VcpuExit::IoapicEoi { vector }) => { - if let Err(e) = irq_chip.broadcast_eoi(vector) { - error!( - "failed to broadcast eoi {} on vcpu {}: {}", - vector, cpu_id, e - ); - } - } - Ok(VcpuExit::IrqWindowOpen) => {} - Ok(VcpuExit::Hlt) => irq_chip.halted(cpu_id), - Ok(VcpuExit::Shutdown) => return ExitState::Stop, - Ok(VcpuExit::FailEntry { - hardware_entry_failure_reason, - }) => { - error!("vcpu hw run failure: {:#x}", hardware_entry_failure_reason); - return ExitState::Crash; - } - Ok(VcpuExit::SystemEventShutdown) => { - info!("system shutdown event on vcpu {}", cpu_id); - return ExitState::Stop; - } - Ok(VcpuExit::SystemEventReset) => { - info!("system reset event"); - return ExitState::Reset; - } - Ok(VcpuExit::SystemEventCrash) => { - info!("system crash event on vcpu {}", cpu_id); - return ExitState::Stop; - } - #[rustfmt::skip] Ok(VcpuExit::Debug { .. }) => { - #[cfg(all(target_arch = "x86_64", feature = "gdb"))] - { - let msg = VcpuDebugStatusMessage { - cpu: cpu_id as usize, - msg: VcpuDebugStatus::HitBreakPoint, - }; - if let Some(ref ch) = to_gdb_tube { - if let Err(e) = ch.send(msg) { - error!("failed to notify breakpoint to GDB thread: {}", e); - return ExitState::Crash; - } - } - run_mode = VmRunMode::Breakpoint; - } - } - Ok(r) => warn!("unexpected vcpu exit: {:?}", r), - Err(e) => match e.errno() { - libc::EINTR => interrupted_by_signal = true, - libc::EAGAIN => {} - _ => { - error!("vcpu hit unknown error: {}", e); - return ExitState::Crash; - } - }, - } - } - - if interrupted_by_signal { - if use_hypervisor_signals { - // Try to clear the signal that we use to kick VCPU if it is pending before - // attempting to handle pause requests. - if let Err(e) = clear_signal(SIGRTMIN() + 0) { - error!("failed to clear pending signal: {}", e); - return ExitState::Crash; - } - } else { - vcpu.set_immediate_exit(false); - } - } - - if let Err(e) = irq_chip.inject_interrupts(&vcpu) { - error!("failed to inject interrupts for vcpu {}: {}", cpu_id, e); - } - } -} - fn setup_vm_components(cfg: &Config) -> Result { let initrd_image = if let Some(initrd_path) = &cfg.initrd_path { Some( @@ -3453,24 +1289,6 @@ fn handle_vfio_command( } } -/// Signals all running VCPUs to vmexit, sends VcpuControl message to each VCPU tube, and tells -/// `irq_chip` to stop blocking halted VCPUs. The channel message is set first because both the -/// signal and the irq_chip kick could cause the VCPU thread to continue through the VCPU run -/// loop. 
-fn kick_all_vcpus( - vcpu_handles: &[(JoinHandle<()>, mpsc::Sender)], - irq_chip: &dyn IrqChip, - message: VcpuControl, -) { - for (handle, tube) in vcpu_handles { - if let Err(e) = tube.send(message.clone()) { - error!("failed to send VcpuControl: {}", e); - } - let _ = handle.kill(SIGRTMIN() + 0); - } - irq_chip.kick_halted_vcpus(); -} - fn run_control( mut linux: RunnableLinuxVm, mut sys_allocator: SystemAllocator, @@ -3555,7 +1373,7 @@ fn run_control( .vm .get_hypervisor() .check_capability(HypervisorCap::ImmediateExit); - setup_vcpu_signal_handler::(use_hypervisor_signals)?; + vcpu::setup_vcpu_signal_handler::(use_hypervisor_signals)?; let vcpus: Vec> = match linux.vcpus.take() { Some(vec) => vec.into_iter().map(Some).collect(), @@ -3586,7 +1404,7 @@ fn run_control( Some(VcpuAffinity::PerVcpu(mut m)) => m.remove(&cpu_id).unwrap_or_default(), None => Default::default(), }; - let handle = run_vcpu( + let handle = vcpu::run_vcpu( cpu_id, kvm_vcpu_ids[cpu_id], vcpu, @@ -3683,7 +1501,7 @@ fn run_control( Token::Suspend => { info!("VM requested suspend"); linux.suspend_evt.read().unwrap(); - kick_all_vcpus( + vcpu::kick_all_vcpus( &vcpu_handles, linux.irq_chip.as_irq_chip(), VcpuControl::RunState(VmRunMode::Suspending), @@ -3776,7 +1594,7 @@ fn run_control( dev.lock().resume_imminent(); } } - kick_all_vcpus( + vcpu::kick_all_vcpus( &vcpu_handles, linux.irq_chip.as_irq_chip(), VcpuControl::RunState(other), @@ -3961,7 +1779,7 @@ fn run_control( } } - kick_all_vcpus( + vcpu::kick_all_vcpus( &vcpu_handles, linux.irq_chip.as_irq_chip(), VcpuControl::RunState(VmRunMode::Exiting), diff --git a/src/linux/vcpu.rs b/src/linux/vcpu.rs new file mode 100644 index 0000000000..583ecec03a --- /dev/null +++ b/src/linux/vcpu.rs @@ -0,0 +1,615 @@ +// Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +use std::fs::File; +use std::io::prelude::*; +use std::sync::{mpsc, Arc, Barrier}; + +use std::thread; +use std::thread::JoinHandle; + +use libc::{self, c_int}; + +use anyhow::{Context, Result}; +use base::*; +use devices::{self, IrqChip, VcpuRunState}; +use hypervisor::{Vcpu, VcpuExit, VcpuRunHandle}; +use vm_control::*; +#[cfg(all(target_arch = "x86_64", feature = "gdb"))] +use vm_memory::GuestMemory; + +use arch::{self, LinuxArch}; + +#[cfg(any(target_arch = "arm", target_arch = "aarch64"))] +use { + aarch64::AArch64 as Arch, + devices::IrqChipAArch64 as IrqChipArch, + hypervisor::{VcpuAArch64 as VcpuArch, VmAArch64 as VmArch}, +}; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use { + devices::IrqChipX86_64 as IrqChipArch, + hypervisor::{VcpuX86_64 as VcpuArch, VmX86_64 as VmArch}, + x86_64::X8664arch as Arch, +}; + +use super::ExitState; + +pub fn setup_vcpu_signal_handler(use_hypervisor_signals: bool) -> Result<()> { + if use_hypervisor_signals { + unsafe { + extern "C" fn handle_signal(_: c_int) {} + // Our signal handler does nothing and is trivially async signal safe. + register_rt_signal_handler(SIGRTMIN() + 0, handle_signal) + .context("error registering signal handler")?; + } + block_signal(SIGRTMIN() + 0).context("failed to block signal")?; + } else { + unsafe { + extern "C" fn handle_signal(_: c_int) { + T::set_local_immediate_exit(true); + } + register_rt_signal_handler(SIGRTMIN() + 0, handle_signal::) + .context("error registering signal handler")?; + } + } + Ok(()) +} + +// Sets up a vcpu and converts it into a runnable vcpu. 
+pub fn runnable_vcpu( + cpu_id: usize, + kvm_vcpu_id: usize, + vcpu: Option, + vm: impl VmArch, + irq_chip: &mut dyn IrqChipArch, + vcpu_count: usize, + run_rt: bool, + vcpu_affinity: Vec, + no_smt: bool, + has_bios: bool, + use_hypervisor_signals: bool, + enable_per_vm_core_scheduling: bool, + host_cpu_topology: bool, + vcpu_cgroup_tasks_file: Option, +) -> Result<(V, VcpuRunHandle)> +where + V: VcpuArch, +{ + let mut vcpu = match vcpu { + Some(v) => v, + None => { + // If vcpu is None, it means this arch/hypervisor requires create_vcpu to be called from + // the vcpu thread. + match vm + .create_vcpu(kvm_vcpu_id) + .context("failed to create vcpu")? + .downcast::() + { + Ok(v) => *v, + Err(_) => panic!("VM created wrong type of VCPU"), + } + } + }; + + irq_chip + .add_vcpu(cpu_id, &vcpu) + .context("failed to add vcpu to irq chip")?; + + if !vcpu_affinity.is_empty() { + if let Err(e) = set_cpu_affinity(vcpu_affinity) { + error!("Failed to set CPU affinity: {}", e); + } + } + + Arch::configure_vcpu( + &vm, + vm.get_hypervisor(), + irq_chip, + &mut vcpu, + cpu_id, + vcpu_count, + has_bios, + no_smt, + host_cpu_topology, + ) + .context("failed to configure vcpu")?; + + if !enable_per_vm_core_scheduling { + // Do per-vCPU core scheduling by setting a unique cookie to each vCPU. + if let Err(e) = enable_core_scheduling() { + error!("Failed to enable core scheduling: {}", e); + } + } + + // Move vcpu thread to cgroup + if let Some(mut f) = vcpu_cgroup_tasks_file { + f.write_all(base::gettid().to_string().as_bytes()) + .context("failed to write vcpu tid to cgroup tasks")?; + } + + if run_rt { + const DEFAULT_VCPU_RT_LEVEL: u16 = 6; + if let Err(e) = set_rt_prio_limit(u64::from(DEFAULT_VCPU_RT_LEVEL)) + .and_then(|_| set_rt_round_robin(i32::from(DEFAULT_VCPU_RT_LEVEL))) + { + warn!("Failed to set vcpu to real time: {}", e); + } + } + + if use_hypervisor_signals { + let mut v = get_blocked_signals().context("failed to retrieve signal mask for vcpu")?; + v.retain(|&x| x != SIGRTMIN() + 0); + vcpu.set_signal_mask(&v) + .context("failed to set the signal mask for vcpu")?; + } + + let vcpu_run_handle = vcpu + .take_run_handle(Some(SIGRTMIN() + 0)) + .context("failed to set thread id for vcpu")?; + + Ok((vcpu, vcpu_run_handle)) +} + +#[cfg(all(target_arch = "x86_64", feature = "gdb"))] +fn handle_debug_msg( + cpu_id: usize, + vcpu: &V, + guest_mem: &GuestMemory, + d: VcpuDebug, + reply_tube: &mpsc::Sender, +) -> Result<()> +where + V: VcpuArch + 'static, +{ + match d { + VcpuDebug::ReadRegs => { + let msg = VcpuDebugStatusMessage { + cpu: cpu_id as usize, + msg: VcpuDebugStatus::RegValues( + Arch::debug_read_registers(vcpu as &V) + .context("failed to handle a gdb ReadRegs command")?, + ), + }; + reply_tube + .send(msg) + .context("failed to send a debug status to GDB thread") + } + VcpuDebug::WriteRegs(regs) => { + Arch::debug_write_registers(vcpu as &V, ®s) + .context("failed to handle a gdb WriteRegs command")?; + reply_tube + .send(VcpuDebugStatusMessage { + cpu: cpu_id as usize, + msg: VcpuDebugStatus::CommandComplete, + }) + .context("failed to send a debug status to GDB thread") + } + VcpuDebug::ReadMem(vaddr, len) => { + let msg = VcpuDebugStatusMessage { + cpu: cpu_id as usize, + msg: VcpuDebugStatus::MemoryRegion( + Arch::debug_read_memory(vcpu as &V, guest_mem, vaddr, len) + .unwrap_or(Vec::new()), + ), + }; + reply_tube + .send(msg) + .context("failed to send a debug status to GDB thread") + } + VcpuDebug::WriteMem(vaddr, buf) => { + Arch::debug_write_memory(vcpu as &V, guest_mem, vaddr, 
&buf) + .context("failed to handle a gdb WriteMem command")?; + reply_tube + .send(VcpuDebugStatusMessage { + cpu: cpu_id as usize, + msg: VcpuDebugStatus::CommandComplete, + }) + .context("failed to send a debug status to GDB thread") + } + VcpuDebug::EnableSinglestep => { + Arch::debug_enable_singlestep(vcpu as &V) + .context("failed to handle a gdb EnableSingleStep command")?; + reply_tube + .send(VcpuDebugStatusMessage { + cpu: cpu_id as usize, + msg: VcpuDebugStatus::CommandComplete, + }) + .context("failed to send a debug status to GDB thread") + } + VcpuDebug::SetHwBreakPoint(addrs) => { + Arch::debug_set_hw_breakpoints(vcpu as &V, &addrs) + .context("failed to handle a gdb SetHwBreakPoint command")?; + reply_tube + .send(VcpuDebugStatusMessage { + cpu: cpu_id as usize, + msg: VcpuDebugStatus::CommandComplete, + }) + .context("failed to send a debug status to GDB thread") + } + } +} + +fn vcpu_loop( + mut run_mode: VmRunMode, + cpu_id: usize, + vcpu: V, + vcpu_run_handle: VcpuRunHandle, + irq_chip: Box, + run_rt: bool, + delay_rt: bool, + io_bus: devices::Bus, + mmio_bus: devices::Bus, + requires_pvclock_ctrl: bool, + from_main_tube: mpsc::Receiver, + use_hypervisor_signals: bool, + #[cfg(all(target_arch = "x86_64", feature = "gdb"))] to_gdb_tube: Option< + mpsc::Sender, + >, + #[cfg(all(target_arch = "x86_64", feature = "gdb"))] guest_mem: GuestMemory, +) -> ExitState +where + V: VcpuArch + 'static, +{ + let mut interrupted_by_signal = false; + + loop { + // Start by checking for messages to process and the run state of the CPU. + // An extra check here for Running so there isn't a need to call recv unless a + // message is likely to be ready because a signal was sent. + if interrupted_by_signal || run_mode != VmRunMode::Running { + 'state_loop: loop { + // Tries to get a pending message without blocking first. + let msg = match from_main_tube.try_recv() { + Ok(m) => m, + Err(mpsc::TryRecvError::Empty) if run_mode == VmRunMode::Running => { + // If the VM is running and no message is pending, the state won't + // change. + break 'state_loop; + } + Err(mpsc::TryRecvError::Empty) => { + // If the VM is not running, wait until a message is ready. + match from_main_tube.recv() { + Ok(m) => m, + Err(mpsc::RecvError) => { + error!("Failed to read from main tube in vcpu"); + return ExitState::Crash; + } + } + } + Err(mpsc::TryRecvError::Disconnected) => { + error!("Failed to read from main tube in vcpu"); + return ExitState::Crash; + } + }; + + // Collect all pending messages. + let mut messages = vec![msg]; + messages.append(&mut from_main_tube.try_iter().collect()); + + for msg in messages { + match msg { + VcpuControl::RunState(new_mode) => { + run_mode = new_mode; + match run_mode { + VmRunMode::Running => break 'state_loop, + VmRunMode::Suspending => { + // On KVM implementations that use a paravirtualized + // clock (e.g. x86), a flag must be set to indicate to + // the guest kernel that a vCPU was suspended. The guest + // kernel will use this flag to prevent the soft lockup + // detection from triggering when this vCPU resumes, + // which could happen days later in realtime. 
+ if requires_pvclock_ctrl { + if let Err(e) = vcpu.pvclock_ctrl() { + error!( + "failed to tell hypervisor vcpu {} is suspending: {}", + cpu_id, e + ); + } + } + } + VmRunMode::Breakpoint => {} + VmRunMode::Exiting => return ExitState::Stop, + } + } + #[cfg(all(target_arch = "x86_64", feature = "gdb"))] + VcpuControl::Debug(d) => match &to_gdb_tube { + Some(ref ch) => { + if let Err(e) = handle_debug_msg(cpu_id, &vcpu, &guest_mem, d, ch) { + error!("Failed to handle gdb message: {}", e); + } + } + None => { + error!("VcpuControl::Debug received while GDB feature is disabled: {:?}", d); + } + }, + VcpuControl::MakeRT => { + if run_rt && delay_rt { + info!("Making vcpu {} RT\n", cpu_id); + const DEFAULT_VCPU_RT_LEVEL: u16 = 6; + if let Err(e) = set_rt_prio_limit(u64::from(DEFAULT_VCPU_RT_LEVEL)) + .and_then(|_| { + set_rt_round_robin(i32::from(DEFAULT_VCPU_RT_LEVEL)) + }) + { + warn!("Failed to set vcpu to real time: {}", e); + } + } + } + } + } + } + } + + interrupted_by_signal = false; + + // Vcpus may have run a HLT instruction, which puts them into a state other than + // VcpuRunState::Runnable. In that case, this call to wait_until_runnable blocks + // until either the irqchip receives an interrupt for this vcpu, or until the main + // thread kicks this vcpu as a result of some VmControl operation. In most IrqChip + // implementations HLT instructions do not make it to crosvm, and thus this is a + // no-op that always returns VcpuRunState::Runnable. + match irq_chip.wait_until_runnable(&vcpu) { + Ok(VcpuRunState::Runnable) => {} + Ok(VcpuRunState::Interrupted) => interrupted_by_signal = true, + Err(e) => error!( + "error waiting for vcpu {} to become runnable: {}", + cpu_id, e + ), + } + + if !interrupted_by_signal { + match vcpu.run(&vcpu_run_handle) { + Ok(VcpuExit::IoIn { port, mut size }) => { + let mut data = [0; 8]; + if size > data.len() { + error!( + "unsupported IoIn size of {} bytes at port {:#x}", + size, port + ); + size = data.len(); + } + io_bus.read(port as u64, &mut data[..size]); + if let Err(e) = vcpu.set_data(&data[..size]) { + error!( + "failed to set return data for IoIn at port {:#x}: {}", + port, e + ); + } + } + Ok(VcpuExit::IoOut { + port, + mut size, + data, + }) => { + if size > data.len() { + error!( + "unsupported IoOut size of {} bytes at port {:#x}", + size, port + ); + size = data.len(); + } + io_bus.write(port as u64, &data[..size]); + } + Ok(VcpuExit::MmioRead { address, size }) => { + let mut data = [0; 8]; + mmio_bus.read(address, &mut data[..size]); + // Setting data for mmio can not fail. 
+ let _ = vcpu.set_data(&data[..size]); + } + Ok(VcpuExit::MmioWrite { + address, + size, + data, + }) => { + mmio_bus.write(address, &data[..size]); + } + Ok(VcpuExit::IoapicEoi { vector }) => { + if let Err(e) = irq_chip.broadcast_eoi(vector) { + error!( + "failed to broadcast eoi {} on vcpu {}: {}", + vector, cpu_id, e + ); + } + } + Ok(VcpuExit::IrqWindowOpen) => {} + Ok(VcpuExit::Hlt) => irq_chip.halted(cpu_id), + Ok(VcpuExit::Shutdown) => return ExitState::Stop, + Ok(VcpuExit::FailEntry { + hardware_entry_failure_reason, + }) => { + error!("vcpu hw run failure: {:#x}", hardware_entry_failure_reason); + return ExitState::Crash; + } + Ok(VcpuExit::SystemEventShutdown) => { + info!("system shutdown event on vcpu {}", cpu_id); + return ExitState::Stop; + } + Ok(VcpuExit::SystemEventReset) => { + info!("system reset event"); + return ExitState::Reset; + } + Ok(VcpuExit::SystemEventCrash) => { + info!("system crash event on vcpu {}", cpu_id); + return ExitState::Stop; + } + #[rustfmt::skip] Ok(VcpuExit::Debug { .. }) => { + #[cfg(all(target_arch = "x86_64", feature = "gdb"))] + { + let msg = VcpuDebugStatusMessage { + cpu: cpu_id as usize, + msg: VcpuDebugStatus::HitBreakPoint, + }; + if let Some(ref ch) = to_gdb_tube { + if let Err(e) = ch.send(msg) { + error!("failed to notify breakpoint to GDB thread: {}", e); + return ExitState::Crash; + } + } + run_mode = VmRunMode::Breakpoint; + } + } + Ok(r) => warn!("unexpected vcpu exit: {:?}", r), + Err(e) => match e.errno() { + libc::EINTR => interrupted_by_signal = true, + libc::EAGAIN => {} + _ => { + error!("vcpu hit unknown error: {}", e); + return ExitState::Crash; + } + }, + } + } + + if interrupted_by_signal { + if use_hypervisor_signals { + // Try to clear the signal that we use to kick VCPU if it is pending before + // attempting to handle pause requests. + if let Err(e) = clear_signal(SIGRTMIN() + 0) { + error!("failed to clear pending signal: {}", e); + return ExitState::Crash; + } + } else { + vcpu.set_immediate_exit(false); + } + } + + if let Err(e) = irq_chip.inject_interrupts(&vcpu) { + error!("failed to inject interrupts for vcpu {}: {}", cpu_id, e); + } + } +} + +pub fn run_vcpu( + cpu_id: usize, + kvm_vcpu_id: usize, + vcpu: Option, + vm: impl VmArch + 'static, + mut irq_chip: Box, + vcpu_count: usize, + run_rt: bool, + vcpu_affinity: Vec, + delay_rt: bool, + no_smt: bool, + start_barrier: Arc, + has_bios: bool, + mut io_bus: devices::Bus, + mut mmio_bus: devices::Bus, + exit_evt: Event, + reset_evt: Event, + crash_evt: Event, + requires_pvclock_ctrl: bool, + from_main_tube: mpsc::Receiver, + use_hypervisor_signals: bool, + #[cfg(all(target_arch = "x86_64", feature = "gdb"))] to_gdb_tube: Option< + mpsc::Sender, + >, + enable_per_vm_core_scheduling: bool, + host_cpu_topology: bool, + vcpu_cgroup_tasks_file: Option, +) -> Result> +where + V: VcpuArch + 'static, +{ + thread::Builder::new() + .name(format!("crosvm_vcpu{}", cpu_id)) + .spawn(move || { + // The VCPU thread must trigger either `exit_evt` or `reset_event` in all paths. A + // `ScopedEvent`'s Drop implementation ensures that the `exit_evt` will be sent if + // anything happens before we get to writing the final event. 
+ let scoped_exit_evt = ScopedEvent::from(exit_evt); + + #[cfg(all(target_arch = "x86_64", feature = "gdb"))] + let guest_mem = vm.get_memory().clone(); + let runnable_vcpu = runnable_vcpu( + cpu_id, + kvm_vcpu_id, + vcpu, + vm, + irq_chip.as_mut(), + vcpu_count, + run_rt && !delay_rt, + vcpu_affinity, + no_smt, + has_bios, + use_hypervisor_signals, + enable_per_vm_core_scheduling, + host_cpu_topology, + vcpu_cgroup_tasks_file, + ); + + start_barrier.wait(); + + let (vcpu, vcpu_run_handle) = match runnable_vcpu { + Ok(v) => v, + Err(e) => { + error!("failed to start vcpu {}: {:#}", cpu_id, e); + return; + } + }; + + #[allow(unused_mut)] + let mut run_mode = VmRunMode::Running; + #[cfg(all(target_arch = "x86_64", feature = "gdb"))] + if to_gdb_tube.is_some() { + // Wait until a GDB client attaches + run_mode = VmRunMode::Breakpoint; + } + + mmio_bus.set_access_id(cpu_id); + io_bus.set_access_id(cpu_id); + + let exit_reason = vcpu_loop( + run_mode, + cpu_id, + vcpu, + vcpu_run_handle, + irq_chip, + run_rt, + delay_rt, + io_bus, + mmio_bus, + requires_pvclock_ctrl, + from_main_tube, + use_hypervisor_signals, + #[cfg(all(target_arch = "x86_64", feature = "gdb"))] + to_gdb_tube, + #[cfg(all(target_arch = "x86_64", feature = "gdb"))] + guest_mem, + ); + + let exit_evt = scoped_exit_evt.into(); + let final_event = match exit_reason { + ExitState::Stop => exit_evt, + ExitState::Reset => reset_evt, + ExitState::Crash => crash_evt, + }; + if let Err(e) = final_event.write(1) { + error!( + "failed to send final event {:?} on vcpu {}: {}", + final_event, cpu_id, e + ) + } + }) + .context("failed to spawn VCPU thread") +} + +/// Signals all running VCPUs to vmexit, sends VcpuControl message to each VCPU tube, and tells +/// `irq_chip` to stop blocking halted VCPUs. The channel message is set first because both the +/// signal and the irq_chip kick could cause the VCPU thread to continue through the VCPU run +/// loop. +pub fn kick_all_vcpus( + vcpu_handles: &[(JoinHandle<()>, mpsc::Sender)], + irq_chip: &dyn IrqChip, + message: VcpuControl, +) { + for (handle, tube) in vcpu_handles { + if let Err(e) = tube.send(message.clone()) { + error!("failed to send VcpuControl: {}", e); + } + let _ = handle.kill(SIGRTMIN() + 0); + } + irq_chip.kick_halted_vcpus(); +}
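
---

The doc comment on the new `kick_all_vcpus` spells out an ordering constraint: the `VcpuControl` message must already be queued before the signal (or the irq_chip kick) can pop a vCPU thread back to the top of its run loop. Below is a minimal sketch of that same send-then-signal shape using only `std` plus the `libc` crate; `Control` and `kick_all` are hypothetical stand-ins, not crosvm's API, and a real program would first install a no-op `SIGRTMIN` handler the way `setup_vcpu_signal_handler` above does.

```rust
use std::os::unix::thread::JoinHandleExt;
use std::sync::mpsc::Sender;
use std::thread::JoinHandle;

#[derive(Clone)]
enum Control {
    Exit,
}

/// Queue `msg` for every vCPU thread, *then* interrupt each one. If the signal
/// were sent first, a thread could re-check its channel before the message is
/// there and go straight back to running guest code.
fn kick_all(vcpus: &[(JoinHandle<()>, Sender<Control>)], msg: Control) {
    for (handle, tx) in vcpus {
        let _ = tx.send(msg.clone());
        // SAFETY: the pthread_t comes from a live JoinHandle and SIGRTMIN is a
        // valid signal number. A handler for it must already be installed, or
        // the default action (terminate the process) applies.
        unsafe {
            libc::pthread_kill(handle.as_pthread_t(), libc::SIGRTMIN());
        }
    }
}

fn main() {
    // No threads to kick in this sketch; the ordering in `kick_all` is the point.
    let vcpus: Vec<(JoinHandle<()>, Sender<Control>)> = Vec::new();
    kick_all(&vcpus, Control::Exit);
}
```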
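The trickiest control flow in the new `vcpu_loop` is the message handling at its top: never block on `from_main_tube` while the vCPU is `Running`, but do block once it is suspended so the thread does not spin. Here is a self-contained illustration of that `try_recv`-then-`recv` shape using only `std::sync::mpsc`; `RunMode` and `wait_for_run_state` are simplified stand-ins for `VmRunMode` and the real loop (no pvclock, GDB, or RT handling).

```rust
use std::sync::mpsc::{self, Receiver, TryRecvError};

#[derive(Clone, Copy, PartialEq, Debug)]
enum RunMode {
    Running,
    Suspending,
    Exiting,
}

/// Drain the control channel without stalling a running vCPU, but park on
/// `recv()` whenever the vCPU is not in the Running state. Returns the mode to
/// resume in, or None when the vCPU should exit.
fn wait_for_run_state(rx: &Receiver<RunMode>, mut mode: RunMode) -> Option<RunMode> {
    loop {
        let msg = match rx.try_recv() {
            Ok(m) => m,
            // Running and nothing pending: go back to executing guest code.
            Err(TryRecvError::Empty) if mode == RunMode::Running => return Some(mode),
            // Not running: block until the main thread says what to do next.
            Err(TryRecvError::Empty) => rx.recv().ok()?,
            Err(TryRecvError::Disconnected) => return None,
        };
        mode = msg;
        if mode == RunMode::Exiting {
            return None;
        }
        // Suspending (or any other paused state) loops and blocks again.
    }
}

fn main() {
    let (tx, rx) = mpsc::channel();
    tx.send(RunMode::Suspending).unwrap();
    tx.send(RunMode::Running).unwrap();
    // The "vCPU" starts suspended, is resumed, and keeps running until Exiting.
    assert_eq!(wait_for_run_state(&rx, RunMode::Suspending), Some(RunMode::Running));
    tx.send(RunMode::Exiting).unwrap();
    assert_eq!(wait_for_run_state(&rx, RunMode::Running), None);
}
```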
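`run_vcpu` wraps `exit_evt` in a `ScopedEvent` so that the invariant in its comment (the vCPU thread must signal either `exit_evt` or `reset_evt` on every path) holds even if the thread panics or bails out before reaching the final `match exit_reason`. The sketch below shows the same drop-guard idea with a `std` channel standing in for the `Event`; `ExitGuard` is hypothetical and only illustrates the pattern, not crosvm's `ScopedEvent` API.

```rust
use std::sync::mpsc::{self, Sender};

/// Sends a fallback "exit" message when dropped, unless `defuse()` is called
/// first. This plays the role `ScopedEvent` plays for `exit_evt` in `run_vcpu`:
/// a panic or early return in the thread still notifies the main loop.
struct ExitGuard {
    tx: Option<Sender<&'static str>>,
}

impl ExitGuard {
    fn new(tx: Sender<&'static str>) -> Self {
        ExitGuard { tx: Some(tx) }
    }

    /// Take the sender back so the caller can send the *real* final event.
    fn defuse(mut self) -> Sender<&'static str> {
        self.tx.take().unwrap()
    }
}

impl Drop for ExitGuard {
    fn drop(&mut self) {
        if let Some(tx) = self.tx.take() {
            let _ = tx.send("exit (fallback)");
        }
    }
}

fn main() {
    let (tx, rx) = mpsc::channel();

    // Happy path: the guard is defused and the thread reports its real outcome.
    let guard = ExitGuard::new(tx.clone());
    std::thread::spawn(move || {
        let tx = guard.defuse();
        tx.send("reset").unwrap();
    })
    .join()
    .unwrap();

    // Failure path: the thread panics before defusing; Drop sends the fallback.
    let guard = ExitGuard::new(tx);
    let _ = std::thread::spawn(move || {
        let _guard = guard;
        panic!("vcpu setup failed");
    })
    .join(); // join returns Err because the thread panicked

    assert_eq!(rx.recv().unwrap(), "reset");
    assert_eq!(rx.recv().unwrap(), "exit (fallback)");
}
```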