From 5acc0f52f5e5b8f6b8950f961addd3c631751cfe Mon Sep 17 00:00:00 2001 From: Anton Romanov Date: Fri, 28 Jan 2022 00:18:11 +0000 Subject: [PATCH] linux: split out linux mod into multiple At nearly 4k loc its harder to maintain. This change only moves some things around without changing any code. Input on symbol visibility is welcome - in reality it doesn't really matter if symb is pub/pub(super)/pub(crate) as mods themselves are private to linux mod. I plan to invest more into splitting things apart if possible (especially the main loop) but its a start TEST=./tools/presubmit BUG=n/a Change-Id: I2792dd0acdb5627f1c9b5d0fb998c976c6fe5e15 Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/crosvm/+/3422266 Reviewed-by: Daniel Verkamp Tested-by: kokoro Reviewed-by: Noah Gold Reviewed-by: Anton Romanov Commit-Queue: Anton Romanov Auto-Submit: Anton Romanov --- ARCHITECTURE.md | 18 +- docs/book/src/appendix/minijail.md | 4 +- src/linux/device_helpers.rs | 1147 ++++++++++++++ src/linux/gpu.rs | 331 +++++ src/linux/jail_helpers.rs | 188 +++ src/linux/mod.rs | 2234 +--------------------------- src/linux/vcpu.rs | 615 ++++++++ 7 files changed, 2318 insertions(+), 2219 deletions(-) create mode 100644 src/linux/device_helpers.rs create mode 100644 src/linux/gpu.rs create mode 100644 src/linux/jail_helpers.rs create mode 100644 src/linux/vcpu.rs diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 3129e6a432..91df4773ff 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -8,14 +8,14 @@ The principle characteristics of crosvm are: - Written in Rust for security and safety A typical session of crosvm starts in `main.rs` where command line parsing is done to build up a -`Config` structure. The `Config` is used by `run_config` in `linux.rs` to setup and execute a VM. -Broken down into rough steps: +`Config` structure. The `Config` is used by `run_config` in `linux/mod.rs` to setup and execute a +VM. Broken down into rough steps: 1. Load the linux kernel from an ELF file. 1. Create a handful of control sockets used by the virtual devices. 1. Invoke the architecture specific VM builder `Arch::build_vm` (located in `x86_64/src/lib.rs` or `aarch64/src/lib.rs`). -1. `Arch::build_vm` will itself invoke the provided `create_devices` function from `linux.rs` +1. `Arch::build_vm` will itself invoke the provided `create_devices` function from `linux/mod.rs` 1. `create_devices` creates every PCI device, including the virtio devices, that were configured in `Config`, along with matching [minijail] configs for each. 1. `Arch::generate_pci_root`, using a list of every PCI device with optional `Minijail`, will @@ -35,12 +35,12 @@ invalid. ## Sandboxing Policy -Every sandbox is made with [minijail] and starts with `create_base_minijail` in `linux.rs` which set -some very restrictive settings. Linux namespaces and seccomp filters are used extensively. Each -seccomp policy can be found under `seccomp/{arch}/{device}.policy` and should start by -`@include`-ing the `common_device.policy`. With the exception of architecture specific devices (such -as `Pl030` on ARM or `I8042` on x86_64), every device will need a different policy for each -supported architecture. +Every sandbox is made with [minijail] and starts with `create_base_minijail` in +`linux/jail_helpers.rs` which set some very restrictive settings. Linux namespaces and seccomp +filters are used extensively. Each seccomp policy can be found under +`seccomp/{arch}/{device}.policy` and should start by `@include`-ing the `common_device.policy`. 
With +the exception of architecture specific devices (such as `Pl030` on ARM or `I8042` on x86_64), every +device will need a different policy for each supported architecture. ## The VM Control Sockets diff --git a/docs/book/src/appendix/minijail.md b/docs/book/src/appendix/minijail.md index 0408261fe7..9a48906558 100644 --- a/docs/book/src/appendix/minijail.md +++ b/docs/book/src/appendix/minijail.md @@ -8,8 +8,8 @@ The fact that minijail was written, maintained, and continuously tested by a pro team more than makes up for its being written in an memory unsafe language. The exact configuration of the sandbox varies by device, but they are mostly alike. See -`create_base_minijail` from `linux.rs`. The set of security constraints explicitly used in crosvm -are: +`create_base_minijail` from `linux/jail_helpers.rs`. The set of security constraints explicitly used +in crosvm are: - PID Namespace - Runs as init diff --git a/src/linux/device_helpers.rs b/src/linux/device_helpers.rs new file mode 100644 index 0000000000..7c80e02135 --- /dev/null +++ b/src/linux/device_helpers.rs @@ -0,0 +1,1147 @@ +// Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +use std::collections::BTreeMap; +use std::convert::TryFrom; +use std::fs::{File, OpenOptions}; +use std::net::Ipv4Addr; +use std::os::unix::net::UnixListener; +use std::os::unix::{io::FromRawFd, net::UnixStream, prelude::OpenOptionsExt}; +use std::path::{Path, PathBuf}; +use std::str; +use std::sync::Arc; + +use anyhow::{anyhow, bail, Context, Result}; +use base::*; +use devices::serial_device::SerialParameters; +use devices::vfio::{VfioCommonSetup, VfioCommonTrait}; +#[cfg(feature = "audio_cras")] +use devices::virtio::snd::cras_backend::Parameters as CrasSndParameters; +use devices::virtio::vhost::user::proxy::VirtioVhostUser; +#[cfg(feature = "audio")] +use devices::virtio::vhost::user::vmm::Snd as VhostUserSnd; +use devices::virtio::vhost::user::vmm::{ + Block as VhostUserBlock, Console as VhostUserConsole, Fs as VhostUserFs, + Mac80211Hwsim as VhostUserMac80211Hwsim, Net as VhostUserNet, Vsock as VhostUserVsock, + Wl as VhostUserWl, +}; +#[cfg(any(feature = "video-decoder", feature = "video-encoder"))] +use devices::virtio::VideoBackendType; +use devices::virtio::{self, Console, VirtioDevice}; +use devices::IommuDevType; +use devices::{self, PciDevice, VfioContainer, VfioDevice, VfioPciDevice, VfioPlatformDevice}; +use hypervisor::Vm; +use minijail::{self, Minijail}; +use net_util::{MacAddress, Tap}; +use resources::{Alloc, MmioType, SystemAllocator}; +use sync::Mutex; +use vm_memory::GuestAddress; + +use crate::{ + Config, DiskOption, TouchDeviceOption, VhostUserFsOption, VhostUserOption, VhostUserWlOption, + VhostVsockDeviceParameter, +}; +use arch::{self, VirtioDeviceStub}; + +use super::jail_helpers::*; + +pub enum TaggedControlTube { + Fs(Tube), + Vm(Tube), + VmMemory(Tube), + VmIrq(Tube), + VmMsync(Tube), +} + +impl AsRef for TaggedControlTube { + fn as_ref(&self) -> &Tube { + use self::TaggedControlTube::*; + match &self { + Fs(tube) | Vm(tube) | VmMemory(tube) | VmIrq(tube) | VmMsync(tube) => tube, + } + } +} + +impl AsRawDescriptor for TaggedControlTube { + fn as_raw_descriptor(&self) -> RawDescriptor { + self.as_ref().as_raw_descriptor() + } +} + +pub trait IntoUnixStream { + fn into_unix_stream(self) -> Result; +} + +impl<'a> IntoUnixStream for &'a Path { + fn into_unix_stream(self) -> Result { + if let Some(fd) = 
safe_descriptor_from_path(self).context("failed to open event device")? { + Ok(fd.into()) + } else { + UnixStream::connect(self).context("failed to open event device") + } + } +} + +impl<'a> IntoUnixStream for &'a PathBuf { + fn into_unix_stream(self) -> Result { + self.as_path().into_unix_stream() + } +} + +impl IntoUnixStream for UnixStream { + fn into_unix_stream(self) -> Result { + Ok(self) + } +} + +pub type DeviceResult = Result; + +pub fn create_block_device( + cfg: &Config, + disk: &DiskOption, + disk_device_tube: Tube, +) -> DeviceResult { + let raw_image: File = open_file(&disk.path, disk.read_only, disk.o_direct) + .with_context(|| format!("failed to load disk image {}", disk.path.display()))?; + // Lock the disk image to prevent other crosvm instances from using it. + let lock_op = if disk.read_only { + FlockOperation::LockShared + } else { + FlockOperation::LockExclusive + }; + flock(&raw_image, lock_op, true).context("failed to lock disk image")?; + + info!("Trying to attach block device: {}", disk.path.display()); + let dev = if disk::async_ok(&raw_image).context("failed to check disk async_ok")? { + let async_file = disk::create_async_disk_file(raw_image) + .context("failed to create async virtual disk")?; + Box::new( + virtio::BlockAsync::new( + virtio::base_features(cfg.protected_vm), + async_file, + disk.read_only, + disk.sparse, + disk.block_size, + disk.id, + Some(disk_device_tube), + ) + .context("failed to create block device")?, + ) as Box + } else { + let disk_file = disk::create_disk_file(raw_image, disk::MAX_NESTING_DEPTH) + .context("failed to create virtual disk")?; + Box::new( + virtio::Block::new( + virtio::base_features(cfg.protected_vm), + disk_file, + disk.read_only, + disk.sparse, + disk.block_size, + disk.id, + Some(disk_device_tube), + ) + .context("failed to create block device")?, + ) as Box + }; + + Ok(VirtioDeviceStub { + dev, + jail: simple_jail(cfg, "block_device")?, + }) +} + +pub fn create_vhost_user_block_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { + let dev = VhostUserBlock::new(virtio::base_features(cfg.protected_vm), &opt.socket) + .context("failed to set up vhost-user block device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. + jail: None, + }) +} + +pub fn create_vhost_user_console_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { + let dev = VhostUserConsole::new(virtio::base_features(cfg.protected_vm), &opt.socket) + .context("failed to set up vhost-user console device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. + jail: None, + }) +} + +pub fn create_vhost_user_fs_device(cfg: &Config, option: &VhostUserFsOption) -> DeviceResult { + let dev = VhostUserFs::new( + virtio::base_features(cfg.protected_vm), + &option.socket, + &option.tag, + ) + .context("failed to set up vhost-user fs device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. 
+ jail: None, + }) +} + +pub fn create_vhost_user_mac80211_hwsim_device( + cfg: &Config, + opt: &VhostUserOption, +) -> DeviceResult { + let dev = VhostUserMac80211Hwsim::new(virtio::base_features(cfg.protected_vm), &opt.socket) + .context("failed to set up vhost-user mac80211_hwsim device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. + jail: None, + }) +} + +#[cfg(feature = "audio")] +pub fn create_vhost_user_snd_device(cfg: &Config, option: &VhostUserOption) -> DeviceResult { + let dev = VhostUserSnd::new(virtio::base_features(cfg.protected_vm), &option.socket) + .context("failed to set up vhost-user snd device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. + jail: None, + }) +} + +pub fn create_vvu_proxy_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { + let listener = UnixListener::bind(&opt.socket).map_err(|e| { + error!("failed to bind listener for vvu proxy device: {}", e); + e + })?; + + let dev = VirtioVhostUser::new(virtio::base_features(cfg.protected_vm), listener) + .context("failed to create VVU proxy device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "vvu_proxy_device")?, + }) +} + +pub fn create_rng_device(cfg: &Config) -> DeviceResult { + let dev = virtio::Rng::new(virtio::base_features(cfg.protected_vm)) + .context("failed to set up rng")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "rng_device")?, + }) +} + +#[cfg(feature = "audio_cras")] +pub fn create_cras_snd_device(cfg: &Config, cras_snd: CrasSndParameters) -> DeviceResult { + let dev = virtio::snd::cras_backend::VirtioSndCras::new( + virtio::base_features(cfg.protected_vm), + cras_snd, + ) + .context("failed to create cras sound device")?; + + let jail = match simple_jail(&cfg, "cras_snd_device")? { + Some(mut jail) => { + // Create a tmpfs in the device's root directory for cras_snd_device. + // The size is 20*1024, or 20 KB. + jail.mount_with_data( + Path::new("none"), + Path::new("/"), + "tmpfs", + (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, + "size=20480", + )?; + + let run_cras_path = Path::new("/run/cras"); + jail.mount_bind(run_cras_path, run_cras_path, true)?; + + add_current_user_to_jail(&mut jail)?; + + Some(jail) + } + None => None, + }; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail, + }) +} + +#[cfg(feature = "tpm")] +pub fn create_tpm_device(cfg: &Config) -> DeviceResult { + use std::ffi::CString; + use std::fs; + use std::process; + + let tpm_storage: PathBuf; + let mut tpm_jail = simple_jail(cfg, "tpm_device")?; + + match &mut tpm_jail { + Some(jail) => { + // Create a tmpfs in the device's root directory for tpm + // simulator storage. The size is 20*1024, or 20 KB. 
+ jail.mount_with_data( + Path::new("none"), + Path::new("/"), + "tmpfs", + (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, + "size=20480", + )?; + + let crosvm_ids = add_current_user_to_jail(jail)?; + + let pid = process::id(); + let tpm_pid_dir = format!("/run/vm/tpm.{}", pid); + tpm_storage = Path::new(&tpm_pid_dir).to_owned(); + fs::create_dir_all(&tpm_storage).with_context(|| { + format!("failed to create tpm storage dir {}", tpm_storage.display()) + })?; + let tpm_pid_dir_c = CString::new(tpm_pid_dir).expect("no nul bytes"); + chown(&tpm_pid_dir_c, crosvm_ids.uid, crosvm_ids.gid) + .context("failed to chown tpm storage")?; + + jail.mount_bind(&tpm_storage, &tpm_storage, true)?; + } + None => { + // Path used inside cros_sdk which does not have /run/vm. + tpm_storage = Path::new("/tmp/tpm-simulator").to_owned(); + } + } + + let dev = virtio::Tpm::new(tpm_storage); + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: tpm_jail, + }) +} + +pub fn create_single_touch_device( + cfg: &Config, + single_touch_spec: &TouchDeviceOption, + idx: u32, +) -> DeviceResult { + let socket = single_touch_spec + .get_path() + .into_unix_stream() + .map_err(|e| { + error!("failed configuring virtio single touch: {:?}", e); + e + })?; + + let (width, height) = single_touch_spec.get_size(); + let dev = virtio::new_single_touch( + idx, + socket, + width, + height, + virtio::base_features(cfg.protected_vm), + ) + .context("failed to set up input device")?; + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "input_device")?, + }) +} + +pub fn create_multi_touch_device( + cfg: &Config, + multi_touch_spec: &TouchDeviceOption, + idx: u32, +) -> DeviceResult { + let socket = multi_touch_spec + .get_path() + .into_unix_stream() + .map_err(|e| { + error!("failed configuring virtio multi touch: {:?}", e); + e + })?; + + let (width, height) = multi_touch_spec.get_size(); + let dev = virtio::new_multi_touch( + idx, + socket, + width, + height, + virtio::base_features(cfg.protected_vm), + ) + .context("failed to set up input device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "input_device")?, + }) +} + +pub fn create_trackpad_device( + cfg: &Config, + trackpad_spec: &TouchDeviceOption, + idx: u32, +) -> DeviceResult { + let socket = trackpad_spec.get_path().into_unix_stream().map_err(|e| { + error!("failed configuring virtio trackpad: {:#}", e); + e + })?; + + let (width, height) = trackpad_spec.get_size(); + let dev = virtio::new_trackpad( + idx, + socket, + width, + height, + virtio::base_features(cfg.protected_vm), + ) + .context("failed to set up input device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "input_device")?, + }) +} + +pub fn create_mouse_device( + cfg: &Config, + mouse_socket: T, + idx: u32, +) -> DeviceResult { + let socket = mouse_socket.into_unix_stream().map_err(|e| { + error!("failed configuring virtio mouse: {:#}", e); + e + })?; + + let dev = virtio::new_mouse(idx, socket, virtio::base_features(cfg.protected_vm)) + .context("failed to set up input device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "input_device")?, + }) +} + +pub fn create_keyboard_device( + cfg: &Config, + keyboard_socket: T, + idx: u32, +) -> DeviceResult { + let socket = keyboard_socket.into_unix_stream().map_err(|e| { + error!("failed configuring virtio keyboard: {:#}", e); + e + })?; + + let dev = virtio::new_keyboard(idx, socket, virtio::base_features(cfg.protected_vm)) + .context("failed 
to set up input device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "input_device")?, + }) +} + +pub fn create_switches_device( + cfg: &Config, + switches_socket: T, + idx: u32, +) -> DeviceResult { + let socket = switches_socket.into_unix_stream().map_err(|e| { + error!("failed configuring virtio switches: {:#}", e); + e + })?; + + let dev = virtio::new_switches(idx, socket, virtio::base_features(cfg.protected_vm)) + .context("failed to set up input device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "input_device")?, + }) +} + +pub fn create_vinput_device(cfg: &Config, dev_path: &Path) -> DeviceResult { + let dev_file = OpenOptions::new() + .read(true) + .write(true) + .open(dev_path) + .with_context(|| format!("failed to open vinput device {}", dev_path.display()))?; + + let dev = virtio::new_evdev(dev_file, virtio::base_features(cfg.protected_vm)) + .context("failed to set up input device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "input_device")?, + }) +} + +pub fn create_balloon_device( + cfg: &Config, + tube: Tube, + inflate_tube: Option, + init_balloon_size: u64, +) -> DeviceResult { + let dev = virtio::Balloon::new( + virtio::base_features(cfg.protected_vm), + tube, + inflate_tube, + init_balloon_size, + ) + .context("failed to create balloon")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "balloon_device")?, + }) +} + +/// Generic method for creating a network device. `create_device` is a closure that takes the virtio +/// features and number of queue pairs as parameters, and is responsible for creating the device +/// itself. +pub fn create_net_device(cfg: &Config, policy: &str, create_device: F) -> DeviceResult +where + F: Fn(u64, u16) -> Result, + T: VirtioDevice + 'static, +{ + let mut vq_pairs = cfg.net_vq_pairs.unwrap_or(1); + let vcpu_count = cfg.vcpu_count.unwrap_or(1); + if vcpu_count < vq_pairs as usize { + warn!("the number of net vq pairs must not exceed the vcpu count, falling back to single queue mode"); + vq_pairs = 1; + } + let features = virtio::base_features(cfg.protected_vm); + + let dev = create_device(features, vq_pairs)?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev) as Box, + jail: simple_jail(cfg, policy)?, + }) +} + +/// Returns a network device created from a new TAP interface configured with `host_ip`, `netmask`, +/// and `mac_address`. +pub fn create_net_device_from_config( + cfg: &Config, + host_ip: Ipv4Addr, + netmask: Ipv4Addr, + mac_address: MacAddress, +) -> DeviceResult { + let policy = if cfg.vhost_net { + "vhost_net_device" + } else { + "net_device" + }; + + if cfg.vhost_net { + create_net_device(cfg, policy, |features, _vq_pairs| { + virtio::vhost::Net::>::new( + &cfg.vhost_net_device_path, + features, + host_ip, + netmask, + mac_address, + ) + .context("failed to set up vhost networking") + }) + } else { + create_net_device(cfg, policy, |features, vq_pairs| { + virtio::Net::::new(features, host_ip, netmask, mac_address, vq_pairs) + .context("failed to create virtio network device") + }) + } +} + +/// Returns a network device from a file descriptor to a configured TAP interface. +pub fn create_tap_net_device_from_fd(cfg: &Config, tap_fd: RawDescriptor) -> DeviceResult { + create_net_device(cfg, "net_device", |features, vq_pairs| { + // Safe because we ensure that we get a unique handle to the fd. 
+ let tap = unsafe { + Tap::from_raw_descriptor( + validate_raw_descriptor(tap_fd).context("failed to validate tap descriptor")?, + ) + .context("failed to create tap device")? + }; + + virtio::Net::from(features, tap, vq_pairs).context("failed to create tap net device") + }) +} + +/// Returns a network device created by opening the persistent, configured TAP interface `tap_name`. +pub fn create_tap_net_device_from_name(cfg: &Config, tap_name: &[u8]) -> DeviceResult { + create_net_device(cfg, "net_device", |features, vq_pairs| { + virtio::Net::::new_from_name(features, tap_name, vq_pairs) + .context("failed to create configured virtio network device") + }) +} + +pub fn create_vhost_user_net_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { + let dev = VhostUserNet::new(virtio::base_features(cfg.protected_vm), &opt.socket) + .context("failed to set up vhost-user net device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. + jail: None, + }) +} + +pub fn create_vhost_user_vsock_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { + let dev = VhostUserVsock::new(virtio::base_features(cfg.protected_vm), &opt.socket) + .context("failed to set up vhost-user vsock device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. + jail: None, + }) +} + +pub fn create_vhost_user_wl_device(cfg: &Config, opt: &VhostUserWlOption) -> DeviceResult { + // The crosvm wl device expects us to connect the tube before it will accept a vhost-user + // connection. + let dev = VhostUserWl::new(virtio::base_features(cfg.protected_vm), &opt.socket) + .context("failed to set up vhost-user wl device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. + jail: None, + }) +} + +pub fn create_wayland_device( + cfg: &Config, + control_tube: Tube, + resource_bridge: Option, +) -> DeviceResult { + let wayland_socket_dirs = cfg + .wayland_socket_paths + .iter() + .map(|(_name, path)| path.parent()) + .collect::>>() + .ok_or_else(|| anyhow!("wayland socket path has no parent or file name"))?; + + let features = virtio::base_features(cfg.protected_vm); + let dev = virtio::Wl::new( + features, + cfg.wayland_socket_paths.clone(), + control_tube, + resource_bridge, + ) + .context("failed to create wayland device")?; + + let jail = match simple_jail(cfg, "wl_device")? { + Some(mut jail) => { + // Create a tmpfs in the device's root directory so that we can bind mount the wayland + // socket directory into it. The size=67108864 is size=64*1024*1024 or size=64MB. + jail.mount_with_data( + Path::new("none"), + Path::new("/"), + "tmpfs", + (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, + "size=67108864", + )?; + + // Bind mount the wayland socket's directory into jail's root. This is necessary since + // each new wayland context must open() the socket. If the wayland socket is ever + // destroyed and remade in the same host directory, new connections will be possible + // without restarting the wayland device. 
+ for dir in &wayland_socket_dirs { + jail.mount_bind(dir, dir, true)?; + } + add_current_user_to_jail(&mut jail)?; + + Some(jail) + } + None => None, + }; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail, + }) +} + +#[cfg(any(feature = "video-decoder", feature = "video-encoder"))] +pub fn create_video_device( + backend: VideoBackendType, + cfg: &Config, + typ: devices::virtio::VideoDeviceType, + resource_bridge: Tube, +) -> DeviceResult { + let jail = match simple_jail(cfg, "video_device")? { + Some(mut jail) => { + match typ { + #[cfg(feature = "video-decoder")] + devices::virtio::VideoDeviceType::Decoder => add_current_user_to_jail(&mut jail)?, + #[cfg(feature = "video-encoder")] + devices::virtio::VideoDeviceType::Encoder => add_current_user_to_jail(&mut jail)?, + }; + + // Create a tmpfs in the device's root directory so that we can bind mount files. + jail.mount_with_data( + Path::new("none"), + Path::new("/"), + "tmpfs", + (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, + "size=67108864", + )?; + + #[cfg(feature = "libvda")] + // Render node for libvda. + if backend == VideoBackendType::Libvda || backend == VideoBackendType::LibvdaVd { + // follow the implementation at: + // https://chromium.googlesource.com/chromiumos/platform/minigbm/+/c06cc9cccb3cf3c7f9d2aec706c27c34cd6162a0/cros_gralloc/cros_gralloc_driver.cc#90 + const DRM_NUM_NODES: u32 = 63; + const DRM_RENDER_NODE_START: u32 = 128; + for offset in 0..DRM_NUM_NODES { + let path_str = format!("/dev/dri/renderD{}", DRM_RENDER_NODE_START + offset); + let dev_dri_path = Path::new(&path_str); + if !dev_dri_path.exists() { + break; + } + jail.mount_bind(dev_dri_path, dev_dri_path, false)?; + } + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + // Device nodes used by libdrm through minigbm in libvda on AMD devices. + let sys_dev_char_path = Path::new("/sys/dev/char"); + jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?; + let sys_devices_path = Path::new("/sys/devices"); + jail.mount_bind(sys_devices_path, sys_devices_path, false)?; + + // Required for loading dri libraries loaded by minigbm on AMD devices. + jail_mount_bind_if_exists(&mut jail, &["/usr/lib64"])?; + } + + // Device nodes required by libchrome which establishes Mojo connection in libvda. 
+ let dev_urandom_path = Path::new("/dev/urandom"); + jail.mount_bind(dev_urandom_path, dev_urandom_path, false)?; + let system_bus_socket_path = Path::new("/run/dbus/system_bus_socket"); + jail.mount_bind(system_bus_socket_path, system_bus_socket_path, true)?; + + Some(jail) + } + None => None, + }; + + Ok(VirtioDeviceStub { + dev: Box::new(devices::virtio::VideoDevice::new( + virtio::base_features(cfg.protected_vm), + typ, + backend, + Some(resource_bridge), + )), + jail, + }) +} + +#[cfg(any(feature = "video-decoder", feature = "video-encoder"))] +pub fn register_video_device( + backend: VideoBackendType, + devs: &mut Vec, + video_tube: Tube, + cfg: &Config, + typ: devices::virtio::VideoDeviceType, +) -> Result<()> { + devs.push(create_video_device(backend, cfg, typ, video_tube)?); + Ok(()) +} + +pub fn create_vhost_vsock_device(cfg: &Config, cid: u64) -> DeviceResult { + let features = virtio::base_features(cfg.protected_vm); + + let device_file = match cfg + .vhost_vsock_device + .as_ref() + .unwrap_or(&VhostVsockDeviceParameter::default()) + { + VhostVsockDeviceParameter::Fd(fd) => { + let fd = validate_raw_descriptor(*fd) + .context("failed to validate fd for virtual socker device")?; + // Safe because the `fd` is actually owned by this process and + // we have a unique handle to it. + unsafe { File::from_raw_fd(fd) } + } + VhostVsockDeviceParameter::Path(path) => OpenOptions::new() + .read(true) + .write(true) + .custom_flags(libc::O_CLOEXEC | libc::O_NONBLOCK) + .open(path) + .context("failed to open virtual socket device")?, + }; + + let dev = virtio::vhost::Vsock::new(device_file, features, cid) + .context("failed to set up virtual socket device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "vhost_vsock_device")?, + }) +} + +pub fn create_fs_device( + cfg: &Config, + uid_map: &str, + gid_map: &str, + src: &Path, + tag: &str, + fs_cfg: virtio::fs::passthrough::Config, + device_tube: Tube, +) -> DeviceResult { + let max_open_files = + base::get_max_open_files().context("failed to get max number of open files")?; + let j = if cfg.sandbox { + let seccomp_policy = cfg.seccomp_policy_dir.join("fs_device"); + let config = SandboxConfig { + limit_caps: false, + uid_map: Some(uid_map), + gid_map: Some(gid_map), + log_failures: cfg.seccomp_log_failures, + seccomp_policy: &seccomp_policy, + // We want bind mounts from the parent namespaces to propagate into the fs device's + // namespace. + remount_mode: Some(libc::MS_SLAVE), + }; + create_base_minijail(src, Some(max_open_files), Some(&config))? + } else { + create_base_minijail(src, Some(max_open_files), None)? + }; + + let features = virtio::base_features(cfg.protected_vm); + // TODO(chirantan): Use more than one worker once the kernel driver has been fixed to not panic + // when num_queues > 1. 
+ let dev = virtio::fs::Fs::new(features, tag, 1, fs_cfg, device_tube) + .context("failed to create fs device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: Some(j), + }) +} + +pub fn create_9p_device( + cfg: &Config, + uid_map: &str, + gid_map: &str, + src: &Path, + tag: &str, + mut p9_cfg: p9::Config, +) -> DeviceResult { + let max_open_files = + base::get_max_open_files().context("failed to get max number of open files")?; + let (jail, root) = if cfg.sandbox { + let seccomp_policy = cfg.seccomp_policy_dir.join("9p_device"); + let config = SandboxConfig { + limit_caps: false, + uid_map: Some(uid_map), + gid_map: Some(gid_map), + log_failures: cfg.seccomp_log_failures, + seccomp_policy: &seccomp_policy, + // We want bind mounts from the parent namespaces to propagate into the 9p server's + // namespace. + remount_mode: Some(libc::MS_SLAVE), + }; + + let jail = create_base_minijail(src, Some(max_open_files), Some(&config))?; + + // The shared directory becomes the root of the device's file system. + let root = Path::new("/"); + (Some(jail), root) + } else { + // There's no mount namespace so we tell the server to treat the source directory as the + // root. + (None, src) + }; + + let features = virtio::base_features(cfg.protected_vm); + p9_cfg.root = root.into(); + let dev = virtio::P9::new(features, tag, p9_cfg).context("failed to create 9p device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail, + }) +} + +pub fn create_pmem_device( + cfg: &Config, + vm: &mut impl Vm, + resources: &mut SystemAllocator, + disk: &DiskOption, + index: usize, + pmem_device_tube: Tube, +) -> DeviceResult { + let fd = open_file(&disk.path, disk.read_only, false /*O_DIRECT*/) + .with_context(|| format!("failed to load disk image {}", disk.path.display()))?; + + let (disk_size, arena_size) = { + let metadata = std::fs::metadata(&disk.path).with_context(|| { + format!("failed to get disk image {} metadata", disk.path.display()) + })?; + let disk_len = metadata.len(); + // Linux requires pmem region sizes to be 2 MiB aligned. Linux will fill any partial page + // at the end of an mmap'd file and won't write back beyond the actual file length, but if + // we just align the size of the file to 2 MiB then access beyond the last page of the + // mapped file will generate SIGBUS. So use a memory mapping arena that will provide + // padding up to 2 MiB. + let alignment = 2 * 1024 * 1024; + let align_adjust = if disk_len % alignment != 0 { + alignment - (disk_len % alignment) + } else { + 0 + }; + ( + disk_len, + disk_len + .checked_add(align_adjust) + .ok_or_else(|| anyhow!("pmem device image too big"))?, + ) + }; + + let protection = { + if disk.read_only { + Protection::read() + } else { + Protection::read_write() + } + }; + + let arena = { + // Conversion from u64 to usize may fail on 32bit system. + let arena_size = usize::try_from(arena_size).context("pmem device image too big")?; + let disk_size = usize::try_from(disk_size).context("pmem device image too big")?; + + let mut arena = + MemoryMappingArena::new(arena_size).context("failed to reserve pmem memory")?; + arena + .add_fd_offset_protection(0, disk_size, &fd, 0, protection) + .context("failed to reserve pmem memory")?; + + // If the disk is not a multiple of the page size, the OS will fill the remaining part + // of the page with zeroes. However, the anonymous mapping added below must start on a + // page boundary, so round up the size before calculating the offset of the anon region. 
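+        // Illustrative, hypothetical numbers: with 4 KiB pages, a disk image of
+        // 5 MiB + 100 bytes gives arena_size = 6 MiB (next 2 MiB boundary) and,
+        // after the rounding below, disk_size = 5 MiB + 4 KiB, so the anonymous
+        // padding region covers the remaining 6 MiB - (5 MiB + 4 KiB).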
+ let disk_size = round_up_to_page_size(disk_size); + + if arena_size > disk_size { + // Add an anonymous region with the same protection as the disk mapping if the arena + // size was aligned. + arena + .add_anon_protection(disk_size, arena_size - disk_size, protection) + .context("failed to reserve pmem padding")?; + } + arena + }; + + let mapping_address = resources + .mmio_allocator(MmioType::High) + .reverse_allocate_with_align( + arena_size, + Alloc::PmemDevice(index), + format!("pmem_disk_image_{}", index), + // Linux kernel requires pmem namespaces to be 128 MiB aligned. + 128 * 1024 * 1024, /* 128 MiB */ + ) + .context("failed to allocate memory for pmem device")?; + + let slot = vm + .add_memory_region( + GuestAddress(mapping_address), + Box::new(arena), + /* read_only = */ disk.read_only, + /* log_dirty_pages = */ false, + ) + .context("failed to add pmem device memory")?; + + let dev = virtio::Pmem::new( + virtio::base_features(cfg.protected_vm), + fd, + GuestAddress(mapping_address), + slot, + arena_size, + Some(pmem_device_tube), + ) + .context("failed to create pmem device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev) as Box, + jail: simple_jail(cfg, "pmem_device")?, + }) +} + +pub fn create_iommu_device( + cfg: &Config, + phys_max_addr: u64, + endpoints: BTreeMap>>, +) -> DeviceResult { + let dev = virtio::Iommu::new( + virtio::base_features(cfg.protected_vm), + endpoints, + phys_max_addr, + ) + .context("failed to create IOMMU device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "iommu_device")?, + }) +} + +pub fn create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult { + let mut keep_rds = Vec::new(); + let evt = Event::new().context("failed to create event")?; + let dev = param + .create_serial_device::(cfg.protected_vm, &evt, &mut keep_rds) + .context("failed to create console device")?; + + let jail = match simple_jail(cfg, "serial")? { + Some(mut jail) => { + // Create a tmpfs in the device's root directory so that we can bind mount the + // log socket directory into it. + // The size=67108864 is size=64*1024*1024 or size=64MB. + jail.mount_with_data( + Path::new("none"), + Path::new("/"), + "tmpfs", + (libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_NOSUID) as usize, + "size=67108864", + )?; + add_current_user_to_jail(&mut jail)?; + let res = param.add_bind_mounts(&mut jail); + if res.is_err() { + error!("failed to add bind mounts for console device"); + } + Some(jail) + } + None => None, + }; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail, // TODO(dverkamp): use a separate policy for console? 
+ }) +} + +#[cfg(feature = "audio")] +pub fn create_sound_device(path: &Path, cfg: &Config) -> DeviceResult { + let dev = virtio::new_sound(path, virtio::base_features(cfg.protected_vm)) + .context("failed to create sound device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: simple_jail(cfg, "vios_audio_device")?, + }) +} + +pub fn create_vfio_device( + cfg: &Config, + vm: &impl Vm, + resources: &mut SystemAllocator, + control_tubes: &mut Vec, + vfio_path: &Path, + bus_num: Option, + iommu_endpoints: &mut BTreeMap>>, + coiommu_endpoints: Option<&mut Vec>, + iommu_dev: IommuDevType, +) -> DeviceResult<(Box, Option)> { + let vfio_container = VfioCommonSetup::vfio_get_container(iommu_dev, Some(vfio_path)) + .context("failed to get vfio container")?; + + // create MSI, MSI-X, and Mem request sockets for each vfio device + let (vfio_host_tube_msi, vfio_device_tube_msi) = + Tube::pair().context("failed to create tube")?; + control_tubes.push(TaggedControlTube::VmIrq(vfio_host_tube_msi)); + + let (vfio_host_tube_msix, vfio_device_tube_msix) = + Tube::pair().context("failed to create tube")?; + control_tubes.push(TaggedControlTube::VmIrq(vfio_host_tube_msix)); + + let (vfio_host_tube_mem, vfio_device_tube_mem) = + Tube::pair().context("failed to create tube")?; + control_tubes.push(TaggedControlTube::VmMemory(vfio_host_tube_mem)); + + let hotplug = bus_num.is_some(); + let vfio_device_tube_vm = if hotplug { + let (vfio_host_tube_vm, device_tube_vm) = Tube::pair().context("failed to create tube")?; + control_tubes.push(TaggedControlTube::Vm(vfio_host_tube_vm)); + Some(device_tube_vm) + } else { + None + }; + + let vfio_device = VfioDevice::new_passthrough( + &vfio_path, + vm, + vfio_container.clone(), + iommu_dev != IommuDevType::NoIommu, + ) + .context("failed to create vfio device")?; + let mut vfio_pci_device = Box::new(VfioPciDevice::new( + vfio_device, + bus_num, + vfio_device_tube_msi, + vfio_device_tube_msix, + vfio_device_tube_mem, + vfio_device_tube_vm, + )); + // early reservation for pass-through PCI devices. 
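+    // The address reserved here also serves as the endpoint ID recorded in the
+    // virtio-iommu / coIOMMU maps below, so it has to be known before the device
+    // is handed back to the caller.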
+ let endpoint_addr = vfio_pci_device + .allocate_address(resources) + .context("failed to allocate resources early for vfio pci dev")?; + + match iommu_dev { + IommuDevType::NoIommu => {} + IommuDevType::VirtioIommu => { + iommu_endpoints.insert(endpoint_addr.to_u32(), vfio_container); + } + IommuDevType::CoIommu => { + if let Some(endpoints) = coiommu_endpoints { + endpoints.push(endpoint_addr.to_u32() as u16); + } else { + bail!("Missed coiommu_endpoints vector to store the endpoint addr"); + } + } + } + + if hotplug { + Ok((vfio_pci_device, None)) + } else { + Ok((vfio_pci_device, simple_jail(cfg, "vfio_device")?)) + } +} + +pub fn create_vfio_platform_device( + cfg: &Config, + vm: &impl Vm, + _resources: &mut SystemAllocator, + control_tubes: &mut Vec, + vfio_path: &Path, + _endpoints: &mut BTreeMap>>, + iommu_dev: IommuDevType, +) -> DeviceResult<(VfioPlatformDevice, Option)> { + let vfio_container = VfioCommonSetup::vfio_get_container(iommu_dev, Some(vfio_path)) + .context("Failed to create vfio device")?; + + let (vfio_host_tube_mem, vfio_device_tube_mem) = + Tube::pair().context("failed to create tube")?; + control_tubes.push(TaggedControlTube::VmMemory(vfio_host_tube_mem)); + + let vfio_device = VfioDevice::new_passthrough( + &vfio_path, + vm, + vfio_container, + iommu_dev != IommuDevType::NoIommu, + ) + .context("Failed to create vfio device")?; + let vfio_plat_dev = VfioPlatformDevice::new(vfio_device, vfio_device_tube_mem); + + Ok((vfio_plat_dev, simple_jail(cfg, "vfio_platform_device")?)) +} diff --git a/src/linux/gpu.rs b/src/linux/gpu.rs new file mode 100644 index 0000000000..38cfad2db2 --- /dev/null +++ b/src/linux/gpu.rs @@ -0,0 +1,331 @@ +// Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +//! GPU related things +//! depends on "gpu" feature +use std::collections::HashSet; +use std::env; + +use devices::virtio::vhost::user::vmm::Gpu as VhostUserGpu; +use devices::virtio::GpuRenderServerParameters; + +use super::*; + +pub fn create_vhost_user_gpu_device( + cfg: &Config, + opt: &VhostUserOption, + host_tube: Tube, + device_tube: Tube, +) -> DeviceResult { + // The crosvm gpu device expects us to connect the tube before it will accept a vhost-user + // connection. + let dev = VhostUserGpu::new( + virtio::base_features(cfg.protected_vm), + &opt.socket, + host_tube, + device_tube, + ) + .context("failed to set up vhost-user gpu device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + // no sandbox here because virtqueue handling is exported to a different process. + jail: None, + }) +} + +pub fn gpu_jail(cfg: &Config, policy: &str) -> Result> { + match simple_jail(cfg, policy)? { + Some(mut jail) => { + // Create a tmpfs in the device's root directory so that we can bind mount the + // dri directory into it. The size=67108864 is size=64*1024*1024 or size=64MB. + jail.mount_with_data( + Path::new("none"), + Path::new("/"), + "tmpfs", + (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, + "size=67108864", + )?; + + // Device nodes required for DRM. 
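+            // (Assumed rationale, not stated in the original: libdrm typically
+            // resolves render nodes via the /sys/dev/char/<major>:<minor> symlinks
+            // into /sys/devices, hence both trees are bind mounted read-only below.)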
+ let sys_dev_char_path = Path::new("/sys/dev/char"); + jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?; + let sys_devices_path = Path::new("/sys/devices"); + jail.mount_bind(sys_devices_path, sys_devices_path, false)?; + + let drm_dri_path = Path::new("/dev/dri"); + if drm_dri_path.exists() { + jail.mount_bind(drm_dri_path, drm_dri_path, false)?; + } + + // If the ARM specific devices exist on the host, bind mount them in. + let mali0_path = Path::new("/dev/mali0"); + if mali0_path.exists() { + jail.mount_bind(mali0_path, mali0_path, true)?; + } + + let pvr_sync_path = Path::new("/dev/pvr_sync"); + if pvr_sync_path.exists() { + jail.mount_bind(pvr_sync_path, pvr_sync_path, true)?; + } + + // If the udmabuf driver exists on the host, bind mount it in. + let udmabuf_path = Path::new("/dev/udmabuf"); + if udmabuf_path.exists() { + jail.mount_bind(udmabuf_path, udmabuf_path, true)?; + } + + // Libraries that are required when mesa drivers are dynamically loaded. + jail_mount_bind_if_exists( + &mut jail, + &[ + "/usr/lib", + "/usr/lib64", + "/lib", + "/lib64", + "/usr/share/drirc.d", + "/usr/share/glvnd", + "/usr/share/vulkan", + ], + )?; + + // pvr driver requires read access to /proc/self/task/*/comm. + let proc_path = Path::new("/proc"); + jail.mount( + proc_path, + proc_path, + "proc", + (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RDONLY) as usize, + )?; + + // To enable perfetto tracing, we need to give access to the perfetto service IPC + // endpoints. + let perfetto_path = Path::new("/run/perfetto"); + if perfetto_path.exists() { + jail.mount_bind(perfetto_path, perfetto_path, true)?; + } + + Ok(Some(jail)) + } + None => Ok(None), + } +} + +pub struct GpuCacheInfo<'a> { + directory: Option<&'a str>, + environment: Vec<(&'a str, &'a str)>, +} + +pub fn get_gpu_cache_info<'a>( + cache_dir: Option<&'a String>, + cache_size: Option<&'a String>, + sandbox: bool, +) -> GpuCacheInfo<'a> { + let mut dir = None; + let mut env = Vec::new(); + + if let Some(cache_dir) = cache_dir { + if !Path::new(cache_dir).exists() { + warn!("shader caching dir {} does not exist", cache_dir); + env.push(("MESA_GLSL_CACHE_DISABLE", "true")); + } else if cfg!(any(target_arch = "arm", target_arch = "aarch64")) && sandbox { + warn!("shader caching not yet supported on ARM with sandbox enabled"); + env.push(("MESA_GLSL_CACHE_DISABLE", "true")); + } else { + dir = Some(cache_dir.as_str()); + + env.push(("MESA_GLSL_CACHE_DISABLE", "false")); + env.push(("MESA_GLSL_CACHE_DIR", cache_dir.as_str())); + if let Some(cache_size) = cache_size { + env.push(("MESA_GLSL_CACHE_MAX_SIZE", cache_size.as_str())); + } + } + } + + GpuCacheInfo { + directory: dir, + environment: env, + } +} + +pub fn create_gpu_device( + cfg: &Config, + exit_evt: &Event, + gpu_device_tube: Tube, + resource_bridges: Vec, + wayland_socket_path: Option<&PathBuf>, + x_display: Option, + render_server_fd: Option, + event_devices: Vec, + map_request: Arc>>, +) -> DeviceResult { + let mut display_backends = vec![ + virtio::DisplayBackend::X(x_display), + virtio::DisplayBackend::Stub, + ]; + + let wayland_socket_dirs = cfg + .wayland_socket_paths + .iter() + .map(|(_name, path)| path.parent()) + .collect::>>() + .ok_or_else(|| anyhow!("wayland socket path has no parent or file name"))?; + + if let Some(socket_path) = wayland_socket_path { + display_backends.insert( + 0, + virtio::DisplayBackend::Wayland(Some(socket_path.to_owned())), + ); + } + + let dev = virtio::Gpu::new( + exit_evt.try_clone().context("failed to clone 
event")?, + Some(gpu_device_tube), + resource_bridges, + display_backends, + cfg.gpu_parameters.as_ref().unwrap(), + render_server_fd, + event_devices, + map_request, + cfg.sandbox, + virtio::base_features(cfg.protected_vm), + cfg.wayland_socket_paths.clone(), + ); + + let jail = match gpu_jail(cfg, "gpu_device")? { + Some(mut jail) => { + // Prepare GPU shader disk cache directory. + let (cache_dir, cache_size) = cfg + .gpu_parameters + .as_ref() + .map(|params| (params.cache_path.as_ref(), params.cache_size.as_ref())) + .unwrap(); + let cache_info = get_gpu_cache_info(cache_dir, cache_size, cfg.sandbox); + + if let Some(dir) = cache_info.directory { + jail.mount_bind(dir, dir, true)?; + } + for (key, val) in cache_info.environment { + env::set_var(key, val); + } + + // Bind mount the wayland socket's directory into jail's root. This is necessary since + // each new wayland context must open() the socket. If the wayland socket is ever + // destroyed and remade in the same host directory, new connections will be possible + // without restarting the wayland device. + for dir in &wayland_socket_dirs { + jail.mount_bind(dir, dir, true)?; + } + + add_current_user_to_jail(&mut jail)?; + + Some(jail) + } + None => None, + }; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail, + }) +} + +pub fn get_gpu_render_server_environment(cache_info: &GpuCacheInfo) -> Result> { + let mut env = Vec::new(); + + let mut cache_env_keys = HashSet::with_capacity(cache_info.environment.len()); + for (key, val) in cache_info.environment.iter() { + env.push(format!("{}={}", key, val)); + cache_env_keys.insert(*key); + } + + for (key_os, val_os) in env::vars_os() { + // minijail should accept OsStr rather than str... + let into_string_err = |_| anyhow!("invalid environment key/val"); + let key = key_os.into_string().map_err(into_string_err)?; + let val = val_os.into_string().map_err(into_string_err)?; + + if !cache_env_keys.contains(key.as_str()) { + env.push(format!("{}={}", key, val)); + } + } + + Ok(env) +} + +pub struct ScopedMinijail(pub Minijail); + +impl Drop for ScopedMinijail { + fn drop(&mut self) { + let _ = self.0.kill(); + } +} + +pub fn start_gpu_render_server( + cfg: &Config, + render_server_parameters: &GpuRenderServerParameters, +) -> Result<(Minijail, SafeDescriptor)> { + let (server_socket, client_socket) = + UnixSeqpacket::pair().context("failed to create render server socket")?; + + let mut env = None; + let jail = match gpu_jail(cfg, "gpu_render_server")? { + Some(mut jail) => { + let cache_info = get_gpu_cache_info( + render_server_parameters.cache_path.as_ref(), + render_server_parameters.cache_size.as_ref(), + cfg.sandbox, + ); + + if let Some(dir) = cache_info.directory { + jail.mount_bind(dir, dir, true)?; + } + + if !cache_info.environment.is_empty() { + env = Some(get_gpu_render_server_environment(&cache_info)?); + } + + // bind mount /dev/log for syslog + let log_path = Path::new("/dev/log"); + if log_path.exists() { + jail.mount_bind(log_path, log_path, true)?; + } + + // Run as root in the jail to keep capabilities after execve, which is needed for + // mounting to work. All capabilities will be dropped afterwards. 
+ add_current_user_as_root_to_jail(&mut jail)?; + + jail + } + None => Minijail::new().context("failed to create jail")?, + }; + + let inheritable_fds = [ + server_socket.as_raw_descriptor(), + libc::STDOUT_FILENO, + libc::STDERR_FILENO, + ]; + + let cmd = &render_server_parameters.path; + let cmd_str = cmd + .to_str() + .ok_or_else(|| anyhow!("invalid render server path"))?; + let fd_str = server_socket.as_raw_descriptor().to_string(); + let args = [cmd_str, "--socket-fd", &fd_str]; + + let mut envp: Option> = None; + if let Some(ref env) = env { + envp = Some(env.iter().map(AsRef::as_ref).collect()); + } + + jail.run_command(minijail::Command::new_for_path( + cmd, + &inheritable_fds, + &args, + envp.as_deref(), + )?) + .context("failed to start gpu render server")?; + + Ok((jail, SafeDescriptor::from(client_socket))) +} diff --git a/src/linux/jail_helpers.rs b/src/linux/jail_helpers.rs new file mode 100644 index 0000000000..f74fcd297f --- /dev/null +++ b/src/linux/jail_helpers.rs @@ -0,0 +1,188 @@ +// Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +use std::path::{Path, PathBuf}; +use std::str; + +use libc::{self, c_ulong, gid_t, uid_t}; + +use anyhow::{bail, Context, Result}; +use base::*; +use minijail::{self, Minijail}; + +use crate::Config; + +pub(super) struct SandboxConfig<'a> { + pub(super) limit_caps: bool, + pub(super) log_failures: bool, + pub(super) seccomp_policy: &'a Path, + pub(super) uid_map: Option<&'a str>, + pub(super) gid_map: Option<&'a str>, + pub(super) remount_mode: Option, +} + +pub(super) fn create_base_minijail( + root: &Path, + r_limit: Option, + config: Option<&SandboxConfig>, +) -> Result { + // All child jails run in a new user namespace without any users mapped, + // they run as nobody unless otherwise configured. + let mut j = Minijail::new().context("failed to jail device")?; + + if let Some(config) = config { + j.namespace_pids(); + j.namespace_user(); + j.namespace_user_disable_setgroups(); + if config.limit_caps { + // Don't need any capabilities. + j.use_caps(0); + } + if let Some(uid_map) = config.uid_map { + j.uidmap(uid_map).context("error setting UID map")?; + } + if let Some(gid_map) = config.gid_map { + j.gidmap(gid_map).context("error setting GID map")?; + } + // Run in a new mount namespace. + j.namespace_vfs(); + + // Run in an empty network namespace. + j.namespace_net(); + + // Don't allow the device to gain new privileges. + j.no_new_privs(); + + // By default we'll prioritize using the pre-compiled .bpf over the .policy + // file (the .bpf is expected to be compiled using "trap" as the failure + // behavior instead of the default "kill" behavior). + // Refer to the code comment for the "seccomp-log-failures" + // command-line parameter for an explanation about why the |log_failures| + // flag forces the use of .policy files (and the build-time alternative to + // this run-time flag). + let bpf_policy_file = config.seccomp_policy.with_extension("bpf"); + if bpf_policy_file.exists() && !config.log_failures { + j.parse_seccomp_program(&bpf_policy_file) + .context("failed to parse precompiled seccomp policy")?; + } else { + // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, + // which will correctly kill the entire device process if a worker + // thread commits a seccomp violation. 
+ j.set_seccomp_filter_tsync(); + if config.log_failures { + j.log_seccomp_filter_failures(); + } + j.parse_seccomp_filters(&config.seccomp_policy.with_extension("policy")) + .context("failed to parse seccomp policy")?; + } + j.use_seccomp_filter(); + // Don't do init setup. + j.run_as_init(); + // Set up requested remount mode instead of default MS_PRIVATE. + if let Some(mode) = config.remount_mode { + j.set_remount_mode(mode); + } + } + + // Only pivot_root if we are not re-using the current root directory. + if root != Path::new("/") { + // It's safe to call `namespace_vfs` multiple times. + j.namespace_vfs(); + j.enter_pivot_root(root) + .context("failed to pivot root device")?; + } + + // Most devices don't need to open many fds. + let limit = if let Some(r) = r_limit { r } else { 1024u64 }; + j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit) + .context("error setting max open files")?; + + Ok(j) +} + +pub(super) fn simple_jail(cfg: &Config, policy: &str) -> Result> { + if cfg.sandbox { + let pivot_root: &str = option_env!("DEFAULT_PIVOT_ROOT").unwrap_or("/var/empty"); + // A directory for a jailed device's pivot root. + let root_path = Path::new(pivot_root); + if !root_path.exists() { + bail!("{} doesn't exist, can't jail devices", pivot_root); + } + let policy_path: PathBuf = cfg.seccomp_policy_dir.join(policy); + let config = SandboxConfig { + limit_caps: true, + log_failures: cfg.seccomp_log_failures, + seccomp_policy: &policy_path, + uid_map: None, + gid_map: None, + remount_mode: None, + }; + Ok(Some(create_base_minijail(root_path, None, Some(&config))?)) + } else { + Ok(None) + } +} + +/// Mirror-mount all the directories in `dirs` into `jail` on a best-effort basis. +/// +/// This function will not return an error if any of the directories in `dirs` is missing. +#[cfg(any(feature = "gpu", feature = "video-decoder", feature = "video-encoder"))] +pub(super) fn jail_mount_bind_if_exists>( + jail: &mut Minijail, + dirs: &[P], +) -> Result<()> { + for dir in dirs { + let dir_path = Path::new(dir); + if dir_path.exists() { + jail.mount_bind(dir_path, dir_path, false)?; + } + } + + Ok(()) +} + +#[derive(Copy, Clone)] +#[cfg_attr(not(feature = "tpm"), allow(dead_code))] +pub(super) struct Ids { + pub(super) uid: uid_t, + pub(super) gid: gid_t, +} + +pub(super) fn add_current_user_as_root_to_jail(jail: &mut Minijail) -> Result { + let crosvm_uid = geteuid(); + let crosvm_gid = getegid(); + jail.uidmap(&format!("0 {0} 1", crosvm_uid)) + .context("error setting UID map")?; + jail.gidmap(&format!("0 {0} 1", crosvm_gid)) + .context("error setting GID map")?; + + Ok(Ids { + uid: crosvm_uid, + gid: crosvm_gid, + }) +} + +/// Set the uid/gid for the jailed process and give a basic id map. This is +/// required for bind mounts to work. +pub(super) fn add_current_user_to_jail(jail: &mut Minijail) -> Result { + let crosvm_uid = geteuid(); + let crosvm_gid = getegid(); + + jail.uidmap(&format!("{0} {0} 1", crosvm_uid)) + .context("error setting UID map")?; + jail.gidmap(&format!("{0} {0} 1", crosvm_gid)) + .context("error setting GID map")?; + + if crosvm_uid != 0 { + jail.change_uid(crosvm_uid); + } + if crosvm_gid != 0 { + jail.change_gid(crosvm_gid); + } + + Ok(Ids { + uid: crosvm_uid, + gid: crosvm_gid, + }) +} diff --git a/src/linux/mod.rs b/src/linux/mod.rs index 70506e53df..fdda4a42f1 100644 --- a/src/linux/mod.rs +++ b/src/linux/mod.rs @@ -3,70 +3,46 @@ // found in the LICENSE file. 
use std::cmp::{max, Reverse}; -use std::collections::{BTreeMap, HashSet}; -use std::convert::{TryFrom, TryInto}; -#[cfg(feature = "gpu")] -use std::env; +use std::collections::BTreeMap; +use std::convert::TryInto; use std::fs::{File, OpenOptions}; use std::io::prelude::*; use std::io::stdin; use std::iter; use std::mem; -use std::net::Ipv4Addr; -use std::os::unix::net::UnixListener; -use std::os::unix::{io::FromRawFd, net::UnixStream, prelude::OpenOptionsExt}; +use std::os::unix::{net::UnixStream, prelude::OpenOptionsExt}; use std::path::{Path, PathBuf}; use std::str; use std::sync::{mpsc, Arc, Barrier}; use std::time::Duration; use std::process; +#[cfg(all(target_arch = "x86_64", feature = "gdb"))] use std::thread; -use std::thread::JoinHandle; -use libc::{self, c_int, c_ulong, gid_t, uid_t}; +use libc; use acpi_tables::sdt::SDT; use anyhow::{anyhow, bail, Context, Result}; use base::net::{UnixSeqpacket, UnixSeqpacketListener, UnlinkUnixSeqpacketListener}; use base::*; -use devices::serial_device::{SerialHardware, SerialParameters}; +use devices::serial_device::SerialHardware; use devices::vfio::{VfioCommonSetup, VfioCommonTrait}; -#[cfg(feature = "audio_cras")] -use devices::virtio::snd::cras_backend::Parameters as CrasSndParameters; -use devices::virtio::vhost::user::proxy::VirtioVhostUser; -#[cfg(feature = "audio")] -use devices::virtio::vhost::user::vmm::Snd as VhostUserSnd; -use devices::virtio::vhost::user::vmm::{ - Block as VhostUserBlock, Console as VhostUserConsole, Fs as VhostUserFs, - Mac80211Hwsim as VhostUserMac80211Hwsim, Net as VhostUserNet, Vsock as VhostUserVsock, - Wl as VhostUserWl, -}; -#[cfg(any(feature = "video-decoder", feature = "video-encoder"))] -use devices::virtio::VideoBackendType; -use devices::virtio::{self, Console, VirtioDevice}; -#[cfg(feature = "gpu")] -use devices::virtio::{ - gpu::{GpuRenderServerParameters, DEFAULT_DISPLAY_HEIGHT, DEFAULT_DISPLAY_WIDTH}, - vhost::user::vmm::Gpu as VhostUserGpu, - EventDevice, -}; +use devices::virtio::{self, EventDevice}; #[cfg(feature = "audio")] use devices::Ac97Dev; use devices::{ - self, BusDeviceObj, HostHotPlugKey, HotPlugBus, IrqChip, IrqEventIndex, KvmKernelIrqChip, - PciAddress, PciBridge, PciDevice, PcieRootPort, StubPciDevice, VcpuRunState, VfioContainer, - VfioDevice, VfioPciDevice, VfioPlatformDevice, VirtioPciDevice, + self, BusDeviceObj, HostHotPlugKey, HotPlugBus, IrqEventIndex, KvmKernelIrqChip, PciAddress, + PciBridge, PciDevice, PcieRootPort, StubPciDevice, VfioContainer, VirtioPciDevice, }; use devices::{CoIommuDev, IommuDevType}; #[cfg(feature = "usb")] use devices::{HostBackendDeviceProvider, XhciController}; use hypervisor::kvm::{Kvm, KvmVcpu, KvmVm}; -use hypervisor::{HypervisorCap, ProtectionType, Vcpu, VcpuExit, VcpuRunHandle, Vm, VmCap}; +use hypervisor::{HypervisorCap, ProtectionType, Vm, VmCap}; use minijail::{self, Minijail}; -use net_util::{MacAddress, Tap}; -use resources::{Alloc, MmioType, SystemAllocator}; +use resources::{Alloc, SystemAllocator}; use rutabaga_gfx::RutabagaGralloc; use sync::Mutex; use vm_control::*; @@ -74,10 +50,7 @@ use vm_memory::{GuestAddress, GuestMemory, MemoryPolicy}; #[cfg(all(target_arch = "x86_64", feature = "gdb"))] use crate::gdb::{gdb_thread, GdbStub}; -use crate::{ - Config, DiskOption, Executable, SharedDir, SharedDirKind, TouchDeviceOption, VfioType, - VhostUserFsOption, VhostUserOption, VhostUserWlOption, VhostVsockDeviceParameter, -}; +use crate::{Config, Executable, SharedDir, SharedDirKind, VfioType, VhostUserOption}; use arch::{ self, LinuxArch, 
RunnableLinuxVm, VcpuAffinity, VirtioDeviceStub, VmComponents, VmImage, }; @@ -95,1418 +68,16 @@ use { x86_64::X8664arch as Arch, }; -enum TaggedControlTube { - Fs(Tube), - Vm(Tube), - VmMemory(Tube), - VmIrq(Tube), - VmMsync(Tube), -} - -impl AsRef for TaggedControlTube { - fn as_ref(&self) -> &Tube { - use self::TaggedControlTube::*; - match &self { - Fs(tube) | Vm(tube) | VmMemory(tube) | VmIrq(tube) | VmMsync(tube) => tube, - } - } -} - -impl AsRawDescriptor for TaggedControlTube { - fn as_raw_descriptor(&self) -> RawDescriptor { - self.as_ref().as_raw_descriptor() - } -} - -struct SandboxConfig<'a> { - limit_caps: bool, - log_failures: bool, - seccomp_policy: &'a Path, - uid_map: Option<&'a str>, - gid_map: Option<&'a str>, - remount_mode: Option, -} - -fn create_base_minijail( - root: &Path, - r_limit: Option, - config: Option<&SandboxConfig>, -) -> Result { - // All child jails run in a new user namespace without any users mapped, - // they run as nobody unless otherwise configured. - let mut j = Minijail::new().context("failed to jail device")?; - - if let Some(config) = config { - j.namespace_pids(); - j.namespace_user(); - j.namespace_user_disable_setgroups(); - if config.limit_caps { - // Don't need any capabilities. - j.use_caps(0); - } - if let Some(uid_map) = config.uid_map { - j.uidmap(uid_map).context("error setting UID map")?; - } - if let Some(gid_map) = config.gid_map { - j.gidmap(gid_map).context("error setting GID map")?; - } - // Run in a new mount namespace. - j.namespace_vfs(); - - // Run in an empty network namespace. - j.namespace_net(); - - // Don't allow the device to gain new privileges. - j.no_new_privs(); - - // By default we'll prioritize using the pre-compiled .bpf over the .policy - // file (the .bpf is expected to be compiled using "trap" as the failure - // behavior instead of the default "kill" behavior). - // Refer to the code comment for the "seccomp-log-failures" - // command-line parameter for an explanation about why the |log_failures| - // flag forces the use of .policy files (and the build-time alternative to - // this run-time flag). - let bpf_policy_file = config.seccomp_policy.with_extension("bpf"); - if bpf_policy_file.exists() && !config.log_failures { - j.parse_seccomp_program(&bpf_policy_file) - .context("failed to parse precompiled seccomp policy")?; - } else { - // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, - // which will correctly kill the entire device process if a worker - // thread commits a seccomp violation. - j.set_seccomp_filter_tsync(); - if config.log_failures { - j.log_seccomp_filter_failures(); - } - j.parse_seccomp_filters(&config.seccomp_policy.with_extension("policy")) - .context("failed to parse seccomp policy")?; - } - j.use_seccomp_filter(); - // Don't do init setup. - j.run_as_init(); - // Set up requested remount mode instead of default MS_PRIVATE. - if let Some(mode) = config.remount_mode { - j.set_remount_mode(mode); - } - } - - // Only pivot_root if we are not re-using the current root directory. - if root != Path::new("/") { - // It's safe to call `namespace_vfs` multiple times. - j.namespace_vfs(); - j.enter_pivot_root(root) - .context("failed to pivot root device")?; - } - - // Most devices don't need to open many fds. 
- let limit = if let Some(r) = r_limit { r } else { 1024u64 }; - j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit) - .context("error setting max open files")?; - - Ok(j) -} - -fn simple_jail(cfg: &Config, policy: &str) -> Result> { - if cfg.sandbox { - let pivot_root: &str = option_env!("DEFAULT_PIVOT_ROOT").unwrap_or("/var/empty"); - // A directory for a jailed device's pivot root. - let root_path = Path::new(pivot_root); - if !root_path.exists() { - bail!("{} doesn't exist, can't jail devices", pivot_root); - } - let policy_path: PathBuf = cfg.seccomp_policy_dir.join(policy); - let config = SandboxConfig { - limit_caps: true, - log_failures: cfg.seccomp_log_failures, - seccomp_policy: &policy_path, - uid_map: None, - gid_map: None, - remount_mode: None, - }; - Ok(Some(create_base_minijail(root_path, None, Some(&config))?)) - } else { - Ok(None) - } -} - -type DeviceResult = Result; - -fn create_block_device(cfg: &Config, disk: &DiskOption, disk_device_tube: Tube) -> DeviceResult { - let raw_image: File = open_file(&disk.path, disk.read_only, disk.o_direct) - .with_context(|| format!("failed to load disk image {}", disk.path.display()))?; - // Lock the disk image to prevent other crosvm instances from using it. - let lock_op = if disk.read_only { - FlockOperation::LockShared - } else { - FlockOperation::LockExclusive - }; - flock(&raw_image, lock_op, true).context("failed to lock disk image")?; - - info!("Trying to attach block device: {}", disk.path.display()); - let dev = if disk::async_ok(&raw_image).context("failed to check disk async_ok")? { - let async_file = disk::create_async_disk_file(raw_image) - .context("failed to create async virtual disk")?; - Box::new( - virtio::BlockAsync::new( - virtio::base_features(cfg.protected_vm), - async_file, - disk.read_only, - disk.sparse, - disk.block_size, - disk.id, - Some(disk_device_tube), - ) - .context("failed to create block device")?, - ) as Box - } else { - let disk_file = disk::create_disk_file(raw_image, disk::MAX_NESTING_DEPTH) - .context("failed to create virtual disk")?; - Box::new( - virtio::Block::new( - virtio::base_features(cfg.protected_vm), - disk_file, - disk.read_only, - disk.sparse, - disk.block_size, - disk.id, - Some(disk_device_tube), - ) - .context("failed to create block device")?, - ) as Box - }; - - Ok(VirtioDeviceStub { - dev, - jail: simple_jail(cfg, "block_device")?, - }) -} - -fn create_vhost_user_block_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { - let dev = VhostUserBlock::new(virtio::base_features(cfg.protected_vm), &opt.socket) - .context("failed to set up vhost-user block device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. - jail: None, - }) -} - -fn create_vhost_user_console_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { - let dev = VhostUserConsole::new(virtio::base_features(cfg.protected_vm), &opt.socket) - .context("failed to set up vhost-user console device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. 
- jail: None, - }) -} - -fn create_vhost_user_fs_device(cfg: &Config, option: &VhostUserFsOption) -> DeviceResult { - let dev = VhostUserFs::new( - virtio::base_features(cfg.protected_vm), - &option.socket, - &option.tag, - ) - .context("failed to set up vhost-user fs device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. - jail: None, - }) -} - -fn create_vhost_user_mac80211_hwsim_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { - let dev = VhostUserMac80211Hwsim::new(virtio::base_features(cfg.protected_vm), &opt.socket) - .context("failed to set up vhost-user mac80211_hwsim device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. - jail: None, - }) -} - -#[cfg(feature = "audio")] -fn create_vhost_user_snd_device(cfg: &Config, option: &VhostUserOption) -> DeviceResult { - let dev = VhostUserSnd::new(virtio::base_features(cfg.protected_vm), &option.socket) - .context("failed to set up vhost-user snd device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. - jail: None, - }) -} - -fn create_vvu_proxy_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { - let listener = UnixListener::bind(&opt.socket).map_err(|e| { - error!("failed to bind listener for vvu proxy device: {}", e); - e - })?; - - let dev = VirtioVhostUser::new(virtio::base_features(cfg.protected_vm), listener) - .context("failed to create VVU proxy device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "vvu_proxy_device")?, - }) -} - -fn create_rng_device(cfg: &Config) -> DeviceResult { - let dev = virtio::Rng::new(virtio::base_features(cfg.protected_vm)) - .context("failed to set up rng")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "rng_device")?, - }) -} - -#[cfg(feature = "audio_cras")] -fn create_cras_snd_device(cfg: &Config, cras_snd: CrasSndParameters) -> DeviceResult { - let dev = virtio::snd::cras_backend::VirtioSndCras::new( - virtio::base_features(cfg.protected_vm), - cras_snd, - ) - .context("failed to create cras sound device")?; - - let jail = match simple_jail(&cfg, "cras_snd_device")? { - Some(mut jail) => { - // Create a tmpfs in the device's root directory for cras_snd_device. - // The size is 20*1024, or 20 KB. - jail.mount_with_data( - Path::new("none"), - Path::new("/"), - "tmpfs", - (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, - "size=20480", - )?; - - let run_cras_path = Path::new("/run/cras"); - jail.mount_bind(run_cras_path, run_cras_path, true)?; - - add_current_user_to_jail(&mut jail)?; - - Some(jail) - } - None => None, - }; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail, - }) -} - -#[cfg(feature = "tpm")] -fn create_tpm_device(cfg: &Config) -> DeviceResult { - use std::ffi::CString; - use std::fs; - - let tpm_storage: PathBuf; - let mut tpm_jail = simple_jail(cfg, "tpm_device")?; - - match &mut tpm_jail { - Some(jail) => { - // Create a tmpfs in the device's root directory for tpm - // simulator storage. The size is 20*1024, or 20 KB. 
- jail.mount_with_data( - Path::new("none"), - Path::new("/"), - "tmpfs", - (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, - "size=20480", - )?; - - let crosvm_ids = add_current_user_to_jail(jail)?; - - let pid = process::id(); - let tpm_pid_dir = format!("/run/vm/tpm.{}", pid); - tpm_storage = Path::new(&tpm_pid_dir).to_owned(); - fs::create_dir_all(&tpm_storage).with_context(|| { - format!("failed to create tpm storage dir {}", tpm_storage.display()) - })?; - let tpm_pid_dir_c = CString::new(tpm_pid_dir).expect("no nul bytes"); - chown(&tpm_pid_dir_c, crosvm_ids.uid, crosvm_ids.gid) - .context("failed to chown tpm storage")?; - - jail.mount_bind(&tpm_storage, &tpm_storage, true)?; - } - None => { - // Path used inside cros_sdk which does not have /run/vm. - tpm_storage = Path::new("/tmp/tpm-simulator").to_owned(); - } - } - - let dev = virtio::Tpm::new(tpm_storage); - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: tpm_jail, - }) -} - -fn create_single_touch_device( - cfg: &Config, - single_touch_spec: &TouchDeviceOption, - idx: u32, -) -> DeviceResult { - let socket = single_touch_spec - .get_path() - .into_unix_stream() - .map_err(|e| { - error!("failed configuring virtio single touch: {:?}", e); - e - })?; - - let (width, height) = single_touch_spec.get_size(); - let dev = virtio::new_single_touch( - idx, - socket, - width, - height, - virtio::base_features(cfg.protected_vm), - ) - .context("failed to set up input device")?; - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "input_device")?, - }) -} - -fn create_multi_touch_device( - cfg: &Config, - multi_touch_spec: &TouchDeviceOption, - idx: u32, -) -> DeviceResult { - let socket = multi_touch_spec - .get_path() - .into_unix_stream() - .map_err(|e| { - error!("failed configuring virtio multi touch: {:?}", e); - e - })?; - - let (width, height) = multi_touch_spec.get_size(); - let dev = virtio::new_multi_touch( - idx, - socket, - width, - height, - virtio::base_features(cfg.protected_vm), - ) - .context("failed to set up input device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "input_device")?, - }) -} - -fn create_trackpad_device( - cfg: &Config, - trackpad_spec: &TouchDeviceOption, - idx: u32, -) -> DeviceResult { - let socket = trackpad_spec.get_path().into_unix_stream().map_err(|e| { - error!("failed configuring virtio trackpad: {:#}", e); - e - })?; - - let (width, height) = trackpad_spec.get_size(); - let dev = virtio::new_trackpad( - idx, - socket, - width, - height, - virtio::base_features(cfg.protected_vm), - ) - .context("failed to set up input device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "input_device")?, - }) -} - -fn create_mouse_device(cfg: &Config, mouse_socket: T, idx: u32) -> DeviceResult { - let socket = mouse_socket.into_unix_stream().map_err(|e| { - error!("failed configuring virtio mouse: {:#}", e); - e - })?; - - let dev = virtio::new_mouse(idx, socket, virtio::base_features(cfg.protected_vm)) - .context("failed to set up input device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "input_device")?, - }) -} - -fn create_keyboard_device( - cfg: &Config, - keyboard_socket: T, - idx: u32, -) -> DeviceResult { - let socket = keyboard_socket.into_unix_stream().map_err(|e| { - error!("failed configuring virtio keyboard: {:#}", e); - e - })?; - - let dev = virtio::new_keyboard(idx, socket, virtio::base_features(cfg.protected_vm)) - .context("failed to set up input device")?; - 
- Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "input_device")?, - }) -} - -fn create_switches_device( - cfg: &Config, - switches_socket: T, - idx: u32, -) -> DeviceResult { - let socket = switches_socket.into_unix_stream().map_err(|e| { - error!("failed configuring virtio switches: {:#}", e); - e - })?; - - let dev = virtio::new_switches(idx, socket, virtio::base_features(cfg.protected_vm)) - .context("failed to set up input device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "input_device")?, - }) -} - -fn create_vinput_device(cfg: &Config, dev_path: &Path) -> DeviceResult { - let dev_file = OpenOptions::new() - .read(true) - .write(true) - .open(dev_path) - .with_context(|| format!("failed to open vinput device {}", dev_path.display()))?; - - let dev = virtio::new_evdev(dev_file, virtio::base_features(cfg.protected_vm)) - .context("failed to set up input device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "input_device")?, - }) -} - -fn create_balloon_device( - cfg: &Config, - tube: Tube, - inflate_tube: Option, - init_balloon_size: u64, -) -> DeviceResult { - let dev = virtio::Balloon::new( - virtio::base_features(cfg.protected_vm), - tube, - inflate_tube, - init_balloon_size, - ) - .context("failed to create balloon")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "balloon_device")?, - }) -} - -/// Generic method for creating a network device. `create_device` is a closure that takes the virtio -/// features and number of queue pairs as parameters, and is responsible for creating the device -/// itself. -fn create_net_device(cfg: &Config, policy: &str, create_device: F) -> DeviceResult -where - F: Fn(u64, u16) -> Result, - T: VirtioDevice + 'static, -{ - let mut vq_pairs = cfg.net_vq_pairs.unwrap_or(1); - let vcpu_count = cfg.vcpu_count.unwrap_or(1); - if vcpu_count < vq_pairs as usize { - warn!("the number of net vq pairs must not exceed the vcpu count, falling back to single queue mode"); - vq_pairs = 1; - } - let features = virtio::base_features(cfg.protected_vm); - - let dev = create_device(features, vq_pairs)?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev) as Box, - jail: simple_jail(cfg, policy)?, - }) -} - -/// Returns a network device created from a new TAP interface configured with `host_ip`, `netmask`, -/// and `mac_address`. -fn create_net_device_from_config( - cfg: &Config, - host_ip: Ipv4Addr, - netmask: Ipv4Addr, - mac_address: MacAddress, -) -> DeviceResult { - let policy = if cfg.vhost_net { - "vhost_net_device" - } else { - "net_device" - }; - - if cfg.vhost_net { - create_net_device(cfg, policy, |features, _vq_pairs| { - virtio::vhost::Net::>::new( - &cfg.vhost_net_device_path, - features, - host_ip, - netmask, - mac_address, - ) - .context("failed to set up vhost networking") - }) - } else { - create_net_device(cfg, policy, |features, vq_pairs| { - virtio::Net::::new(features, host_ip, netmask, mac_address, vq_pairs) - .context("failed to create virtio network device") - }) - } -} - -/// Returns a network device from a file descriptor to a configured TAP interface. -fn create_tap_net_device_from_fd(cfg: &Config, tap_fd: RawDescriptor) -> DeviceResult { - create_net_device(cfg, "net_device", |features, vq_pairs| { - // Safe because we ensure that we get a unique handle to the fd. - let tap = unsafe { - Tap::from_raw_descriptor( - validate_raw_descriptor(tap_fd).context("failed to validate tap descriptor")?, - ) - .context("failed to create tap device")? 
- }; - - virtio::Net::from(features, tap, vq_pairs).context("failed to create tap net device") - }) -} - -/// Returns a network device created by opening the persistent, configured TAP interface `tap_name`. -fn create_tap_net_device_from_name(cfg: &Config, tap_name: &[u8]) -> DeviceResult { - create_net_device(cfg, "net_device", |features, vq_pairs| { - virtio::Net::::new_from_name(features, tap_name, vq_pairs) - .context("failed to create configured virtio network device") - }) -} - -fn create_vhost_user_net_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { - let dev = VhostUserNet::new(virtio::base_features(cfg.protected_vm), &opt.socket) - .context("failed to set up vhost-user net device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. - jail: None, - }) -} - -fn create_vhost_user_vsock_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult { - let dev = VhostUserVsock::new(virtio::base_features(cfg.protected_vm), &opt.socket) - .context("failed to set up vhost-user vsock device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. - jail: None, - }) -} - -fn create_vhost_user_wl_device(cfg: &Config, opt: &VhostUserWlOption) -> DeviceResult { - // The crosvm wl device expects us to connect the tube before it will accept a vhost-user - // connection. - let dev = VhostUserWl::new(virtio::base_features(cfg.protected_vm), &opt.socket) - .context("failed to set up vhost-user wl device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. - jail: None, - }) -} +mod device_helpers; +use device_helpers::*; +mod jail_helpers; +use jail_helpers::*; +mod vcpu; #[cfg(feature = "gpu")] -fn create_vhost_user_gpu_device( - cfg: &Config, - opt: &VhostUserOption, - host_tube: Tube, - device_tube: Tube, -) -> DeviceResult { - // The crosvm gpu device expects us to connect the tube before it will accept a vhost-user - // connection. - let dev = VhostUserGpu::new( - virtio::base_features(cfg.protected_vm), - &opt.socket, - host_tube, - device_tube, - ) - .context("failed to set up vhost-user gpu device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - // no sandbox here because virtqueue handling is exported to a different process. - jail: None, - }) -} - -/// Mirror-mount all the directories in `dirs` into `jail` on a best-effort basis. -/// -/// This function will not return an error if any of the directories in `dirs` is missing. -#[cfg(any(feature = "gpu", feature = "video-decoder", feature = "video-encoder"))] -fn jail_mount_bind_if_exists>( - jail: &mut Minijail, - dirs: &[P], -) -> Result<()> { - for dir in dirs { - let dir_path = Path::new(dir); - if dir_path.exists() { - jail.mount_bind(dir_path, dir_path, false)?; - } - } - - Ok(()) -} - +mod gpu; #[cfg(feature = "gpu")] -fn gpu_jail(cfg: &Config, policy: &str) -> Result> { - match simple_jail(cfg, policy)? { - Some(mut jail) => { - // Create a tmpfs in the device's root directory so that we can bind mount the - // dri directory into it. The size=67108864 is size=64*1024*1024 or size=64MB. - jail.mount_with_data( - Path::new("none"), - Path::new("/"), - "tmpfs", - (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, - "size=67108864", - )?; - - // Device nodes required for DRM. 
- let sys_dev_char_path = Path::new("/sys/dev/char"); - jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?; - let sys_devices_path = Path::new("/sys/devices"); - jail.mount_bind(sys_devices_path, sys_devices_path, false)?; - - let drm_dri_path = Path::new("/dev/dri"); - if drm_dri_path.exists() { - jail.mount_bind(drm_dri_path, drm_dri_path, false)?; - } - - // If the ARM specific devices exist on the host, bind mount them in. - let mali0_path = Path::new("/dev/mali0"); - if mali0_path.exists() { - jail.mount_bind(mali0_path, mali0_path, true)?; - } - - let pvr_sync_path = Path::new("/dev/pvr_sync"); - if pvr_sync_path.exists() { - jail.mount_bind(pvr_sync_path, pvr_sync_path, true)?; - } - - // If the udmabuf driver exists on the host, bind mount it in. - let udmabuf_path = Path::new("/dev/udmabuf"); - if udmabuf_path.exists() { - jail.mount_bind(udmabuf_path, udmabuf_path, true)?; - } - - // Libraries that are required when mesa drivers are dynamically loaded. - jail_mount_bind_if_exists( - &mut jail, - &[ - "/usr/lib", - "/usr/lib64", - "/lib", - "/lib64", - "/usr/share/drirc.d", - "/usr/share/glvnd", - "/usr/share/vulkan", - ], - )?; - - // pvr driver requires read access to /proc/self/task/*/comm. - let proc_path = Path::new("/proc"); - jail.mount( - proc_path, - proc_path, - "proc", - (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RDONLY) as usize, - )?; - - // To enable perfetto tracing, we need to give access to the perfetto service IPC - // endpoints. - let perfetto_path = Path::new("/run/perfetto"); - if perfetto_path.exists() { - jail.mount_bind(perfetto_path, perfetto_path, true)?; - } - - Ok(Some(jail)) - } - None => Ok(None), - } -} - -#[cfg(feature = "gpu")] -struct GpuCacheInfo<'a> { - directory: Option<&'a str>, - environment: Vec<(&'a str, &'a str)>, -} - -#[cfg(feature = "gpu")] -fn get_gpu_cache_info<'a>( - cache_dir: Option<&'a String>, - cache_size: Option<&'a String>, - sandbox: bool, -) -> GpuCacheInfo<'a> { - let mut dir = None; - let mut env = Vec::new(); - - if let Some(cache_dir) = cache_dir { - if !Path::new(cache_dir).exists() { - warn!("shader caching dir {} does not exist", cache_dir); - env.push(("MESA_GLSL_CACHE_DISABLE", "true")); - } else if cfg!(any(target_arch = "arm", target_arch = "aarch64")) && sandbox { - warn!("shader caching not yet supported on ARM with sandbox enabled"); - env.push(("MESA_GLSL_CACHE_DISABLE", "true")); - } else { - dir = Some(cache_dir.as_str()); - - env.push(("MESA_GLSL_CACHE_DISABLE", "false")); - env.push(("MESA_GLSL_CACHE_DIR", cache_dir.as_str())); - if let Some(cache_size) = cache_size { - env.push(("MESA_GLSL_CACHE_MAX_SIZE", cache_size.as_str())); - } - } - } - - GpuCacheInfo { - directory: dir, - environment: env, - } -} - -#[cfg(feature = "gpu")] -fn create_gpu_device( - cfg: &Config, - exit_evt: &Event, - gpu_device_tube: Tube, - resource_bridges: Vec, - wayland_socket_path: Option<&PathBuf>, - x_display: Option, - render_server_fd: Option, - event_devices: Vec, - map_request: Arc>>, -) -> DeviceResult { - let mut display_backends = vec![ - virtio::DisplayBackend::X(x_display), - virtio::DisplayBackend::Stub, - ]; - - let wayland_socket_dirs = cfg - .wayland_socket_paths - .iter() - .map(|(_name, path)| path.parent()) - .collect::>>() - .ok_or_else(|| anyhow!("wayland socket path has no parent or file name"))?; - - if let Some(socket_path) = wayland_socket_path { - display_backends.insert( - 0, - virtio::DisplayBackend::Wayland(Some(socket_path.to_owned())), - ); - } - - let dev = 
virtio::Gpu::new( - exit_evt.try_clone().context("failed to clone event")?, - Some(gpu_device_tube), - resource_bridges, - display_backends, - cfg.gpu_parameters.as_ref().unwrap(), - render_server_fd, - event_devices, - map_request, - cfg.sandbox, - virtio::base_features(cfg.protected_vm), - cfg.wayland_socket_paths.clone(), - ); - - let jail = match gpu_jail(cfg, "gpu_device")? { - Some(mut jail) => { - // Prepare GPU shader disk cache directory. - let (cache_dir, cache_size) = cfg - .gpu_parameters - .as_ref() - .map(|params| (params.cache_path.as_ref(), params.cache_size.as_ref())) - .unwrap(); - let cache_info = get_gpu_cache_info(cache_dir, cache_size, cfg.sandbox); - - if let Some(dir) = cache_info.directory { - jail.mount_bind(dir, dir, true)?; - } - for (key, val) in cache_info.environment { - env::set_var(key, val); - } - - // Bind mount the wayland socket's directory into jail's root. This is necessary since - // each new wayland context must open() the socket. If the wayland socket is ever - // destroyed and remade in the same host directory, new connections will be possible - // without restarting the wayland device. - for dir in &wayland_socket_dirs { - jail.mount_bind(dir, dir, true)?; - } - - add_current_user_to_jail(&mut jail)?; - - Some(jail) - } - None => None, - }; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail, - }) -} - -#[cfg(feature = "gpu")] -fn get_gpu_render_server_environment(cache_info: &GpuCacheInfo) -> Result> { - let mut env = Vec::new(); - - let mut cache_env_keys = HashSet::with_capacity(cache_info.environment.len()); - for (key, val) in cache_info.environment.iter() { - env.push(format!("{}={}", key, val)); - cache_env_keys.insert(*key); - } - - for (key_os, val_os) in env::vars_os() { - // minijail should accept OsStr rather than str... - let into_string_err = |_| anyhow!("invalid environment key/val"); - let key = key_os.into_string().map_err(into_string_err)?; - let val = val_os.into_string().map_err(into_string_err)?; - - if !cache_env_keys.contains(key.as_str()) { - env.push(format!("{}={}", key, val)); - } - } - - Ok(env) -} - -#[cfg(feature = "gpu")] -struct ScopedMinijail(Minijail); - -#[cfg(feature = "gpu")] -impl Drop for ScopedMinijail { - fn drop(&mut self) { - let _ = self.0.kill(); - } -} - -#[cfg(feature = "gpu")] -fn start_gpu_render_server( - cfg: &Config, - render_server_parameters: &GpuRenderServerParameters, -) -> Result<(Minijail, SafeDescriptor)> { - let (server_socket, client_socket) = - UnixSeqpacket::pair().context("failed to create render server socket")?; - - let mut env = None; - let jail = match gpu_jail(cfg, "gpu_render_server")? { - Some(mut jail) => { - let cache_info = get_gpu_cache_info( - render_server_parameters.cache_path.as_ref(), - render_server_parameters.cache_size.as_ref(), - cfg.sandbox, - ); - - if let Some(dir) = cache_info.directory { - jail.mount_bind(dir, dir, true)?; - } - - if !cache_info.environment.is_empty() { - env = Some(get_gpu_render_server_environment(&cache_info)?); - } - - // bind mount /dev/log for syslog - let log_path = Path::new("/dev/log"); - if log_path.exists() { - jail.mount_bind(log_path, log_path, true)?; - } - - // Run as root in the jail to keep capabilities after execve, which is needed for - // mounting to work. All capabilities will be dropped afterwards. 
- add_current_user_as_root_to_jail(&mut jail)?; - - jail - } - None => Minijail::new().context("failed to create jail")?, - }; - - let inheritable_fds = [ - server_socket.as_raw_descriptor(), - libc::STDOUT_FILENO, - libc::STDERR_FILENO, - ]; - - let cmd = &render_server_parameters.path; - let cmd_str = cmd - .to_str() - .ok_or_else(|| anyhow!("invalid render server path"))?; - let fd_str = server_socket.as_raw_descriptor().to_string(); - let args = [cmd_str, "--socket-fd", &fd_str]; - - let mut envp: Option> = None; - if let Some(ref env) = env { - envp = Some(env.iter().map(AsRef::as_ref).collect()); - } - - jail.run_command(minijail::Command::new_for_path( - cmd, - &inheritable_fds, - &args, - envp.as_deref(), - )?) - .context("failed to start gpu render server")?; - - Ok((jail, SafeDescriptor::from(client_socket))) -} - -fn create_wayland_device( - cfg: &Config, - control_tube: Tube, - resource_bridge: Option, -) -> DeviceResult { - let wayland_socket_dirs = cfg - .wayland_socket_paths - .iter() - .map(|(_name, path)| path.parent()) - .collect::>>() - .ok_or_else(|| anyhow!("wayland socket path has no parent or file name"))?; - - let features = virtio::base_features(cfg.protected_vm); - let dev = virtio::Wl::new( - features, - cfg.wayland_socket_paths.clone(), - control_tube, - resource_bridge, - ) - .context("failed to create wayland device")?; - - let jail = match simple_jail(cfg, "wl_device")? { - Some(mut jail) => { - // Create a tmpfs in the device's root directory so that we can bind mount the wayland - // socket directory into it. The size=67108864 is size=64*1024*1024 or size=64MB. - jail.mount_with_data( - Path::new("none"), - Path::new("/"), - "tmpfs", - (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, - "size=67108864", - )?; - - // Bind mount the wayland socket's directory into jail's root. This is necessary since - // each new wayland context must open() the socket. If the wayland socket is ever - // destroyed and remade in the same host directory, new connections will be possible - // without restarting the wayland device. - for dir in &wayland_socket_dirs { - jail.mount_bind(dir, dir, true)?; - } - add_current_user_to_jail(&mut jail)?; - - Some(jail) - } - None => None, - }; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail, - }) -} - -#[cfg(any(feature = "video-decoder", feature = "video-encoder"))] -fn create_video_device( - backend: VideoBackendType, - cfg: &Config, - typ: devices::virtio::VideoDeviceType, - resource_bridge: Tube, -) -> DeviceResult { - let jail = match simple_jail(cfg, "video_device")? { - Some(mut jail) => { - match typ { - #[cfg(feature = "video-decoder")] - devices::virtio::VideoDeviceType::Decoder => add_current_user_to_jail(&mut jail)?, - #[cfg(feature = "video-encoder")] - devices::virtio::VideoDeviceType::Encoder => add_current_user_to_jail(&mut jail)?, - }; - - // Create a tmpfs in the device's root directory so that we can bind mount files. - jail.mount_with_data( - Path::new("none"), - Path::new("/"), - "tmpfs", - (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize, - "size=67108864", - )?; - - #[cfg(feature = "libvda")] - // Render node for libvda. 
- if backend == VideoBackendType::Libvda || backend == VideoBackendType::LibvdaVd { - // follow the implementation at: - // https://source.corp.google.com/chromeos_public/src/platform/minigbm/cros_gralloc/cros_gralloc_driver.cc;l=90;bpv=0;cl=c06cc9cccb3cf3c7f9d2aec706c27c34cd6162a0 - const DRM_NUM_NODES: u32 = 63; - const DRM_RENDER_NODE_START: u32 = 128; - for offset in 0..DRM_NUM_NODES { - let path_str = format!("/dev/dri/renderD{}", DRM_RENDER_NODE_START + offset); - let dev_dri_path = Path::new(&path_str); - if !dev_dri_path.exists() { - break; - } - jail.mount_bind(dev_dri_path, dev_dri_path, false)?; - } - } - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - // Device nodes used by libdrm through minigbm in libvda on AMD devices. - let sys_dev_char_path = Path::new("/sys/dev/char"); - jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?; - let sys_devices_path = Path::new("/sys/devices"); - jail.mount_bind(sys_devices_path, sys_devices_path, false)?; - - // Required for loading dri libraries loaded by minigbm on AMD devices. - jail_mount_bind_if_exists(&mut jail, &["/usr/lib64"])?; - } - - // Device nodes required by libchrome which establishes Mojo connection in libvda. - let dev_urandom_path = Path::new("/dev/urandom"); - jail.mount_bind(dev_urandom_path, dev_urandom_path, false)?; - let system_bus_socket_path = Path::new("/run/dbus/system_bus_socket"); - jail.mount_bind(system_bus_socket_path, system_bus_socket_path, true)?; - - Some(jail) - } - None => None, - }; - - Ok(VirtioDeviceStub { - dev: Box::new(devices::virtio::VideoDevice::new( - virtio::base_features(cfg.protected_vm), - typ, - backend, - Some(resource_bridge), - )), - jail, - }) -} - -#[cfg(any(feature = "video-decoder", feature = "video-encoder"))] -fn register_video_device( - backend: VideoBackendType, - devs: &mut Vec, - video_tube: Tube, - cfg: &Config, - typ: devices::virtio::VideoDeviceType, -) -> Result<()> { - devs.push(create_video_device(backend, cfg, typ, video_tube)?); - Ok(()) -} - -fn create_vhost_vsock_device(cfg: &Config, cid: u64) -> DeviceResult { - let features = virtio::base_features(cfg.protected_vm); - - let device_file = match cfg - .vhost_vsock_device - .as_ref() - .unwrap_or(&VhostVsockDeviceParameter::default()) - { - VhostVsockDeviceParameter::Fd(fd) => { - let fd = validate_raw_descriptor(*fd) - .context("failed to validate fd for virtual socker device")?; - // Safe because the `fd` is actually owned by this process and - // we have a unique handle to it. 
- unsafe { File::from_raw_fd(fd) } - } - VhostVsockDeviceParameter::Path(path) => OpenOptions::new() - .read(true) - .write(true) - .custom_flags(libc::O_CLOEXEC | libc::O_NONBLOCK) - .open(path) - .context("failed to open virtual socket device")?, - }; - - let dev = virtio::vhost::Vsock::new(device_file, features, cid) - .context("failed to set up virtual socket device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "vhost_vsock_device")?, - }) -} - -fn create_fs_device( - cfg: &Config, - uid_map: &str, - gid_map: &str, - src: &Path, - tag: &str, - fs_cfg: virtio::fs::passthrough::Config, - device_tube: Tube, -) -> DeviceResult { - let max_open_files = - base::get_max_open_files().context("failed to get max number of open files")?; - let j = if cfg.sandbox { - let seccomp_policy = cfg.seccomp_policy_dir.join("fs_device"); - let config = SandboxConfig { - limit_caps: false, - uid_map: Some(uid_map), - gid_map: Some(gid_map), - log_failures: cfg.seccomp_log_failures, - seccomp_policy: &seccomp_policy, - // We want bind mounts from the parent namespaces to propagate into the fs device's - // namespace. - remount_mode: Some(libc::MS_SLAVE), - }; - create_base_minijail(src, Some(max_open_files), Some(&config))? - } else { - create_base_minijail(src, Some(max_open_files), None)? - }; - - let features = virtio::base_features(cfg.protected_vm); - // TODO(chirantan): Use more than one worker once the kernel driver has been fixed to not panic - // when num_queues > 1. - let dev = virtio::fs::Fs::new(features, tag, 1, fs_cfg, device_tube) - .context("failed to create fs device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: Some(j), - }) -} - -fn create_9p_device( - cfg: &Config, - uid_map: &str, - gid_map: &str, - src: &Path, - tag: &str, - mut p9_cfg: p9::Config, -) -> DeviceResult { - let max_open_files = - base::get_max_open_files().context("failed to get max number of open files")?; - let (jail, root) = if cfg.sandbox { - let seccomp_policy = cfg.seccomp_policy_dir.join("9p_device"); - let config = SandboxConfig { - limit_caps: false, - uid_map: Some(uid_map), - gid_map: Some(gid_map), - log_failures: cfg.seccomp_log_failures, - seccomp_policy: &seccomp_policy, - // We want bind mounts from the parent namespaces to propagate into the 9p server's - // namespace. - remount_mode: Some(libc::MS_SLAVE), - }; - - let jail = create_base_minijail(src, Some(max_open_files), Some(&config))?; - - // The shared directory becomes the root of the device's file system. - let root = Path::new("/"); - (Some(jail), root) - } else { - // There's no mount namespace so we tell the server to treat the source directory as the - // root. 
- (None, src) - }; - - let features = virtio::base_features(cfg.protected_vm); - p9_cfg.root = root.into(); - let dev = virtio::P9::new(features, tag, p9_cfg).context("failed to create 9p device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail, - }) -} - -fn create_pmem_device( - cfg: &Config, - vm: &mut impl Vm, - resources: &mut SystemAllocator, - disk: &DiskOption, - index: usize, - pmem_device_tube: Tube, -) -> DeviceResult { - let fd = open_file(&disk.path, disk.read_only, false /*O_DIRECT*/) - .with_context(|| format!("failed to load disk image {}", disk.path.display()))?; - - let (disk_size, arena_size) = { - let metadata = std::fs::metadata(&disk.path).with_context(|| { - format!("failed to get disk image {} metadata", disk.path.display()) - })?; - let disk_len = metadata.len(); - // Linux requires pmem region sizes to be 2 MiB aligned. Linux will fill any partial page - // at the end of an mmap'd file and won't write back beyond the actual file length, but if - // we just align the size of the file to 2 MiB then access beyond the last page of the - // mapped file will generate SIGBUS. So use a memory mapping arena that will provide - // padding up to 2 MiB. - let alignment = 2 * 1024 * 1024; - let align_adjust = if disk_len % alignment != 0 { - alignment - (disk_len % alignment) - } else { - 0 - }; - ( - disk_len, - disk_len - .checked_add(align_adjust) - .ok_or_else(|| anyhow!("pmem device image too big"))?, - ) - }; - - let protection = { - if disk.read_only { - Protection::read() - } else { - Protection::read_write() - } - }; - - let arena = { - // Conversion from u64 to usize may fail on 32bit system. - let arena_size = usize::try_from(arena_size).context("pmem device image too big")?; - let disk_size = usize::try_from(disk_size).context("pmem device image too big")?; - - let mut arena = - MemoryMappingArena::new(arena_size).context("failed to reserve pmem memory")?; - arena - .add_fd_offset_protection(0, disk_size, &fd, 0, protection) - .context("failed to reserve pmem memory")?; - - // If the disk is not a multiple of the page size, the OS will fill the remaining part - // of the page with zeroes. However, the anonymous mapping added below must start on a - // page boundary, so round up the size before calculating the offset of the anon region. - let disk_size = round_up_to_page_size(disk_size); - - if arena_size > disk_size { - // Add an anonymous region with the same protection as the disk mapping if the arena - // size was aligned. - arena - .add_anon_protection(disk_size, arena_size - disk_size, protection) - .context("failed to reserve pmem padding")?; - } - arena - }; - - let mapping_address = resources - .mmio_allocator(MmioType::High) - .reverse_allocate_with_align( - arena_size, - Alloc::PmemDevice(index), - format!("pmem_disk_image_{}", index), - // Linux kernel requires pmem namespaces to be 128 MiB aligned. 
- 128 * 1024 * 1024, /* 128 MiB */ - ) - .context("failed to allocate memory for pmem device")?; - - let slot = vm - .add_memory_region( - GuestAddress(mapping_address), - Box::new(arena), - /* read_only = */ disk.read_only, - /* log_dirty_pages = */ false, - ) - .context("failed to add pmem device memory")?; - - let dev = virtio::Pmem::new( - virtio::base_features(cfg.protected_vm), - fd, - GuestAddress(mapping_address), - slot, - arena_size, - Some(pmem_device_tube), - ) - .context("failed to create pmem device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev) as Box, - jail: simple_jail(cfg, "pmem_device")?, - }) -} - -fn create_iommu_device( - cfg: &Config, - phys_max_addr: u64, - endpoints: BTreeMap>>, -) -> DeviceResult { - let dev = virtio::Iommu::new( - virtio::base_features(cfg.protected_vm), - endpoints, - phys_max_addr, - ) - .context("failed to create IOMMU device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "iommu_device")?, - }) -} - -fn create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult { - let mut keep_rds = Vec::new(); - let evt = Event::new().context("failed to create event")?; - let dev = param - .create_serial_device::(cfg.protected_vm, &evt, &mut keep_rds) - .context("failed to create console device")?; - - let jail = match simple_jail(cfg, "serial")? { - Some(mut jail) => { - // Create a tmpfs in the device's root directory so that we can bind mount the - // log socket directory into it. - // The size=67108864 is size=64*1024*1024 or size=64MB. - jail.mount_with_data( - Path::new("none"), - Path::new("/"), - "tmpfs", - (libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_NOSUID) as usize, - "size=67108864", - )?; - add_current_user_to_jail(&mut jail)?; - let res = param.add_bind_mounts(&mut jail); - if res.is_err() { - error!("failed to add bind mounts for console device"); - } - Some(jail) - } - None => None, - }; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail, // TODO(dverkamp): use a separate policy for console? - }) -} - -#[cfg(feature = "audio")] -fn create_sound_device(path: &Path, cfg: &Config) -> DeviceResult { - let dev = virtio::new_sound(path, virtio::base_features(cfg.protected_vm)) - .context("failed to create sound device")?; - - Ok(VirtioDeviceStub { - dev: Box::new(dev), - jail: simple_jail(cfg, "vios_audio_device")?, - }) -} +use gpu::*; // gpu_device_tube is not used when GPU support is disabled. 
#[cfg_attr(not(feature = "gpu"), allow(unused_variables))] @@ -1710,8 +281,8 @@ fn create_virtio_devices( #[cfg(feature = "gpu")] { if let Some(gpu_parameters) = &cfg.gpu_parameters { - let mut gpu_display_w = DEFAULT_DISPLAY_WIDTH; - let mut gpu_display_h = DEFAULT_DISPLAY_HEIGHT; + let mut gpu_display_w = virtio::DEFAULT_DISPLAY_WIDTH; + let mut gpu_display_h = virtio::DEFAULT_DISPLAY_HEIGHT; if !gpu_parameters.displays.is_empty() { gpu_display_w = gpu_parameters.displays[0].width; gpu_display_h = gpu_parameters.displays[0].height; @@ -1858,111 +429,6 @@ fn create_virtio_devices( Ok(devs) } -fn create_vfio_device( - cfg: &Config, - vm: &impl Vm, - resources: &mut SystemAllocator, - control_tubes: &mut Vec, - vfio_path: &Path, - bus_num: Option, - iommu_endpoints: &mut BTreeMap>>, - coiommu_endpoints: Option<&mut Vec>, - iommu_dev: IommuDevType, -) -> DeviceResult<(Box, Option)> { - let vfio_container = VfioCommonSetup::vfio_get_container(iommu_dev, Some(vfio_path)) - .context("failed to get vfio container")?; - - // create MSI, MSI-X, and Mem request sockets for each vfio device - let (vfio_host_tube_msi, vfio_device_tube_msi) = - Tube::pair().context("failed to create tube")?; - control_tubes.push(TaggedControlTube::VmIrq(vfio_host_tube_msi)); - - let (vfio_host_tube_msix, vfio_device_tube_msix) = - Tube::pair().context("failed to create tube")?; - control_tubes.push(TaggedControlTube::VmIrq(vfio_host_tube_msix)); - - let (vfio_host_tube_mem, vfio_device_tube_mem) = - Tube::pair().context("failed to create tube")?; - control_tubes.push(TaggedControlTube::VmMemory(vfio_host_tube_mem)); - - let hotplug = bus_num.is_some(); - let vfio_device_tube_vm = if hotplug { - let (vfio_host_tube_vm, device_tube_vm) = Tube::pair().context("failed to create tube")?; - control_tubes.push(TaggedControlTube::Vm(vfio_host_tube_vm)); - Some(device_tube_vm) - } else { - None - }; - - let vfio_device = VfioDevice::new_passthrough( - &vfio_path, - vm, - vfio_container.clone(), - iommu_dev != IommuDevType::NoIommu, - ) - .context("failed to create vfio device")?; - let mut vfio_pci_device = Box::new(VfioPciDevice::new( - vfio_device, - bus_num, - vfio_device_tube_msi, - vfio_device_tube_msix, - vfio_device_tube_mem, - vfio_device_tube_vm, - )); - // early reservation for pass-through PCI devices. 
- let endpoint_addr = vfio_pci_device - .allocate_address(resources) - .context("failed to allocate resources early for vfio pci dev")?; - - match iommu_dev { - IommuDevType::NoIommu => {} - IommuDevType::VirtioIommu => { - iommu_endpoints.insert(endpoint_addr.to_u32(), vfio_container); - } - IommuDevType::CoIommu => { - if let Some(endpoints) = coiommu_endpoints { - endpoints.push(endpoint_addr.to_u32() as u16); - } else { - bail!("Missed coiommu_endpoints vector to store the endpoint addr"); - } - } - } - - if hotplug { - Ok((vfio_pci_device, None)) - } else { - Ok((vfio_pci_device, simple_jail(cfg, "vfio_device")?)) - } -} - -fn create_vfio_platform_device( - cfg: &Config, - vm: &impl Vm, - _resources: &mut SystemAllocator, - control_tubes: &mut Vec, - vfio_path: &Path, - _endpoints: &mut BTreeMap>>, - iommu_dev: IommuDevType, -) -> DeviceResult<(VfioPlatformDevice, Option)> { - let vfio_container = VfioCommonSetup::vfio_get_container(iommu_dev, Some(vfio_path)) - .context("Failed to create vfio device")?; - - let (vfio_host_tube_mem, vfio_device_tube_mem) = - Tube::pair().context("failed to create tube")?; - control_tubes.push(TaggedControlTube::VmMemory(vfio_host_tube_mem)); - - let vfio_device = VfioDevice::new_passthrough( - &vfio_path, - vm, - vfio_container, - iommu_dev != IommuDevType::NoIommu, - ) - .context("Failed to create vfio device")?; - let vfio_plat_dev = VfioPlatformDevice::new(vfio_device, vfio_device_tube_mem); - - Ok((vfio_plat_dev, simple_jail(cfg, "vfio_platform_device")?)) -} - fn create_devices( cfg: &Config, vm: &mut impl Vm, @@ -2195,636 +661,6 @@ fn create_file_backed_mappings( Ok(()) } -#[derive(Copy, Clone)] -#[cfg_attr(not(feature = "tpm"), allow(dead_code))] -struct Ids { - uid: uid_t, - gid: gid_t, -} - -// Set the uid/gid for the jailed process and give a basic id map. This is -// required for bind mounts to work. -fn add_current_user_to_jail(jail: &mut Minijail) -> Result { - let crosvm_uid = geteuid(); - let crosvm_gid = getegid(); - - jail.uidmap(&format!("{0} {0} 1", crosvm_uid)) - .context("error setting UID map")?; - jail.gidmap(&format!("{0} {0} 1", crosvm_gid)) - .context("error setting GID map")?; - - if crosvm_uid != 0 { - jail.change_uid(crosvm_uid); - } - if crosvm_gid != 0 { - jail.change_gid(crosvm_gid); - } - - Ok(Ids { - uid: crosvm_uid, - gid: crosvm_gid, - }) -} - -fn add_current_user_as_root_to_jail(jail: &mut Minijail) -> Result { - let crosvm_uid = geteuid(); - let crosvm_gid = getegid(); - jail.uidmap(&format!("0 {0} 1", crosvm_uid)) - .context("error setting UID map")?; - jail.gidmap(&format!("0 {0} 1", crosvm_gid)) - .context("error setting GID map")?; - - Ok(Ids { - uid: crosvm_uid, - gid: crosvm_gid, - }) -} - -trait IntoUnixStream { - fn into_unix_stream(self) -> Result; -} - -impl<'a> IntoUnixStream for &'a Path { - fn into_unix_stream(self) -> Result { - if let Some(fd) = safe_descriptor_from_path(self).context("failed to open event device")? { - Ok(fd.into()) - } else { - UnixStream::connect(self).context("failed to open event device") - } - } -} -impl<'a> IntoUnixStream for &'a PathBuf { - fn into_unix_stream(self) -> Result { - self.as_path().into_unix_stream() - } -} - -impl IntoUnixStream for UnixStream { - fn into_unix_stream(self) -> Result { - Ok(self) - } -} - -fn setup_vcpu_signal_handler(use_hypervisor_signals: bool) -> Result<()> { - if use_hypervisor_signals { - unsafe { - extern "C" fn handle_signal(_: c_int) {} - // Our signal handler does nothing and is trivially async signal safe. 
- register_rt_signal_handler(SIGRTMIN() + 0, handle_signal) - .context("error registering signal handler")?; - } - block_signal(SIGRTMIN() + 0).context("failed to block signal")?; - } else { - unsafe { - extern "C" fn handle_signal(_: c_int) { - T::set_local_immediate_exit(true); - } - register_rt_signal_handler(SIGRTMIN() + 0, handle_signal::) - .context("error registering signal handler")?; - } - } - Ok(()) -} - -// Sets up a vcpu and converts it into a runnable vcpu. -fn runnable_vcpu( - cpu_id: usize, - kvm_vcpu_id: usize, - vcpu: Option, - vm: impl VmArch, - irq_chip: &mut dyn IrqChipArch, - vcpu_count: usize, - run_rt: bool, - vcpu_affinity: Vec, - no_smt: bool, - has_bios: bool, - use_hypervisor_signals: bool, - enable_per_vm_core_scheduling: bool, - host_cpu_topology: bool, - vcpu_cgroup_tasks_file: Option, -) -> Result<(V, VcpuRunHandle)> -where - V: VcpuArch, -{ - let mut vcpu = match vcpu { - Some(v) => v, - None => { - // If vcpu is None, it means this arch/hypervisor requires create_vcpu to be called from - // the vcpu thread. - match vm - .create_vcpu(kvm_vcpu_id) - .context("failed to create vcpu")? - .downcast::() - { - Ok(v) => *v, - Err(_) => panic!("VM created wrong type of VCPU"), - } - } - }; - - irq_chip - .add_vcpu(cpu_id, &vcpu) - .context("failed to add vcpu to irq chip")?; - - if !vcpu_affinity.is_empty() { - if let Err(e) = set_cpu_affinity(vcpu_affinity) { - error!("Failed to set CPU affinity: {}", e); - } - } - - Arch::configure_vcpu( - &vm, - vm.get_hypervisor(), - irq_chip, - &mut vcpu, - cpu_id, - vcpu_count, - has_bios, - no_smt, - host_cpu_topology, - ) - .context("failed to configure vcpu")?; - - if !enable_per_vm_core_scheduling { - // Do per-vCPU core scheduling by setting a unique cookie to each vCPU. - if let Err(e) = enable_core_scheduling() { - error!("Failed to enable core scheduling: {}", e); - } - } - - // Move vcpu thread to cgroup - if let Some(mut f) = vcpu_cgroup_tasks_file { - f.write_all(base::gettid().to_string().as_bytes()) - .context("failed to write vcpu tid to cgroup tasks")?; - } - - if run_rt { - const DEFAULT_VCPU_RT_LEVEL: u16 = 6; - if let Err(e) = set_rt_prio_limit(u64::from(DEFAULT_VCPU_RT_LEVEL)) - .and_then(|_| set_rt_round_robin(i32::from(DEFAULT_VCPU_RT_LEVEL))) - { - warn!("Failed to set vcpu to real time: {}", e); - } - } - - if use_hypervisor_signals { - let mut v = get_blocked_signals().context("failed to retrieve signal mask for vcpu")?; - v.retain(|&x| x != SIGRTMIN() + 0); - vcpu.set_signal_mask(&v) - .context("failed to set the signal mask for vcpu")?; - } - - let vcpu_run_handle = vcpu - .take_run_handle(Some(SIGRTMIN() + 0)) - .context("failed to set thread id for vcpu")?; - - Ok((vcpu, vcpu_run_handle)) -} - -#[cfg(all(target_arch = "x86_64", feature = "gdb"))] -fn handle_debug_msg( - cpu_id: usize, - vcpu: &V, - guest_mem: &GuestMemory, - d: VcpuDebug, - reply_tube: &mpsc::Sender, -) -> Result<()> -where - V: VcpuArch + 'static, -{ - match d { - VcpuDebug::ReadRegs => { - let msg = VcpuDebugStatusMessage { - cpu: cpu_id as usize, - msg: VcpuDebugStatus::RegValues( - Arch::debug_read_registers(vcpu as &V) - .context("failed to handle a gdb ReadRegs command")?, - ), - }; - reply_tube - .send(msg) - .context("failed to send a debug status to GDB thread") - } - VcpuDebug::WriteRegs(regs) => { - Arch::debug_write_registers(vcpu as &V, ®s) - .context("failed to handle a gdb WriteRegs command")?; - reply_tube - .send(VcpuDebugStatusMessage { - cpu: cpu_id as usize, - msg: VcpuDebugStatus::CommandComplete, - }) - 
.context("failed to send a debug status to GDB thread") - } - VcpuDebug::ReadMem(vaddr, len) => { - let msg = VcpuDebugStatusMessage { - cpu: cpu_id as usize, - msg: VcpuDebugStatus::MemoryRegion( - Arch::debug_read_memory(vcpu as &V, guest_mem, vaddr, len) - .unwrap_or(Vec::new()), - ), - }; - reply_tube - .send(msg) - .context("failed to send a debug status to GDB thread") - } - VcpuDebug::WriteMem(vaddr, buf) => { - Arch::debug_write_memory(vcpu as &V, guest_mem, vaddr, &buf) - .context("failed to handle a gdb WriteMem command")?; - reply_tube - .send(VcpuDebugStatusMessage { - cpu: cpu_id as usize, - msg: VcpuDebugStatus::CommandComplete, - }) - .context("failed to send a debug status to GDB thread") - } - VcpuDebug::EnableSinglestep => { - Arch::debug_enable_singlestep(vcpu as &V) - .context("failed to handle a gdb EnableSingleStep command")?; - reply_tube - .send(VcpuDebugStatusMessage { - cpu: cpu_id as usize, - msg: VcpuDebugStatus::CommandComplete, - }) - .context("failed to send a debug status to GDB thread") - } - VcpuDebug::SetHwBreakPoint(addrs) => { - Arch::debug_set_hw_breakpoints(vcpu as &V, &addrs) - .context("failed to handle a gdb SetHwBreakPoint command")?; - reply_tube - .send(VcpuDebugStatusMessage { - cpu: cpu_id as usize, - msg: VcpuDebugStatus::CommandComplete, - }) - .context("failed to send a debug status to GDB thread") - } - } -} - -fn run_vcpu( - cpu_id: usize, - kvm_vcpu_id: usize, - vcpu: Option, - vm: impl VmArch + 'static, - mut irq_chip: Box, - vcpu_count: usize, - run_rt: bool, - vcpu_affinity: Vec, - delay_rt: bool, - no_smt: bool, - start_barrier: Arc, - has_bios: bool, - mut io_bus: devices::Bus, - mut mmio_bus: devices::Bus, - exit_evt: Event, - reset_evt: Event, - crash_evt: Event, - requires_pvclock_ctrl: bool, - from_main_tube: mpsc::Receiver, - use_hypervisor_signals: bool, - #[cfg(all(target_arch = "x86_64", feature = "gdb"))] to_gdb_tube: Option< - mpsc::Sender, - >, - enable_per_vm_core_scheduling: bool, - host_cpu_topology: bool, - vcpu_cgroup_tasks_file: Option, -) -> Result> -where - V: VcpuArch + 'static, -{ - thread::Builder::new() - .name(format!("crosvm_vcpu{}", cpu_id)) - .spawn(move || { - // The VCPU thread must trigger either `exit_evt` or `reset_event` in all paths. A - // `ScopedEvent`'s Drop implementation ensures that the `exit_evt` will be sent if - // anything happens before we get to writing the final event. 
- let scoped_exit_evt = ScopedEvent::from(exit_evt); - - #[cfg(all(target_arch = "x86_64", feature = "gdb"))] - let guest_mem = vm.get_memory().clone(); - let runnable_vcpu = runnable_vcpu( - cpu_id, - kvm_vcpu_id, - vcpu, - vm, - irq_chip.as_mut(), - vcpu_count, - run_rt && !delay_rt, - vcpu_affinity, - no_smt, - has_bios, - use_hypervisor_signals, - enable_per_vm_core_scheduling, - host_cpu_topology, - vcpu_cgroup_tasks_file, - ); - - start_barrier.wait(); - - let (vcpu, vcpu_run_handle) = match runnable_vcpu { - Ok(v) => v, - Err(e) => { - error!("failed to start vcpu {}: {:#}", cpu_id, e); - return; - } - }; - - #[allow(unused_mut)] - let mut run_mode = VmRunMode::Running; - #[cfg(all(target_arch = "x86_64", feature = "gdb"))] - if to_gdb_tube.is_some() { - // Wait until a GDB client attaches - run_mode = VmRunMode::Breakpoint; - } - - mmio_bus.set_access_id(cpu_id); - io_bus.set_access_id(cpu_id); - - let exit_reason = vcpu_loop( - run_mode, - cpu_id, - vcpu, - vcpu_run_handle, - irq_chip, - run_rt, - delay_rt, - io_bus, - mmio_bus, - requires_pvclock_ctrl, - from_main_tube, - use_hypervisor_signals, - #[cfg(all(target_arch = "x86_64", feature = "gdb"))] - to_gdb_tube, - #[cfg(all(target_arch = "x86_64", feature = "gdb"))] - guest_mem, - ); - - let exit_evt = scoped_exit_evt.into(); - let final_event = match exit_reason { - ExitState::Stop => exit_evt, - ExitState::Reset => reset_evt, - ExitState::Crash => crash_evt, - }; - if let Err(e) = final_event.write(1) { - error!( - "failed to send final event {:?} on vcpu {}: {}", - final_event, cpu_id, e - ) - } - }) - .context("failed to spawn VCPU thread") -} - -fn vcpu_loop( - mut run_mode: VmRunMode, - cpu_id: usize, - vcpu: V, - vcpu_run_handle: VcpuRunHandle, - irq_chip: Box, - run_rt: bool, - delay_rt: bool, - io_bus: devices::Bus, - mmio_bus: devices::Bus, - requires_pvclock_ctrl: bool, - from_main_tube: mpsc::Receiver, - use_hypervisor_signals: bool, - #[cfg(all(target_arch = "x86_64", feature = "gdb"))] to_gdb_tube: Option< - mpsc::Sender, - >, - #[cfg(all(target_arch = "x86_64", feature = "gdb"))] guest_mem: GuestMemory, -) -> ExitState -where - V: VcpuArch + 'static, -{ - let mut interrupted_by_signal = false; - - loop { - // Start by checking for messages to process and the run state of the CPU. - // An extra check here for Running so there isn't a need to call recv unless a - // message is likely to be ready because a signal was sent. - if interrupted_by_signal || run_mode != VmRunMode::Running { - 'state_loop: loop { - // Tries to get a pending message without blocking first. - let msg = match from_main_tube.try_recv() { - Ok(m) => m, - Err(mpsc::TryRecvError::Empty) if run_mode == VmRunMode::Running => { - // If the VM is running and no message is pending, the state won't - // change. - break 'state_loop; - } - Err(mpsc::TryRecvError::Empty) => { - // If the VM is not running, wait until a message is ready. - match from_main_tube.recv() { - Ok(m) => m, - Err(mpsc::RecvError) => { - error!("Failed to read from main tube in vcpu"); - return ExitState::Crash; - } - } - } - Err(mpsc::TryRecvError::Disconnected) => { - error!("Failed to read from main tube in vcpu"); - return ExitState::Crash; - } - }; - - // Collect all pending messages. 
- let mut messages = vec![msg]; - messages.append(&mut from_main_tube.try_iter().collect()); - - for msg in messages { - match msg { - VcpuControl::RunState(new_mode) => { - run_mode = new_mode; - match run_mode { - VmRunMode::Running => break 'state_loop, - VmRunMode::Suspending => { - // On KVM implementations that use a paravirtualized - // clock (e.g. x86), a flag must be set to indicate to - // the guest kernel that a vCPU was suspended. The guest - // kernel will use this flag to prevent the soft lockup - // detection from triggering when this vCPU resumes, - // which could happen days later in realtime. - if requires_pvclock_ctrl { - if let Err(e) = vcpu.pvclock_ctrl() { - error!( - "failed to tell hypervisor vcpu {} is suspending: {}", - cpu_id, e - ); - } - } - } - VmRunMode::Breakpoint => {} - VmRunMode::Exiting => return ExitState::Stop, - } - } - #[cfg(all(target_arch = "x86_64", feature = "gdb"))] - VcpuControl::Debug(d) => match &to_gdb_tube { - Some(ref ch) => { - if let Err(e) = handle_debug_msg(cpu_id, &vcpu, &guest_mem, d, ch) { - error!("Failed to handle gdb message: {}", e); - } - } - None => { - error!("VcpuControl::Debug received while GDB feature is disabled: {:?}", d); - } - }, - VcpuControl::MakeRT => { - if run_rt && delay_rt { - info!("Making vcpu {} RT\n", cpu_id); - const DEFAULT_VCPU_RT_LEVEL: u16 = 6; - if let Err(e) = set_rt_prio_limit(u64::from(DEFAULT_VCPU_RT_LEVEL)) - .and_then(|_| { - set_rt_round_robin(i32::from(DEFAULT_VCPU_RT_LEVEL)) - }) - { - warn!("Failed to set vcpu to real time: {}", e); - } - } - } - } - } - } - } - - interrupted_by_signal = false; - - // Vcpus may have run a HLT instruction, which puts them into a state other than - // VcpuRunState::Runnable. In that case, this call to wait_until_runnable blocks - // until either the irqchip receives an interrupt for this vcpu, or until the main - // thread kicks this vcpu as a result of some VmControl operation. In most IrqChip - // implementations HLT instructions do not make it to crosvm, and thus this is a - // no-op that always returns VcpuRunState::Runnable. - match irq_chip.wait_until_runnable(&vcpu) { - Ok(VcpuRunState::Runnable) => {} - Ok(VcpuRunState::Interrupted) => interrupted_by_signal = true, - Err(e) => error!( - "error waiting for vcpu {} to become runnable: {}", - cpu_id, e - ), - } - - if !interrupted_by_signal { - match vcpu.run(&vcpu_run_handle) { - Ok(VcpuExit::IoIn { port, mut size }) => { - let mut data = [0; 8]; - if size > data.len() { - error!( - "unsupported IoIn size of {} bytes at port {:#x}", - size, port - ); - size = data.len(); - } - io_bus.read(port as u64, &mut data[..size]); - if let Err(e) = vcpu.set_data(&data[..size]) { - error!( - "failed to set return data for IoIn at port {:#x}: {}", - port, e - ); - } - } - Ok(VcpuExit::IoOut { - port, - mut size, - data, - }) => { - if size > data.len() { - error!( - "unsupported IoOut size of {} bytes at port {:#x}", - size, port - ); - size = data.len(); - } - io_bus.write(port as u64, &data[..size]); - } - Ok(VcpuExit::MmioRead { address, size }) => { - let mut data = [0; 8]; - mmio_bus.read(address, &mut data[..size]); - // Setting data for mmio can not fail. 
-                    let _ = vcpu.set_data(&data[..size]);
-                }
-                Ok(VcpuExit::MmioWrite {
-                    address,
-                    size,
-                    data,
-                }) => {
-                    mmio_bus.write(address, &data[..size]);
-                }
-                Ok(VcpuExit::IoapicEoi { vector }) => {
-                    if let Err(e) = irq_chip.broadcast_eoi(vector) {
-                        error!(
-                            "failed to broadcast eoi {} on vcpu {}: {}",
-                            vector, cpu_id, e
-                        );
-                    }
-                }
-                Ok(VcpuExit::IrqWindowOpen) => {}
-                Ok(VcpuExit::Hlt) => irq_chip.halted(cpu_id),
-                Ok(VcpuExit::Shutdown) => return ExitState::Stop,
-                Ok(VcpuExit::FailEntry {
-                    hardware_entry_failure_reason,
-                }) => {
-                    error!("vcpu hw run failure: {:#x}", hardware_entry_failure_reason);
-                    return ExitState::Crash;
-                }
-                Ok(VcpuExit::SystemEventShutdown) => {
-                    info!("system shutdown event on vcpu {}", cpu_id);
-                    return ExitState::Stop;
-                }
-                Ok(VcpuExit::SystemEventReset) => {
-                    info!("system reset event");
-                    return ExitState::Reset;
-                }
-                Ok(VcpuExit::SystemEventCrash) => {
-                    info!("system crash event on vcpu {}", cpu_id);
-                    return ExitState::Stop;
-                }
-                #[rustfmt::skip] Ok(VcpuExit::Debug { .. }) => {
-                    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
-                    {
-                        let msg = VcpuDebugStatusMessage {
-                            cpu: cpu_id as usize,
-                            msg: VcpuDebugStatus::HitBreakPoint,
-                        };
-                        if let Some(ref ch) = to_gdb_tube {
-                            if let Err(e) = ch.send(msg) {
-                                error!("failed to notify breakpoint to GDB thread: {}", e);
-                                return ExitState::Crash;
-                            }
-                        }
-                        run_mode = VmRunMode::Breakpoint;
-                    }
-                }
-                Ok(r) => warn!("unexpected vcpu exit: {:?}", r),
-                Err(e) => match e.errno() {
-                    libc::EINTR => interrupted_by_signal = true,
-                    libc::EAGAIN => {}
-                    _ => {
-                        error!("vcpu hit unknown error: {}", e);
-                        return ExitState::Crash;
-                    }
-                },
-            }
-        }
-
-        if interrupted_by_signal {
-            if use_hypervisor_signals {
-                // Try to clear the signal that we use to kick VCPU if it is pending before
-                // attempting to handle pause requests.
-                if let Err(e) = clear_signal(SIGRTMIN() + 0) {
-                    error!("failed to clear pending signal: {}", e);
-                    return ExitState::Crash;
-                }
-            } else {
-                vcpu.set_immediate_exit(false);
-            }
-        }
-
-        if let Err(e) = irq_chip.inject_interrupts(&vcpu) {
-            error!("failed to inject interrupts for vcpu {}: {}", cpu_id, e);
-        }
-    }
-}
-
 fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
     let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
         Some(
@@ -3453,24 +1289,6 @@ fn handle_vfio_command(
     }
 }
 
-/// Signals all running VCPUs to vmexit, sends VcpuControl message to each VCPU tube, and tells
-/// `irq_chip` to stop blocking halted VCPUs. The channel message is set first because both the
-/// signal and the irq_chip kick could cause the VCPU thread to continue through the VCPU run
-/// loop.
-fn kick_all_vcpus(
-    vcpu_handles: &[(JoinHandle<()>, mpsc::Sender<VcpuControl>)],
-    irq_chip: &dyn IrqChip,
-    message: VcpuControl,
-) {
-    for (handle, tube) in vcpu_handles {
-        if let Err(e) = tube.send(message.clone()) {
-            error!("failed to send VcpuControl: {}", e);
-        }
-        let _ = handle.kill(SIGRTMIN() + 0);
-    }
-    irq_chip.kick_halted_vcpus();
-}
-
 fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
     mut linux: RunnableLinuxVm<V, Vcpu>,
     mut sys_allocator: SystemAllocator,
@@ -3555,7 +1373,7 @@ fn run_control(
         .vm
         .get_hypervisor()
         .check_capability(HypervisorCap::ImmediateExit);
-    setup_vcpu_signal_handler::<Vcpu>(use_hypervisor_signals)?;
+    vcpu::setup_vcpu_signal_handler::<Vcpu>(use_hypervisor_signals)?;
 
     let vcpus: Vec<Option<Vcpu>> = match linux.vcpus.take() {
         Some(vec) => vec.into_iter().map(Some).collect(),
@@ -3586,7 +1404,7 @@ fn run_control(
             Some(VcpuAffinity::PerVcpu(mut m)) => m.remove(&cpu_id).unwrap_or_default(),
             None => Default::default(),
         };
-        let handle = run_vcpu(
+        let handle = vcpu::run_vcpu(
             cpu_id,
             kvm_vcpu_ids[cpu_id],
             vcpu,
@@ -3683,7 +1501,7 @@ fn run_control(
             Token::Suspend => {
                 info!("VM requested suspend");
                 linux.suspend_evt.read().unwrap();
-                kick_all_vcpus(
+                vcpu::kick_all_vcpus(
                    &vcpu_handles,
                    linux.irq_chip.as_irq_chip(),
                    VcpuControl::RunState(VmRunMode::Suspending),
@@ -3776,7 +1594,7 @@ fn run_control(
                            dev.lock().resume_imminent();
                        }
                    }
-                    kick_all_vcpus(
+                    vcpu::kick_all_vcpus(
                        &vcpu_handles,
                        linux.irq_chip.as_irq_chip(),
                        VcpuControl::RunState(other),
@@ -3961,7 +1779,7 @@ fn run_control(
        }
    }
 
-    kick_all_vcpus(
+    vcpu::kick_all_vcpus(
        &vcpu_handles,
        linux.irq_chip.as_irq_chip(),
        VcpuControl::RunState(VmRunMode::Exiting),
diff --git a/src/linux/vcpu.rs b/src/linux/vcpu.rs
new file mode 100644
index 0000000000..583ecec03a
--- /dev/null
+++ b/src/linux/vcpu.rs
@@ -0,0 +1,615 @@
+// Copyright 2017 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use std::fs::File;
+use std::io::prelude::*;
+use std::sync::{mpsc, Arc, Barrier};
+
+use std::thread;
+use std::thread::JoinHandle;
+
+use libc::{self, c_int};
+
+use anyhow::{Context, Result};
+use base::*;
+use devices::{self, IrqChip, VcpuRunState};
+use hypervisor::{Vcpu, VcpuExit, VcpuRunHandle};
+use vm_control::*;
+#[cfg(all(target_arch = "x86_64", feature = "gdb"))]
+use vm_memory::GuestMemory;
+
+use arch::{self, LinuxArch};
+
+#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
+use {
+    aarch64::AArch64 as Arch,
+    devices::IrqChipAArch64 as IrqChipArch,
+    hypervisor::{VcpuAArch64 as VcpuArch, VmAArch64 as VmArch},
+};
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+use {
+    devices::IrqChipX86_64 as IrqChipArch,
+    hypervisor::{VcpuX86_64 as VcpuArch, VmX86_64 as VmArch},
+    x86_64::X8664arch as Arch,
+};
+
+use super::ExitState;
+
+pub fn setup_vcpu_signal_handler<T: Vcpu>(use_hypervisor_signals: bool) -> Result<()> {
+    if use_hypervisor_signals {
+        unsafe {
+            extern "C" fn handle_signal(_: c_int) {}
+            // Our signal handler does nothing and is trivially async signal safe.
+            register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
+                .context("error registering signal handler")?;
+        }
+        block_signal(SIGRTMIN() + 0).context("failed to block signal")?;
+    } else {
+        unsafe {
+            extern "C" fn handle_signal<T: Vcpu>(_: c_int) {
+                T::set_local_immediate_exit(true);
+            }
+            register_rt_signal_handler(SIGRTMIN() + 0, handle_signal::<T>)
+                .context("error registering signal handler")?;
+        }
+    }
+    Ok(())
+}
+
+// Sets up a vcpu and converts it into a runnable vcpu.
+pub fn runnable_vcpu<V>(
+    cpu_id: usize,
+    kvm_vcpu_id: usize,
+    vcpu: Option<V>,
+    vm: impl VmArch,
+    irq_chip: &mut dyn IrqChipArch,
+    vcpu_count: usize,
+    run_rt: bool,
+    vcpu_affinity: Vec<usize>,
+    no_smt: bool,
+    has_bios: bool,
+    use_hypervisor_signals: bool,
+    enable_per_vm_core_scheduling: bool,
+    host_cpu_topology: bool,
+    vcpu_cgroup_tasks_file: Option<File>,
+) -> Result<(V, VcpuRunHandle)>
+where
+    V: VcpuArch,
+{
+    let mut vcpu = match vcpu {
+        Some(v) => v,
+        None => {
+            // If vcpu is None, it means this arch/hypervisor requires create_vcpu to be called from
+            // the vcpu thread.
+            match vm
+                .create_vcpu(kvm_vcpu_id)
+                .context("failed to create vcpu")?
+                .downcast::<V>()
+            {
+                Ok(v) => *v,
+                Err(_) => panic!("VM created wrong type of VCPU"),
+            }
+        }
+    };
+
+    irq_chip
+        .add_vcpu(cpu_id, &vcpu)
+        .context("failed to add vcpu to irq chip")?;
+
+    if !vcpu_affinity.is_empty() {
+        if let Err(e) = set_cpu_affinity(vcpu_affinity) {
+            error!("Failed to set CPU affinity: {}", e);
+        }
+    }
+
+    Arch::configure_vcpu(
+        &vm,
+        vm.get_hypervisor(),
+        irq_chip,
+        &mut vcpu,
+        cpu_id,
+        vcpu_count,
+        has_bios,
+        no_smt,
+        host_cpu_topology,
+    )
+    .context("failed to configure vcpu")?;
+
+    if !enable_per_vm_core_scheduling {
+        // Do per-vCPU core scheduling by setting a unique cookie to each vCPU.
+        if let Err(e) = enable_core_scheduling() {
+            error!("Failed to enable core scheduling: {}", e);
+        }
+    }
+
+    // Move vcpu thread to cgroup
+    if let Some(mut f) = vcpu_cgroup_tasks_file {
+        f.write_all(base::gettid().to_string().as_bytes())
+            .context("failed to write vcpu tid to cgroup tasks")?;
+    }
+
+    if run_rt {
+        const DEFAULT_VCPU_RT_LEVEL: u16 = 6;
+        if let Err(e) = set_rt_prio_limit(u64::from(DEFAULT_VCPU_RT_LEVEL))
+            .and_then(|_| set_rt_round_robin(i32::from(DEFAULT_VCPU_RT_LEVEL)))
+        {
+            warn!("Failed to set vcpu to real time: {}", e);
+        }
+    }
+
+    if use_hypervisor_signals {
+        let mut v = get_blocked_signals().context("failed to retrieve signal mask for vcpu")?;
+        v.retain(|&x| x != SIGRTMIN() + 0);
+        vcpu.set_signal_mask(&v)
+            .context("failed to set the signal mask for vcpu")?;
+    }
+
+    let vcpu_run_handle = vcpu
+        .take_run_handle(Some(SIGRTMIN() + 0))
+        .context("failed to set thread id for vcpu")?;
+
+    Ok((vcpu, vcpu_run_handle))
+}
+
+#[cfg(all(target_arch = "x86_64", feature = "gdb"))]
+fn handle_debug_msg<V>(
+    cpu_id: usize,
+    vcpu: &V,
+    guest_mem: &GuestMemory,
+    d: VcpuDebug,
+    reply_tube: &mpsc::Sender<VcpuDebugStatusMessage>,
+) -> Result<()>
+where
+    V: VcpuArch + 'static,
+{
+    match d {
+        VcpuDebug::ReadRegs => {
+            let msg = VcpuDebugStatusMessage {
+                cpu: cpu_id as usize,
+                msg: VcpuDebugStatus::RegValues(
+                    Arch::debug_read_registers(vcpu as &V)
+                        .context("failed to handle a gdb ReadRegs command")?,
+                ),
+            };
+            reply_tube
+                .send(msg)
+                .context("failed to send a debug status to GDB thread")
+        }
+        VcpuDebug::WriteRegs(regs) => {
+            Arch::debug_write_registers(vcpu as &V, &regs)
+                .context("failed to handle a gdb WriteRegs command")?;
+            reply_tube
+                .send(VcpuDebugStatusMessage {
+                    cpu: cpu_id as usize,
+                    msg: VcpuDebugStatus::CommandComplete,
+                })
+                .context("failed to send a debug status to GDB thread")
+        }
+        VcpuDebug::ReadMem(vaddr, len) => {
+            let msg = VcpuDebugStatusMessage {
+                cpu: cpu_id as usize,
+                msg: VcpuDebugStatus::MemoryRegion(
+                    Arch::debug_read_memory(vcpu as &V, guest_mem, vaddr, len)
+                        .unwrap_or(Vec::new()),
+                ),
+            };
+            reply_tube
+                .send(msg)
+                .context("failed to send a debug status to GDB thread")
+        }
+        VcpuDebug::WriteMem(vaddr, buf) => {
+            Arch::debug_write_memory(vcpu as &V, guest_mem, vaddr, &buf)
+                .context("failed to handle a gdb WriteMem command")?;
+            reply_tube
+                .send(VcpuDebugStatusMessage {
+                    cpu: cpu_id as usize,
+                    msg: VcpuDebugStatus::CommandComplete,
+                })
+                .context("failed to send a debug status to GDB thread")
+        }
+        VcpuDebug::EnableSinglestep => {
+            Arch::debug_enable_singlestep(vcpu as &V)
+                .context("failed to handle a gdb EnableSingleStep command")?;
+            reply_tube
+                .send(VcpuDebugStatusMessage {
+                    cpu: cpu_id as usize,
+                    msg: VcpuDebugStatus::CommandComplete,
+                })
+                .context("failed to send a debug status to GDB thread")
+        }
+        VcpuDebug::SetHwBreakPoint(addrs) => {
+            Arch::debug_set_hw_breakpoints(vcpu as &V, &addrs)
+                .context("failed to handle a gdb SetHwBreakPoint command")?;
+            reply_tube
+                .send(VcpuDebugStatusMessage {
+                    cpu: cpu_id as usize,
+                    msg: VcpuDebugStatus::CommandComplete,
+                })
+                .context("failed to send a debug status to GDB thread")
+        }
+    }
+}
+
+fn vcpu_loop<V>(
+    mut run_mode: VmRunMode,
+    cpu_id: usize,
+    vcpu: V,
+    vcpu_run_handle: VcpuRunHandle,
+    irq_chip: Box<dyn IrqChipArch>,
+    run_rt: bool,
+    delay_rt: bool,
+    io_bus: devices::Bus,
+    mmio_bus: devices::Bus,
+    requires_pvclock_ctrl: bool,
+    from_main_tube: mpsc::Receiver<VcpuControl>,
+    use_hypervisor_signals: bool,
+    #[cfg(all(target_arch = "x86_64", feature = "gdb"))] to_gdb_tube: Option<
+        mpsc::Sender<VcpuDebugStatusMessage>,
+    >,
+    #[cfg(all(target_arch = "x86_64", feature = "gdb"))] guest_mem: GuestMemory,
+) -> ExitState
+where
+    V: VcpuArch + 'static,
+{
+    let mut interrupted_by_signal = false;
+
+    loop {
+        // Start by checking for messages to process and the run state of the CPU.
+        // An extra check here for Running so there isn't a need to call recv unless a
+        // message is likely to be ready because a signal was sent.
+        if interrupted_by_signal || run_mode != VmRunMode::Running {
+            'state_loop: loop {
+                // Tries to get a pending message without blocking first.
+                let msg = match from_main_tube.try_recv() {
+                    Ok(m) => m,
+                    Err(mpsc::TryRecvError::Empty) if run_mode == VmRunMode::Running => {
+                        // If the VM is running and no message is pending, the state won't
+                        // change.
+                        break 'state_loop;
+                    }
+                    Err(mpsc::TryRecvError::Empty) => {
+                        // If the VM is not running, wait until a message is ready.
+                        match from_main_tube.recv() {
+                            Ok(m) => m,
+                            Err(mpsc::RecvError) => {
+                                error!("Failed to read from main tube in vcpu");
+                                return ExitState::Crash;
+                            }
+                        }
+                    }
+                    Err(mpsc::TryRecvError::Disconnected) => {
+                        error!("Failed to read from main tube in vcpu");
+                        return ExitState::Crash;
+                    }
+                };
+
+                // Collect all pending messages.
+                let mut messages = vec![msg];
+                messages.append(&mut from_main_tube.try_iter().collect());
+
+                for msg in messages {
+                    match msg {
+                        VcpuControl::RunState(new_mode) => {
+                            run_mode = new_mode;
+                            match run_mode {
+                                VmRunMode::Running => break 'state_loop,
+                                VmRunMode::Suspending => {
+                                    // On KVM implementations that use a paravirtualized
+                                    // clock (e.g. x86), a flag must be set to indicate to
+                                    // the guest kernel that a vCPU was suspended. The guest
+                                    // kernel will use this flag to prevent the soft lockup
+                                    // detection from triggering when this vCPU resumes,
+                                    // which could happen days later in realtime.
+ if requires_pvclock_ctrl { + if let Err(e) = vcpu.pvclock_ctrl() { + error!( + "failed to tell hypervisor vcpu {} is suspending: {}", + cpu_id, e + ); + } + } + } + VmRunMode::Breakpoint => {} + VmRunMode::Exiting => return ExitState::Stop, + } + } + #[cfg(all(target_arch = "x86_64", feature = "gdb"))] + VcpuControl::Debug(d) => match &to_gdb_tube { + Some(ref ch) => { + if let Err(e) = handle_debug_msg(cpu_id, &vcpu, &guest_mem, d, ch) { + error!("Failed to handle gdb message: {}", e); + } + } + None => { + error!("VcpuControl::Debug received while GDB feature is disabled: {:?}", d); + } + }, + VcpuControl::MakeRT => { + if run_rt && delay_rt { + info!("Making vcpu {} RT\n", cpu_id); + const DEFAULT_VCPU_RT_LEVEL: u16 = 6; + if let Err(e) = set_rt_prio_limit(u64::from(DEFAULT_VCPU_RT_LEVEL)) + .and_then(|_| { + set_rt_round_robin(i32::from(DEFAULT_VCPU_RT_LEVEL)) + }) + { + warn!("Failed to set vcpu to real time: {}", e); + } + } + } + } + } + } + } + + interrupted_by_signal = false; + + // Vcpus may have run a HLT instruction, which puts them into a state other than + // VcpuRunState::Runnable. In that case, this call to wait_until_runnable blocks + // until either the irqchip receives an interrupt for this vcpu, or until the main + // thread kicks this vcpu as a result of some VmControl operation. In most IrqChip + // implementations HLT instructions do not make it to crosvm, and thus this is a + // no-op that always returns VcpuRunState::Runnable. + match irq_chip.wait_until_runnable(&vcpu) { + Ok(VcpuRunState::Runnable) => {} + Ok(VcpuRunState::Interrupted) => interrupted_by_signal = true, + Err(e) => error!( + "error waiting for vcpu {} to become runnable: {}", + cpu_id, e + ), + } + + if !interrupted_by_signal { + match vcpu.run(&vcpu_run_handle) { + Ok(VcpuExit::IoIn { port, mut size }) => { + let mut data = [0; 8]; + if size > data.len() { + error!( + "unsupported IoIn size of {} bytes at port {:#x}", + size, port + ); + size = data.len(); + } + io_bus.read(port as u64, &mut data[..size]); + if let Err(e) = vcpu.set_data(&data[..size]) { + error!( + "failed to set return data for IoIn at port {:#x}: {}", + port, e + ); + } + } + Ok(VcpuExit::IoOut { + port, + mut size, + data, + }) => { + if size > data.len() { + error!( + "unsupported IoOut size of {} bytes at port {:#x}", + size, port + ); + size = data.len(); + } + io_bus.write(port as u64, &data[..size]); + } + Ok(VcpuExit::MmioRead { address, size }) => { + let mut data = [0; 8]; + mmio_bus.read(address, &mut data[..size]); + // Setting data for mmio can not fail. 
+                    let _ = vcpu.set_data(&data[..size]);
+                }
+                Ok(VcpuExit::MmioWrite {
+                    address,
+                    size,
+                    data,
+                }) => {
+                    mmio_bus.write(address, &data[..size]);
+                }
+                Ok(VcpuExit::IoapicEoi { vector }) => {
+                    if let Err(e) = irq_chip.broadcast_eoi(vector) {
+                        error!(
+                            "failed to broadcast eoi {} on vcpu {}: {}",
+                            vector, cpu_id, e
+                        );
+                    }
+                }
+                Ok(VcpuExit::IrqWindowOpen) => {}
+                Ok(VcpuExit::Hlt) => irq_chip.halted(cpu_id),
+                Ok(VcpuExit::Shutdown) => return ExitState::Stop,
+                Ok(VcpuExit::FailEntry {
+                    hardware_entry_failure_reason,
+                }) => {
+                    error!("vcpu hw run failure: {:#x}", hardware_entry_failure_reason);
+                    return ExitState::Crash;
+                }
+                Ok(VcpuExit::SystemEventShutdown) => {
+                    info!("system shutdown event on vcpu {}", cpu_id);
+                    return ExitState::Stop;
+                }
+                Ok(VcpuExit::SystemEventReset) => {
+                    info!("system reset event");
+                    return ExitState::Reset;
+                }
+                Ok(VcpuExit::SystemEventCrash) => {
+                    info!("system crash event on vcpu {}", cpu_id);
+                    return ExitState::Stop;
+                }
+                #[rustfmt::skip] Ok(VcpuExit::Debug { .. }) => {
+                    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
+                    {
+                        let msg = VcpuDebugStatusMessage {
+                            cpu: cpu_id as usize,
+                            msg: VcpuDebugStatus::HitBreakPoint,
+                        };
+                        if let Some(ref ch) = to_gdb_tube {
+                            if let Err(e) = ch.send(msg) {
+                                error!("failed to notify breakpoint to GDB thread: {}", e);
+                                return ExitState::Crash;
+                            }
+                        }
+                        run_mode = VmRunMode::Breakpoint;
+                    }
+                }
+                Ok(r) => warn!("unexpected vcpu exit: {:?}", r),
+                Err(e) => match e.errno() {
+                    libc::EINTR => interrupted_by_signal = true,
+                    libc::EAGAIN => {}
+                    _ => {
+                        error!("vcpu hit unknown error: {}", e);
+                        return ExitState::Crash;
+                    }
+                },
+            }
+        }
+
+        if interrupted_by_signal {
+            if use_hypervisor_signals {
+                // Try to clear the signal that we use to kick VCPU if it is pending before
+                // attempting to handle pause requests.
+                if let Err(e) = clear_signal(SIGRTMIN() + 0) {
+                    error!("failed to clear pending signal: {}", e);
+                    return ExitState::Crash;
+                }
+            } else {
+                vcpu.set_immediate_exit(false);
+            }
+        }
+
+        if let Err(e) = irq_chip.inject_interrupts(&vcpu) {
+            error!("failed to inject interrupts for vcpu {}: {}", cpu_id, e);
+        }
+    }
+}
+
+pub fn run_vcpu<V>(
+    cpu_id: usize,
+    kvm_vcpu_id: usize,
+    vcpu: Option<V>,
+    vm: impl VmArch + 'static,
+    mut irq_chip: Box<dyn IrqChipArch>,
+    vcpu_count: usize,
+    run_rt: bool,
+    vcpu_affinity: Vec<usize>,
+    delay_rt: bool,
+    no_smt: bool,
+    start_barrier: Arc<Barrier>,
+    has_bios: bool,
+    mut io_bus: devices::Bus,
+    mut mmio_bus: devices::Bus,
+    exit_evt: Event,
+    reset_evt: Event,
+    crash_evt: Event,
+    requires_pvclock_ctrl: bool,
+    from_main_tube: mpsc::Receiver<VcpuControl>,
+    use_hypervisor_signals: bool,
+    #[cfg(all(target_arch = "x86_64", feature = "gdb"))] to_gdb_tube: Option<
+        mpsc::Sender<VcpuDebugStatusMessage>,
+    >,
+    enable_per_vm_core_scheduling: bool,
+    host_cpu_topology: bool,
+    vcpu_cgroup_tasks_file: Option<File>,
+) -> Result<JoinHandle<()>>
+where
+    V: VcpuArch + 'static,
+{
+    thread::Builder::new()
+        .name(format!("crosvm_vcpu{}", cpu_id))
+        .spawn(move || {
+            // The VCPU thread must trigger either `exit_evt` or `reset_event` in all paths. A
+            // `ScopedEvent`'s Drop implementation ensures that the `exit_evt` will be sent if
+            // anything happens before we get to writing the final event.
+            let scoped_exit_evt = ScopedEvent::from(exit_evt);
+
+            #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
+            let guest_mem = vm.get_memory().clone();
+            let runnable_vcpu = runnable_vcpu(
+                cpu_id,
+                kvm_vcpu_id,
+                vcpu,
+                vm,
+                irq_chip.as_mut(),
+                vcpu_count,
+                run_rt && !delay_rt,
+                vcpu_affinity,
+                no_smt,
+                has_bios,
+                use_hypervisor_signals,
+                enable_per_vm_core_scheduling,
+                host_cpu_topology,
+                vcpu_cgroup_tasks_file,
+            );
+
+            start_barrier.wait();
+
+            let (vcpu, vcpu_run_handle) = match runnable_vcpu {
+                Ok(v) => v,
+                Err(e) => {
+                    error!("failed to start vcpu {}: {:#}", cpu_id, e);
+                    return;
+                }
+            };
+
+            #[allow(unused_mut)]
+            let mut run_mode = VmRunMode::Running;
+            #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
+            if to_gdb_tube.is_some() {
+                // Wait until a GDB client attaches
+                run_mode = VmRunMode::Breakpoint;
+            }
+
+            mmio_bus.set_access_id(cpu_id);
+            io_bus.set_access_id(cpu_id);
+
+            let exit_reason = vcpu_loop(
+                run_mode,
+                cpu_id,
+                vcpu,
+                vcpu_run_handle,
+                irq_chip,
+                run_rt,
+                delay_rt,
+                io_bus,
+                mmio_bus,
+                requires_pvclock_ctrl,
+                from_main_tube,
+                use_hypervisor_signals,
+                #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
+                to_gdb_tube,
+                #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
+                guest_mem,
+            );
+
+            let exit_evt = scoped_exit_evt.into();
+            let final_event = match exit_reason {
+                ExitState::Stop => exit_evt,
+                ExitState::Reset => reset_evt,
+                ExitState::Crash => crash_evt,
+            };
+            if let Err(e) = final_event.write(1) {
+                error!(
+                    "failed to send final event {:?} on vcpu {}: {}",
+                    final_event, cpu_id, e
+                )
+            }
+        })
+        .context("failed to spawn VCPU thread")
+}
+
+/// Signals all running VCPUs to vmexit, sends VcpuControl message to each VCPU tube, and tells
+/// `irq_chip` to stop blocking halted VCPUs. The channel message is set first because both the
+/// signal and the irq_chip kick could cause the VCPU thread to continue through the VCPU run
+/// loop.
+pub fn kick_all_vcpus(
+    vcpu_handles: &[(JoinHandle<()>, mpsc::Sender<VcpuControl>)],
+    irq_chip: &dyn IrqChip,
+    message: VcpuControl,
+) {
+    for (handle, tube) in vcpu_handles {
+        if let Err(e) = tube.send(message.clone()) {
+            error!("failed to send VcpuControl: {}", e);
+        }
+        let _ = handle.kill(SIGRTMIN() + 0);
+    }
+    irq_chip.kick_halted_vcpus();
+}
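After the split, the control loop in `src/linux/mod.rs` reaches the vCPU machinery only through the new module's public items (`vcpu::setup_vcpu_signal_handler`, `vcpu::run_vcpu`, `vcpu::kick_all_vcpus`). As a rough sketch only, and not code from this change, the shutdown path a caller would follow looks like the hypothetical helper below; the name `shutdown_vcpus`, its exact signature, and the `crate::linux::vcpu` path are assumptions made for illustration, while the real logic lives inline in `run_control`.

// Hypothetical sketch: driving shutdown through the new vcpu module.
// Not part of this patch; names and paths are illustrative assumptions.
use std::sync::mpsc;
use std::thread::JoinHandle;

use devices::IrqChip;
use vm_control::{VcpuControl, VmRunMode};

use crate::linux::vcpu;

fn shutdown_vcpus(
    vcpu_handles: Vec<(JoinHandle<()>, mpsc::Sender<VcpuControl>)>,
    irq_chip: &dyn IrqChip,
) {
    // Ask every vCPU thread to leave its run loop. kick_all_vcpus sends the
    // control message before kicking, so a woken thread sees Exiting.
    vcpu::kick_all_vcpus(
        &vcpu_handles,
        irq_chip,
        VcpuControl::RunState(VmRunMode::Exiting),
    );
    // Wait for each vCPU thread to observe VmRunMode::Exiting and return.
    for (handle, _tube) in vcpu_handles {
        if handle.join().is_err() {
            eprintln!("failed to join vcpu thread");
        }
    }
}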