From 91296bdee06ae6c456d9fa2f1f57a5dd51b498fc Mon Sep 17 00:00:00 2001
From: Woody Chow
Date: Thu, 3 Mar 2022 10:56:26 +0900
Subject: [PATCH] devices: vvu: virtio-iommu support in virtio-vhost-user

Make vvu devices use VFIO to manage their virtqueues. This removes the
need to use noiommu mode. However, it is still necessary to use
`vfio_iommu_type1.allow_unsafe_interrupts=1`.

BUG=b:202151642,b:215310597
TEST=launch sibling with vvu + virtio-iommu

Cq-Depend: chromium:3565728, chromium:3565260
Change-Id: If418524598c40a37d41c0ffaa1dcc0f8ee11fcb3
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/crosvm/+/3501052
Reviewed-by: Keiichi Watanabe
Tested-by: kokoro
Commit-Queue: David Stevens
---
 devices/src/vfio.rs                           | 185 +++++++++---
 devices/src/virtio/mod.rs                     |   2 +-
 .../src/virtio/vhost/user/device/vvu/bus.rs   |   3 +-
 .../src/virtio/vhost/user/device/vvu/pci.rs   |  26 +-
 .../src/virtio/vhost/user/device/vvu/queue.rs | 270 ++++++++----------
 devices/src/virtio/vhost/user/proxy.rs        |   4 +-
 resources/src/lib.rs                          |   4 +-
 vfio_sys/Cargo.toml                           |   1 +
 vfio_sys/bindgen.sh                           |  23 +-
 vfio_sys/src/vfio.rs                          |  21 ++
 10 files changed, 331 insertions(+), 208 deletions(-)

diff --git a/devices/src/vfio.rs b/devices/src/vfio.rs
index a02925304f..f99d7440b6 100644
--- a/devices/src/vfio.rs
+++ b/devices/src/vfio.rs
@@ -16,18 +16,20 @@ use std::sync::Arc;
 use std::u32;

 use crate::IommuDevType;
+use base::error;
 use base::{
-    ioctl, ioctl_with_mut_ref, ioctl_with_ptr, ioctl_with_ref, ioctl_with_val, warn,
-    AsRawDescriptor, Error, Event, FromRawDescriptor, RawDescriptor, SafeDescriptor,
+    ioctl, ioctl_with_mut_ptr, ioctl_with_mut_ref, ioctl_with_ptr, ioctl_with_ref, ioctl_with_val,
+    warn, AsRawDescriptor, Error, Event, FromRawDescriptor, RawDescriptor, SafeDescriptor,
 };
 use data_model::{vec_with_array_field, DataInit};
 use hypervisor::{DeviceKind, Vm};
 use once_cell::sync::OnceCell;
 use remain::sorted;
+use resources::address_allocator::AddressAllocator;
+use resources::{Alloc, Error as ResourcesError};
 use sync::Mutex;
 use thiserror::Error;
 use vfio_sys::*;
-use vm_memory::GuestMemory;

 #[sorted]
 #[derive(Error, Debug)]
@@ -56,14 +58,20 @@ pub enum VfioError {
     IommuDmaMap(Error),
     #[error("failed to remove guest memory map from iommu table: {0}")]
     IommuDmaUnmap(Error),
+    #[error("failed to get IOMMU cap info from host")]
+    IommuGetCapInfo,
     #[error("failed to get IOMMU info from host: {0}")]
     IommuGetInfo(Error),
     #[error("failed to set KVM vfio device's attribute: {0}")]
     KvmSetDeviceAttr(Error),
+    #[error("AddressAllocator is unavailable")]
+    NoRescAlloc,
     #[error("failed to open /dev/vfio/vfio container: {0}")]
     OpenContainer(io::Error),
     #[error("failed to open /dev/vfio/$group_num group: {0}")]
     OpenGroup(io::Error),
+    #[error("resources error: {0}")]
+    Resources(ResourcesError),
     #[error(
         "vfio API version doesn't match with VFIO_API_VERSION defined in vfio_sys/src/vfio.rs"
     )]
@@ -100,19 +108,24 @@ enum KvmVfioGroupOps {
 #[repr(u32)]
 enum IommuType {
     Type1V2 = VFIO_TYPE1v2_IOMMU,
-    NoIommu = VFIO_NOIOMMU_IOMMU,
 }

 /// VfioContainer contain multi VfioGroup, and delegate an IOMMU domain table
 pub struct VfioContainer {
     container: File,
     groups: HashMap<u32, Arc<Mutex<VfioGroup>>>,
-    host_iommu: bool,
+}
+
+fn extract_vfio_struct<T>(bytes: &[u8], offset: usize) -> T
+where
+    T: DataInit,
+{
+    T::from_reader(&bytes[offset..(offset + mem::size_of::<T>())]).expect("malformed kernel data")
 }

 const VFIO_API_VERSION: u8 = 0;

 impl VfioContainer {
-    fn new_inner(host_iommu: bool) -> Result<Self> {
+    pub fn new() -> Result<Self> {
         let container = OpenOptions::new()
            .read(true)
            .write(true)
@@ -128,20 +141,9 @@ impl VfioContainer {
         Ok(VfioContainer {
             container,
             groups: HashMap::new(),
-            host_iommu,
         })
     }

-    /// Open VfioContainer with IOMMU enabled.
-    pub fn new() -> Result<Self> {
-        Self::new_inner(true /* host_iommu */)
-    }
-
-    /// Open VfioContainer with IOMMU disabled.
-    pub fn new_noiommu() -> Result<Self> {
-        Self::new_inner(false /* host_iommu */)
-    }
-
     // Construct a VfioContainer from an exist container file.
     pub fn new_from_container(container: File) -> Result<Self> {
         // Safe as file is vfio container descriptor and ioctl is defined by kernel.
@@ -153,7 +155,6 @@ impl VfioContainer {
         Ok(VfioContainer {
             container,
             groups: HashMap::new(),
-            host_iommu: true,
         })
     }

@@ -236,7 +237,76 @@ impl VfioContainer {
         Ok(iommu_info.iova_pgsizes)
     }

-    fn init(&mut self, guest_mem: &GuestMemory, iommu_enabled: bool) -> Result<()> {
+    pub fn vfio_iommu_iova_get_iova_ranges(&self) -> Result<Vec<vfio_iova_range>> {
+        // Query the buffer size needed to fetch the capabilities.
+        let mut iommu_info_argsz = vfio_iommu_type1_info {
+            argsz: mem::size_of::<vfio_iommu_type1_info>() as u32,
+            flags: 0,
+            iova_pgsizes: 0,
+            ..Default::default()
+        };
+
+        // Safe as file is vfio container, iommu_info_argsz has valid values,
+        // and we check the return value
+        let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_GET_INFO(), &mut iommu_info_argsz) };
+        if ret != 0 {
+            return Err(VfioError::IommuGetInfo(get_error()));
+        }
+
+        if (iommu_info_argsz.flags & VFIO_IOMMU_INFO_CAPS) == 0 {
+            return Err(VfioError::IommuGetCapInfo);
+        }
+
+        let mut iommu_info = vec_with_array_field::<vfio_iommu_type1_info, u8>(
+            iommu_info_argsz.argsz as usize - mem::size_of::<vfio_iommu_type1_info>(),
+        );
+        iommu_info[0].argsz = iommu_info_argsz.argsz;
+        // Safe as file is vfio container, iommu_info has valid values,
+        // and we check the return value
+        let ret =
+            unsafe { ioctl_with_mut_ptr(self, VFIO_IOMMU_GET_INFO(), iommu_info.as_mut_ptr()) };
+        if ret != 0 {
+            return Err(VfioError::IommuGetInfo(get_error()));
+        }
+
+        // Safe because we initialized iommu_info with enough space, u8 has less strict
+        // alignment, and it will no longer be mutated.
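The two-call pattern above — query `argsz` first, then re-issue VFIO_IOMMU_GET_INFO with a buffer of that size — needs a `Vec` whose first element is the fixed header and whose tail holds the variable-length capability data. A simplified re-derivation of the `vec_with_array_field` idea used for that (helper name and bounds here are illustrative, not the `data_model` implementation):

```rust
use std::mem::size_of;

// Allocate one header element plus enough whole `T` elements to cover
// `payload_bytes` of trailing data, so the buffer keeps `T`'s alignment.
fn vec_with_trailing_bytes<T: Default + Clone>(payload_bytes: usize) -> Vec<T> {
    let extra = (payload_bytes + size_of::<T>() - 1) / size_of::<T>();
    vec![T::default(); 1 + extra]
}

fn main() {
    #[repr(C)]
    #[derive(Default, Clone)]
    struct Header {
        argsz: u32,
        flags: u32,
    }

    // Room for the 8-byte header plus 20 bytes of capability chain.
    let buf: Vec<Header> = vec_with_trailing_bytes(20);
    assert_eq!(buf.len(), 4); // 1 header + ceil(20 / 8) = 3 extra elements
}
```

The kernel then writes past the header into memory that is still part of the same correctly aligned allocation.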
+ let info_bytes = unsafe { + std::slice::from_raw_parts( + iommu_info.as_ptr() as *const u8, + iommu_info_argsz.argsz as usize, + ) + }; + + if (iommu_info[0].flags & VFIO_IOMMU_INFO_CAPS) == 0 { + return Err(VfioError::IommuGetCapInfo); + } + + let mut offset = iommu_info[0].cap_offset as usize; + while offset != 0 { + let header = extract_vfio_struct::(info_bytes, offset); + + if header.id == VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE as u16 && header.version == 1 { + let iova_header = extract_vfio_struct::( + info_bytes, offset, + ); + let range_offset = offset + mem::size_of::(); + let mut ret = Vec::new(); + for i in 0..iova_header.nr_iovas { + ret.push(extract_vfio_struct::( + info_bytes, + range_offset + i as usize * mem::size_of::(), + )); + } + return Ok(ret); + } + offset = header.next as usize; + } + + Err(VfioError::IommuGetCapInfo) + } + + fn init_vfio_iommu(&mut self) -> Result<()> { if !self.check_extension(IommuType::Type1V2) { return Err(VfioError::VfioType1V2); } @@ -245,15 +315,6 @@ impl VfioContainer { return Err(VfioError::ContainerSetIOMMU(get_error())); } - // Add all guest memory regions into vfio container's iommu table, - // then vfio kernel driver could access guest memory from gfn - if !iommu_enabled { - guest_mem.with_regions(|_index, guest_addr, size, host_addr, _mmap, _fd_offset| { - // Safe because the guest regions are guaranteed not to overlap - unsafe { self.vfio_dma_map(guest_addr.0, size as u64, host_addr as u64, true) } - })?; - } - Ok(()) } @@ -266,11 +327,27 @@ impl VfioContainer { match self.groups.get(&id) { Some(group) => Ok(group.clone()), None => { - let group = Arc::new(Mutex::new(VfioGroup::new(self, self.host_iommu, id)?)); + let group = Arc::new(Mutex::new(VfioGroup::new(self, id)?)); if self.groups.is_empty() { - // Before the first group is added into container, do once cotainer - // initialize for a vm - self.init(vm.get_memory(), iommu_enabled)?; + // Before the first group is added into container, do once per + // container initialization. + self.init_vfio_iommu()?; + + if !iommu_enabled { + vm.get_memory().with_regions( + |_index, guest_addr, size, host_addr, _mmap, _fd_offset| { + // Safe because the guest regions are guaranteed not to overlap + unsafe { + self.vfio_dma_map( + guest_addr.0, + size as u64, + host_addr as u64, + true, + ) + } + }, + )?; + } } let kvm_vfio_file = KVM_VFIO_FILE @@ -291,12 +368,12 @@ impl VfioContainer { match self.groups.get(&id) { Some(group) => Ok(group.clone()), None => { - let group = Arc::new(Mutex::new(VfioGroup::new(self, self.host_iommu, id)?)); + let group = Arc::new(Mutex::new(VfioGroup::new(self, id)?)); - if self.groups.is_empty() && !self.host_iommu { - if self.set_iommu(IommuType::NoIommu) < 0 { - return Err(VfioError::ContainerSetIOMMU(get_error())); - } + if self.groups.is_empty() { + // Before the first group is added into container, do once per + // container initialization. 
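The loop above follows the capability chain: a linked list of headers embedded in the info buffer, where `next` is a byte offset from the start of the struct and 0 terminates the list. A freestanding sketch of that walk (the header layout mirrors the kernel's `vfio_info_cap_header`; the buffer contents are made up for illustration):

```rust
use std::mem::size_of;

#[repr(C)]
#[derive(Clone, Copy, Debug, Default)]
struct CapHeader {
    id: u16,
    version: u16,
    next: u32, // byte offset of the next capability; 0 terminates the chain
}

fn read_cap_header(bytes: &[u8], offset: usize) -> CapHeader {
    let mut hdr = CapHeader::default();
    let src = &bytes[offset..offset + size_of::<CapHeader>()];
    // Safe: CapHeader is plain old data and `dst` covers exactly its bytes.
    let dst = unsafe {
        std::slice::from_raw_parts_mut(&mut hdr as *mut CapHeader as *mut u8, size_of::<CapHeader>())
    };
    dst.copy_from_slice(src);
    hdr
}

fn find_cap(bytes: &[u8], first_offset: usize, wanted_id: u16) -> Option<usize> {
    let mut offset = first_offset;
    while offset != 0 {
        let hdr = read_cap_header(bytes, offset);
        if hdr.id == wanted_id {
            return Some(offset);
        }
        offset = hdr.next as usize;
    }
    None
}

fn main() {
    // Two chained capabilities at offsets 8 and 16: id 3 -> id 1 -> end.
    let mut buf = vec![0u8; 24];
    buf[8..10].copy_from_slice(&3u16.to_ne_bytes()); // first cap id
    buf[12..16].copy_from_slice(&16u32.to_ne_bytes()); // next offset
    buf[16..18].copy_from_slice(&1u16.to_ne_bytes()); // second cap id
    assert_eq!(find_cap(&buf, 8, 1), Some(16));
    assert_eq!(find_cap(&buf, 8, 9), None);
}
```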
+ self.init_vfio_iommu()?; } self.groups.insert(id, group.clone()); @@ -352,12 +429,8 @@ struct VfioGroup { } impl VfioGroup { - fn new(container: &VfioContainer, host_iommu: bool, id: u32) -> Result { - let group_path = if host_iommu { - format!("/dev/vfio/{}", id) - } else { - format!("/dev/vfio/noiommu-{}", id) - }; + fn new(container: &VfioContainer, id: u32) -> Result { + let group_path = format!("/dev/vfio/{}", id); let group_file = OpenOptions::new() .read(true) .write(true) @@ -628,6 +701,8 @@ pub struct VfioDevice { group_id: u32, // vec for vfio device's regions regions: Vec, + + iova_alloc: Option>>, } impl VfioDevice { @@ -663,6 +738,7 @@ impl VfioDevice { group_descriptor, group_id, regions, + iova_alloc: None, }) } @@ -696,6 +772,14 @@ impl VfioDevice { group.lock().add_device_num(); let group_descriptor = group.lock().as_raw_descriptor(); + let iova_ranges = container + .lock() + .vfio_iommu_iova_get_iova_ranges()? + .into_iter() + .map(|r| std::ops::RangeInclusive::new(r.start, r.end)); + let iova_alloc = AddressAllocator::new_from_list(iova_ranges, None, None) + .map_err(VfioError::Resources)?; + Ok(VfioDevice { dev, name, @@ -703,6 +787,7 @@ impl VfioDevice { group_descriptor, group_id, regions, + iova_alloc: Some(Arc::new(Mutex::new(iova_alloc))), }) } @@ -1262,6 +1347,20 @@ impl VfioDevice { self.container.lock().vfio_dma_unmap(iova, size) } + pub fn vfio_get_iommu_page_size_mask(&self) -> Result { + self.container.lock().vfio_get_iommu_page_size_mask() + } + + pub fn alloc_iova(&self, size: u64, align_size: u64, alloc: Alloc) -> Result { + match &self.iova_alloc { + None => Err(VfioError::NoRescAlloc), + Some(iova_alloc) => iova_alloc + .lock() + .allocate_with_align(size, alloc, "alloc_iova".to_owned(), align_size) + .map_err(VfioError::Resources), + } + } + /// Gets the vfio device backing `File`. pub fn device_file(&self) -> &File { &self.dev diff --git a/devices/src/virtio/mod.rs b/devices/src/virtio/mod.rs index 6ff03f0c82..8ec8bfb491 100644 --- a/devices/src/virtio/mod.rs +++ b/devices/src/virtio/mod.rs @@ -102,7 +102,7 @@ const TYPE_TPM: u32 = MAX_VIRTIO_DEVICE_ID - 1; const TYPE_VHOST_USER: u32 = MAX_VIRTIO_DEVICE_ID - 2; pub const VIRTIO_F_VERSION_1: u32 = 32; -const VIRTIO_F_ACCESS_PLATFORM: u32 = 33; +pub const VIRTIO_F_ACCESS_PLATFORM: u32 = 33; const INTERRUPT_STATUS_USED_RING: u32 = 0x1; const INTERRUPT_STATUS_CONFIG_CHANGED: u32 = 0x2; diff --git a/devices/src/virtio/vhost/user/device/vvu/bus.rs b/devices/src/virtio/vhost/user/device/vvu/bus.rs index fdf4d53ee4..3060ef9ebb 100644 --- a/devices/src/virtio/vhost/user/device/vvu/bus.rs +++ b/devices/src/virtio/vhost/user/device/vvu/bus.rs @@ -41,8 +41,7 @@ pub fn open_vfio_device(pci_address: PciAddress) -> Result { .context("failed to clear driver_override")?; let vfio_path = format!("/sys/bus/pci/devices/{}", &addr_str); - // TODO(b/202151642): Use `VfioContainer::new()` once virtio-iommu for VFIO is implemented. 
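The `alloc_iova` method introduced above hands allocation policy to a `resources::AddressAllocator` seeded with the IOVA ranges the IOMMU reports. A toy first-fit allocator sketching the same bookkeeping, plus the page-size-mask test that `VvuPciDevice::new` performs later in this patch (both are illustrative stand-ins, not crosvm's implementations):

```rust
use std::ops::RangeInclusive;

// First-fit allocation over the usable IOVA windows. For simplicity this
// drops the unaligned head fragment of a window instead of tracking it.
struct ToyIovaAlloc {
    free: Vec<RangeInclusive<u64>>,
}

impl ToyIovaAlloc {
    fn new(ranges: impl IntoIterator<Item = RangeInclusive<u64>>) -> Self {
        ToyIovaAlloc {
            free: ranges.into_iter().collect(),
        }
    }

    /// Returns the start of a `size`-byte region aligned to `align`
    /// (a power of two), or None if no window fits.
    fn alloc(&mut self, size: u64, align: u64) -> Option<u64> {
        debug_assert!(align.is_power_of_two());
        if size == 0 {
            return None;
        }
        for range in self.free.iter_mut() {
            let (start, end) = (*range.start(), *range.end());
            let aligned = (start + align - 1) & !(align - 1);
            if aligned.checked_add(size - 1)? <= end {
                *range = (aligned + size)..=end;
                return Some(aligned);
            }
        }
        None
    }
}

// The IOMMU advertises mappable page sizes as a bitmask (bit N set means
// 1 << N byte pages work), so supporting the host page size is one AND away.
fn host_page_size_supported(iommu_pgsize_mask: u64, host_page_size: u64) -> bool {
    iommu_pgsize_mask & host_page_size != 0
}

fn main() {
    let mut alloc = ToyIovaAlloc::new([0x1000..=0x7fff, 0x10_0000..=0x1f_ffff]);
    assert_eq!(alloc.alloc(0x2000, 0x1000), Some(0x1000));
    // 0x6000 bytes no longer fit in the first window, so the second is used.
    assert_eq!(alloc.alloc(0x6000, 0x1000), Some(0x10_0000));

    // Typical x86 IOMMU: 4 KiB, 2 MiB, and 1 GiB pages.
    let mask = (1u64 << 12) | (1 << 21) | (1 << 30);
    assert!(host_page_size_supported(mask, 4096));
    assert!(!host_page_size_supported(mask, 65536));
}
```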
- let vfio_container = Arc::new(Mutex::new(VfioContainer::new_noiommu()?)); + let vfio_container = Arc::new(Mutex::new(VfioContainer::new()?)); let vfio = VfioDevice::new(&vfio_path, vfio_container) .map_err(|e| anyhow!("failed to create VFIO device: {}", e))?; Ok(vfio) diff --git a/devices/src/virtio/vhost/user/device/vvu/pci.rs b/devices/src/virtio/vhost/user/device/vvu/pci.rs index 81e95d1835..0b2903cf58 100644 --- a/devices/src/virtio/vhost/user/device/vvu/pci.rs +++ b/devices/src/virtio/vhost/user/device/vvu/pci.rs @@ -11,6 +11,7 @@ use anyhow::{anyhow, bail, Context, Result}; use base::{info, Event}; use data_model::DataInit; use memoffset::offset_of; +use resources::Alloc; use vfio_sys::*; use virtio_sys::vhost::VIRTIO_F_VERSION_1; @@ -18,7 +19,7 @@ use crate::pci::{MsixCap, PciAddress, PciCapabilityID, CAPABILITY_LIST_HEAD_OFFS use crate::vfio::{VfioDevice, VfioPciConfig, VfioRegionAddr}; use crate::virtio::vhost::user::device::vvu::{ bus::open_vfio_device, - queue::{DescTableAddrs, UserQueue}, + queue::{DescTableAddrs, IovaAllocator, UserQueue}, }; use crate::virtio::{PciCapabilityType, VirtioPciCap}; @@ -252,6 +253,13 @@ impl VvuPciDevice { .check_device_info() .context("failed to check VFIO device information")?; + let page_mask = vfio_dev + .vfio_get_iommu_page_size_mask() + .context("failed to get iommu page size mask")?; + if page_mask & (base::pagesize() as u64) == 0 { + bail!("Unsupported iommu page mask {:x}", page_mask); + } + let mut pci_dev = Self { vfio_dev, caps, @@ -309,7 +317,7 @@ impl VvuPciDevice { QueueType::Rx => true, QueueType::Tx => false, }; - let queue = UserQueue::new(queue_size, device_writable)?; + let queue = UserQueue::new(queue_size, device_writable, typ as u8, self)?; let DescTableAddrs { desc, avail, used } = queue.desc_table_addrs()?; let desc_lo = (desc & 0xffffffff) as u32; @@ -491,3 +499,17 @@ impl VvuPciDevice { Ok(()) } } + +impl IovaAllocator for VvuPciDevice { + fn alloc_iova(&self, size: u64, tag: u8) -> Result { + self.vfio_dev + .alloc_iova(size, base::pagesize() as u64, Alloc::VvuQueue(tag)) + .context("failed to find an iova region to map the gpa region to") + } + + unsafe fn map_iova(&self, iova: u64, size: u64, addr: *const u8) -> Result<()> { + self.vfio_dev + .vfio_dma_map(iova, size, addr as u64, true) + .context("failed to map iova") + } +} diff --git a/devices/src/virtio/vhost/user/device/vvu/queue.rs b/devices/src/virtio/vhost/user/device/vvu/queue.rs index 2891a415bf..d05290776e 100644 --- a/devices/src/virtio/vhost/user/device/vvu/queue.rs +++ b/devices/src/virtio/vhost/user/device/vvu/queue.rs @@ -8,13 +8,11 @@ use std::mem; use std::num::Wrapping; use std::sync::atomic::{fence, Ordering}; -#[cfg(not(test))] -use std::{collections::BTreeMap, fs::File}; use anyhow::{anyhow, bail, Context, Result}; use data_model::{DataInit, Le16, Le32, Le64, VolatileSlice}; use virtio_sys::vhost::VRING_DESC_F_WRITE; -use vm_memory::{GuestAddress, GuestMemory}; +use vm_memory::{GuestAddress as IOVA, GuestMemory as QueueMemory}; use crate::virtio::Desc; @@ -36,19 +34,17 @@ pub struct DescTableAddrs { } struct MemLayout { - /// Address of the descriptor table. - /// Since the vvu driver runs in the guest user space, `GuestAddress` here stores the guest - /// virtual address. - desc_table: GuestAddress, + /// Address of the descriptor table in UserQueue.mem. + desc_table: IOVA, - /// Virtual address of the available ring - avail_ring: GuestAddress, + /// Address of the available ring in UserQueue.mem. 
+ avail_ring: IOVA, - /// Virtual address of the used ring - used_ring: GuestAddress, + /// Address of the used ring in UserQueue.mem. + used_ring: IOVA, - /// Virtual address of the start of buffers. - buffer_addr: GuestAddress, + /// Address of the start of buffers in UserQueue.mem. + buffer_addr: IOVA, } /// Represents a virtqueue that is allocated in the guest userspace and manipulated from a VFIO @@ -59,8 +55,12 @@ struct MemLayout { /// /// # Memory Layout /// -/// `mem` is a continuous memory allocated in the guest userspace and used to have a virtqueue. -/// Its layout is defined in the following table and stored in `mem_layout`. +/// `mem` is the memory allocated in the guest userspace for the virtqueue, which is mapped into +/// the vvu device via VFIO. The GuestAddresses of `mem` are the IOVAs that should be used when +/// communicating with the vvu device. All accesses to the shared memory from the device backend +/// must be done through the GuestMemory read/write functions. +/// +/// The layout `mem` is defined in the following table and stored in `mem_layout`. /// /// | | Alignment | Size | /// |-----------------------------------------------------------------| @@ -81,7 +81,7 @@ pub struct UserQueue { size: Wrapping, /// The underlying memory. - mem: GuestMemory, + mem: QueueMemory, /// Virtqueue layout on `mem`. mem_layout: MemLayout, @@ -100,21 +100,30 @@ pub struct UserQueue { /// one virtqueue. Also, it's better to use `crate::virtio::DescriptorChain` for descirptors as /// a part of b/215153367. device_writable: bool, +} - /// Mapping from a virtual address to the physical address. - /// This mapping is initialized by reading `/proc/self/pagemap`. - /// TODO(b/215310597): This workaround won't work if memory mapping is changed. Currently, we - /// are assuming that memory mapping is fixed during the vvu negotiation. - /// Once virtio-iommu supports VFIO usage, we can remove this workaround and we should use - /// VFIO_IOMMU_MAP_DMA call to get physical addresses. - #[cfg(not(test))] - addr_table: BTreeMap, +/// Interface used by UserQueue to interact with the IOMMU. +pub trait IovaAllocator { + /// Allocates an IO virtual address region of the requested size. + fn alloc_iova(&self, size: u64, tag: u8) -> Result; + /// Maps the given address at the given IOVA. + /// + /// # Safety + /// + /// `addr` must reference a region of at least length `size`. Memory passed + /// to this function may be mutated at any time, so `addr` must not be memory + /// that is directly managed by rust. + unsafe fn map_iova(&self, iova: u64, size: u64, addr: *const u8) -> Result<()>; } impl UserQueue { /// Creats a `UserQueue` instance. - pub fn new(queue_size: u16, device_writable: bool) -> Result { - let (mem, size, mem_layout) = Self::init_memory(queue_size)?; + pub fn new(queue_size: u16, device_writable: bool, tag: u8, iova_alloc: &I) -> Result + where + I: IovaAllocator, + { + let (mem, size, mem_layout) = Self::init_memory(queue_size, tag, iova_alloc)?; + let mut queue = Self { mem, size: Wrapping(size), @@ -123,8 +132,6 @@ impl UserQueue { used_count: Wrapping(0), free_count: Wrapping(size), device_writable, - #[cfg(not(test))] - addr_table: Default::default(), }; queue.init_descriptor_table()?; @@ -133,7 +140,14 @@ impl UserQueue { } /// Allocates memory region and returns addresses on the regions for (`desc_table`, `avail_ring`, `used_ring`, `buffer``). 
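The `IovaAllocator` trait above decouples `UserQueue` from VFIO so the queue can be exercised without hardware, which the patch's own tests do with `SimpleIovaAllocator`. A hypothetical bump-pointer implementation honoring the same contract (the trait signatures are copied from this diff; `BumpIovaAllocator` and its window are invented for illustration):

```rust
use anyhow::{bail, Result};
use std::sync::atomic::{AtomicU64, Ordering};

pub trait IovaAllocator {
    fn alloc_iova(&self, size: u64, tag: u8) -> Result<u64>;
    unsafe fn map_iova(&self, iova: u64, size: u64, addr: *const u8) -> Result<()>;
}

/// Hands out IOVAs from a fixed window; never reclaims them.
struct BumpIovaAllocator {
    next: AtomicU64,
    end: u64,
}

impl IovaAllocator for BumpIovaAllocator {
    fn alloc_iova(&self, size: u64, _tag: u8) -> Result<u64> {
        let iova = self.next.fetch_add(size, Ordering::SeqCst);
        if iova.checked_add(size).map_or(true, |e| e > self.end) {
            bail!("iova window exhausted");
        }
        Ok(iova)
    }

    unsafe fn map_iova(&self, _iova: u64, _size: u64, _addr: *const u8) -> Result<()> {
        // A real implementation would issue VFIO_IOMMU_MAP_DMA here.
        Ok(())
    }
}

fn main() -> Result<()> {
    let alloc = BumpIovaAllocator {
        next: AtomicU64::new(0x1000),
        end: 0x1_0000,
    };
    assert_eq!(alloc.alloc_iova(0x2000, 0)?, 0x1000);
    assert_eq!(alloc.alloc_iova(0x2000, 0)?, 0x3000);
    Ok(())
}
```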
- fn init_memory(max_queue_size: u16) -> Result<(GuestMemory, u16, MemLayout)> { + fn init_memory( + max_queue_size: u16, + tag: u8, + iova_alloc: &I, + ) -> Result<(QueueMemory, u16, MemLayout)> + where + I: IovaAllocator, + { // Since vhost-user negotiation finishes within ~20 messages, queue size 32 is enough. const MAX_QUEUE_SIZE: u16 = 256; @@ -149,51 +163,44 @@ impl UserQueue { ((n + m - 1) / m) * m } - let desc_table = GuestAddress(0); + let desc_table = IOVA(0); let desc_size = 16u64 * u64::from(queue_size); let desc_end = desc_table.0 + desc_size; - let avail_ring = GuestAddress(align(desc_end, 2)); + let avail_ring = IOVA(align(desc_end, 2)); let avail_size = 6 + 2 * u64::from(queue_size); let avail_end = avail_ring.0 + avail_size; - let used_ring = GuestAddress(align(avail_end, 4)); + let used_ring = IOVA(align(avail_end, 4)); let used_size = 6 + 8 * u64::from(queue_size); let used_end = used_ring.0 + used_size; - let buffer_addr = GuestAddress(align(used_end, BUF_SIZE)); + let buffer_addr = IOVA(align(used_end, BUF_SIZE)); let buffer_size = BUF_SIZE * u64::from(queue_size); let mem_size = align(buffer_addr.0 + buffer_size, base::pagesize() as u64); + let iova_start = iova_alloc + .alloc_iova(mem_size, tag) + .context("failed to allocate queue iova")?; - let mem = GuestMemory::new(&[(desc_table, mem_size)]) - .map_err(|e| anyhow!("failed to create GuestMemory for virtqueue: {}", e))?; + let mem = QueueMemory::new(&[(IOVA(iova_start), mem_size)]) + .map_err(|e| anyhow!("failed to create QueueMemory for virtqueue: {}", e))?; - // Call `mlock()` to guarantees that pages will stay in RAM. - // Note that this can't ensure that physical address mapping is consistent. - // TODO(b/215310597) We're assume that the kernel won't swap these memory region at least - // during the vvu negotiation. Although this assumption is risky, it'll be resolved once - // virtio-iommu for virtio devices is supported. - mem.with_regions(|_, _, size, ptr, _, _| { - let ret = unsafe { libc::mlock(ptr as *const libc::c_void, size) }; - if ret == -1 { - bail!("failed to mlock(): {}", base::Error::last()); - } - Ok(()) - })?; - - // To ensure the GuestMemory is mapped to physical memory, read the entire buffer first. - // Otherwise, reading `/proc/self/pagemap` returns invalid values. - // TODO(b/215310597): Once we use iommu for VFIO, we can probably remove this workaround. - let mut buf = vec![0; mem_size as usize]; - mem.read_at_addr(&mut buf, desc_table) - .map_err(|e| anyhow!("failed to read_slice: {}", e))?; + let host_addr = mem + .get_host_address_range(IOVA(iova_start), mem_size as usize) + .context("failed to get host address")?; + // Safe because the region being mapped is managed via the GuestMemory interface. + unsafe { + iova_alloc + .map_iova(iova_start, mem_size, host_addr) + .context("failed to map queue")?; + } let mem_layout = MemLayout { - desc_table, - avail_ring, - used_ring, - buffer_addr, + desc_table: desc_table.unchecked_add(iova_start), + avail_ring: avail_ring.unchecked_add(iova_start), + used_ring: used_ring.unchecked_add(iova_start), + buffer_addr: buffer_addr.unchecked_add(iova_start), }; Ok((mem, queue_size, mem_layout)) @@ -201,8 +208,6 @@ impl UserQueue { /// Initialize the descriptor table. fn init_descriptor_table(&mut self) -> Result<()> { - self.init_addr_table()?; - let flags = if self.device_writable { Le16::from(VRING_DESC_F_WRITE as u16) } else { @@ -214,9 +219,9 @@ impl UserQueue { // Register pre-allocated buffers to the descriptor area. 
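The layout computed by `init_memory` above follows the virtio 1.x sizing rules: 16 bytes per descriptor, 6 + 2·N bytes for the available ring, 6 + 8·N bytes for the used ring, with the whole region rounded up to a page. Working the arithmetic for a 256-entry queue (BUF_SIZE is assumed to be 1024 here purely for illustration; the patch defines its own constant):

```rust
const BUF_SIZE: u64 = 1024; // assumed per-descriptor buffer size

fn align(n: u64, m: u64) -> u64 {
    ((n + m - 1) / m) * m
}

fn main() {
    let queue_size = 256u64;
    let desc_table = 0u64;
    let desc_end = desc_table + 16 * queue_size;
    let avail_ring = align(desc_end, 2);
    let avail_end = avail_ring + 6 + 2 * queue_size;
    let used_ring = align(avail_end, 4);
    let used_end = used_ring + 6 + 8 * queue_size;
    let buffer_addr = align(used_end, BUF_SIZE);
    let mem_size = align(buffer_addr + BUF_SIZE * queue_size, 4096);
    println!(
        "desc@{:#x} avail@{:#x} used@{:#x} buffers@{:#x} total {:#x}",
        desc_table, avail_ring, used_ring, buffer_addr, mem_size
    );
}
```

The whole region is then allocated at a single IOVA returned by `alloc_iova`, so every structure's device-visible address is just its offset plus that base.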
for i in 0..self.size.0 { let idx = Wrapping(i); - let addr = Le64::from(self.to_phys_addr(&self.buffer_guest_addr(idx)?)?); + let iova = self.buffer_address(idx)?.offset(); let desc = Desc { - addr, + addr: iova.into(), len, flags, next, @@ -247,91 +252,16 @@ impl UserQueue { Ok(()) } - #[cfg(not(test))] - /// Reads `/proc/self/pagemap` and stores mapping from virtual addresses for virtqueue - /// information and buffers to physical addresses. - fn init_addr_table(&mut self) -> Result<()> { - let pagemap = File::open("/proc/self/pagemap").context("failed to open pagemap")?; - self.register_addr(&pagemap, &self.mem_layout.desc_table.clone())?; - self.register_addr(&pagemap, &self.mem_layout.avail_ring.clone())?; - self.register_addr(&pagemap, &self.mem_layout.used_ring.clone())?; - self.register_addr(&pagemap, &self.mem_layout.buffer_addr.clone())?; - // Register addresses of buffers. - for i in 0..self.size.0 { - self.register_addr(&pagemap, &self.buffer_guest_addr(Wrapping(i))?)?; - } - Ok(()) - } - - #[cfg(test)] - fn init_addr_table(&mut self) -> Result<()> { - Ok(()) - } - - /// Registers an address mapping for the given virtual address to `self.addr_table`. - // TODO(b/215310597): This function reads `/proc/self/pagemap`, which requires root - // privileges. Instead, we should use VFIO_IOMMU_MAP_DMA call with virtio-iommu to get - // physical addresses. - #[cfg(not(test))] - fn register_addr(&mut self, pagemap_file: &File, addr: &GuestAddress) -> Result { - use std::os::unix::fs::FileExt; - - let vaddr = self - .mem - .get_slice_at_addr(*addr, 1) - .context("failed to get slice")? - .as_ptr() as u64; - - let page_size = base::pagesize() as u64; - let virt_page_number = vaddr / page_size; - let offset = std::mem::size_of::() as u64 * virt_page_number; - - let mut buf = [0u8; 8]; - pagemap_file - .read_exact_at(&mut buf, offset) - .context("failed to read pagemap")?; - - let pagemap = u64::from_le_bytes(buf); - // Bit 55 is soft-dirty. - if (pagemap & (1u64 << 55)) != 0 { - bail!("page table entry is soft-dirty") - } - // page frame numbers are bits 0-54 - let page = pagemap & 0x7f_ffff_ffff_ffffu64; - if page == 0 { - bail!("failed to get page frame number: page={:x}", page); - } - - let paddr = page * page_size + (vaddr % page_size); - self.addr_table.insert(*addr, paddr); - Ok(paddr) - } - - /// Translate a virtual address to the physical address. - #[cfg(not(test))] - fn to_phys_addr(&self, addr: &GuestAddress) -> Result { - self.addr_table - .get(addr) - .context(anyhow!("addr {} not found", addr)) - .map(|v| *v) - } - - #[cfg(test)] - fn to_phys_addr(&self, addr: &GuestAddress) -> Result { - Ok(addr.0) - } - - /// Returns physical addresses of the descriptor table, the avail ring and the used ring. pub fn desc_table_addrs(&self) -> Result { - let desc = self.to_phys_addr(&self.mem_layout.desc_table)?; - let avail = self.to_phys_addr(&self.mem_layout.avail_ring)?; - let used = self.to_phys_addr(&self.mem_layout.used_ring)?; - - Ok(DescTableAddrs { desc, avail, used }) + Ok(DescTableAddrs { + desc: self.mem_layout.desc_table.offset(), + avail: self.mem_layout.avail_ring.offset(), + used: self.mem_layout.used_ring.offset(), + }) } - /// Returns a virtual address of the buffer for the given `index`. - fn buffer_guest_addr(&self, index: Wrapping) -> Result { + /// Returns the IOVA of the buffer for the given `index`. 
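Because the vvu virtqueue owns its buffers, the loop above can fill every descriptor once, up front, with a fixed IOVA and length; only the avail/used indices move afterward. A freestanding sketch of that pre-registration (a plain `Vec` stands in for the VFIO-mapped memory, and native-endian fields replace `Le64`/`Le32`):

```rust
const VRING_DESC_F_WRITE: u16 = 0x2;

// Mirrors the virtio descriptor layout.
#[repr(C)]
#[derive(Clone, Copy, Default, Debug)]
struct Desc {
    addr: u64, // IOVA of the buffer
    len: u32,
    flags: u16,
    next: u16,
}

fn prefill(queue_size: u16, buffers_iova: u64, buf_size: u64, device_writable: bool) -> Vec<Desc> {
    let flags = if device_writable { VRING_DESC_F_WRITE } else { 0 };
    (0..queue_size)
        .map(|i| Desc {
            addr: buffers_iova + u64::from(i) * buf_size,
            len: buf_size as u32,
            flags,
            next: 0,
        })
        .collect()
}

fn main() {
    let table = prefill(4, 0x8000, 1024, true);
    assert_eq!(table[3].addr, 0x8000 + 3 * 1024);
    assert_eq!(table[3].flags, VRING_DESC_F_WRITE);
}
```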
+    fn buffer_address(&self, index: Wrapping<u16>) -> Result<IOVA> {
         let offset = u64::from((index % self.size).0) * BUF_SIZE;
         self.mem_layout
             .buffer_addr
@@ -341,7 +271,10 @@ impl UserQueue {

     /// Writes the given descriptor table entry.
     fn write_desc_entry(&self, index: Wrapping<u16>, desc: Desc) -> Result<()> {
-        let addr = GuestAddress(u64::from((index % self.size).0) * mem::size_of::<Desc>() as u64);
+        let addr = self
+            .mem_layout
+            .desc_table
+            .unchecked_add(u64::from((index % self.size).0) * mem::size_of::<Desc>() as u64);
         fence(Ordering::SeqCst);
         self.mem
             .write_obj_at_addr(desc, addr)
@@ -402,7 +335,7 @@ impl UserQueue {
         let id = Wrapping(u32::from(elem.id) as u16);
         let len = u32::from(elem.len) as usize;

-        let addr = self.buffer_guest_addr(id)?;
+        let addr = self.buffer_address(id)?;

         fence(Ordering::SeqCst);
         let s = self
@@ -419,7 +352,7 @@ impl UserQueue {
     /// Writes data into virtqueue's buffer and returns its address.
     ///
     /// TODO: Use `descriptor_utils::Writer`.
-    fn write_to_buffer(&self, index: Wrapping<u16>, data: &[u8]) -> Result<GuestAddress> {
+    fn write_to_buffer(&self, index: Wrapping<u16>, data: &[u8]) -> Result<IOVA> {
         if data.len() as u64 > BUF_SIZE {
             bail!(
                 "data size {} is larger than the buffer size {}",
@@ -428,7 +361,7 @@ impl UserQueue {
             );
         }

-        let addr = self.buffer_guest_addr(index)?;
+        let addr = self.buffer_address(index)?;

         fence(Ordering::SeqCst);
         let written = self
             .mem
@@ -473,7 +406,7 @@ impl UserQueue {
             .context("failed to write data to virtqueue")?;

         let desc = Desc {
-            addr: Le64::from(self.to_phys_addr(&addr)?),
+            addr: Le64::from(addr.offset()),
             len: Le32::from(data.len() as u32),
             flags: Le16::from(0),
             next: Le16::from(0),
@@ -492,19 +425,40 @@ mod test {
     use super::*;

+    use std::cell::RefCell;
     use std::io::Read;
     use std::io::Write;

     use crate::virtio::{Queue as DeviceQueue, Reader, Writer};

+    // An allocator that just allocates 0 as an IOVA.
+    struct SimpleIovaAllocator(RefCell<bool>);
+
+    impl IovaAllocator for SimpleIovaAllocator {
+        fn alloc_iova(&self, _size: u64, _tag: u8) -> Result<u64> {
+            if *self.0.borrow() {
+                bail!("exhausted");
+            }
+            *self.0.borrow_mut() = true;
+            Ok(0)
+        }
+
+        unsafe fn map_iova(&self, _iova: u64, _size: u64, _addr: *const u8) -> Result<()> {
+            if !*self.0.borrow() {
+                bail!("not allocated");
+            }
+            Ok(())
+        }
+    }
+
     fn setup_vq(queue: &mut DeviceQueue, addrs: DescTableAddrs) {
-        queue.desc_table = GuestAddress(addrs.desc);
-        queue.avail_ring = GuestAddress(addrs.avail);
-        queue.used_ring = GuestAddress(addrs.used);
+        queue.desc_table = IOVA(addrs.desc);
+        queue.avail_ring = IOVA(addrs.avail);
+        queue.used_ring = IOVA(addrs.used);

         queue.ready = true;
     }

-    fn device_write(mem: &GuestMemory, q: &mut DeviceQueue, data: &[u8]) -> usize {
+    fn device_write(mem: &QueueMemory, q: &mut DeviceQueue, data: &[u8]) -> usize {
         let desc_chain = q.pop(mem).unwrap();
         let index = desc_chain.index;

@@ -514,7 +468,7 @@ mod test {
         written
     }

-    fn device_read(mem: &GuestMemory, q: &mut DeviceQueue, len: usize) -> Vec<u8> {
+    fn device_read(mem: &QueueMemory, q: &mut DeviceQueue, len: usize) -> Vec<u8> {
         let desc_chain = q.pop(mem).unwrap();
         let desc_index = desc_chain.index;
         let mut reader = Reader::new(mem.clone(), desc_chain).unwrap();
@@ -538,7 +492,9 @@ mod test {

     // Send an array from the driver to the device `count` times.
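`buffer_address` above maps a free-running ring index to a fixed buffer slot: the `Wrapping<u16>` indices may overflow, and the modulo by the queue size (virtio queue sizes are powers of two) selects the slot. The arithmetic in isolation:

```rust
use std::num::Wrapping;

// Sketch of the ring-index to buffer-IOVA mapping used by buffer_address().
fn buffer_iova(buffer_base: u64, index: Wrapping<u16>, queue_size: Wrapping<u16>, buf_size: u64) -> u64 {
    let slot = (index % queue_size).0;
    buffer_base + u64::from(slot) * buf_size
}

fn main() {
    let size = Wrapping(8u16);
    // Index 65535 wraps around; its slot is 65535 % 8 = 7.
    assert_eq!(
        buffer_iova(0x10000, Wrapping(u16::MAX), size, 1024),
        0x10000 + 7 * 1024
    );
}
```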
fn drv_to_dev(queue_size: u16, count: u32) { - let mut drv_queue = UserQueue::new(queue_size, false /* device_writable */).unwrap(); + let iova_alloc = SimpleIovaAllocator(RefCell::new(false)); + let mut drv_queue = + UserQueue::new(queue_size, false /* device_writable */, 0, &iova_alloc).unwrap(); let mut dev_queue = DeviceQueue::new(queue_size); setup_vq(&mut dev_queue, drv_queue.desc_table_addrs().unwrap()); @@ -582,7 +538,9 @@ mod test { // Send an array from the device to the driver `count` times. fn dev_to_drv(queue_size: u16, count: u32) { - let mut drv_queue = UserQueue::new(queue_size, true /* device_writable */).unwrap(); + let iova_alloc = SimpleIovaAllocator(RefCell::new(false)); + let mut drv_queue = + UserQueue::new(queue_size, true /* device_writable */, 0, &iova_alloc).unwrap(); let mut dev_queue = DeviceQueue::new(queue_size); setup_vq(&mut dev_queue, drv_queue.desc_table_addrs().unwrap()); diff --git a/devices/src/virtio/vhost/user/proxy.rs b/devices/src/virtio/vhost/user/proxy.rs index 022ab996ec..1923a31b83 100644 --- a/devices/src/virtio/vhost/user/proxy.rs +++ b/devices/src/virtio/vhost/user/proxy.rs @@ -46,7 +46,7 @@ use crate::{ PciBarConfiguration, PciBarIndex, PciBarPrefetchable, PciBarRegionType, PciCapability, PciCapabilityID, }, - virtio::VIRTIO_MSI_NO_VECTOR, + virtio::{VIRTIO_F_ACCESS_PLATFORM, VIRTIO_MSI_NO_VECTOR}, }; use remain::sorted; @@ -997,7 +997,7 @@ impl VirtioVhostUser { uuid: Option, ) -> Result { Ok(VirtioVhostUser { - base_features, + base_features: base_features | 1 << VIRTIO_F_ACCESS_PLATFORM, listener: Some(listener), config: VirtioVhostUserConfig { status: Le32::from(0), diff --git a/resources/src/lib.rs b/resources/src/lib.rs index df65a1327d..574f4e75e5 100644 --- a/resources/src/lib.rs +++ b/resources/src/lib.rs @@ -12,7 +12,7 @@ use thiserror::Error; pub use crate::system_allocator::{MemRegion, MmioType, SystemAllocator, SystemAllocatorConfig}; -mod address_allocator; +pub mod address_allocator; mod system_allocator; /// Used to tag SystemAllocator allocations. @@ -36,6 +36,8 @@ pub enum Alloc { PciBridgePrefetchWindow { bus: u8, dev: u8, func: u8 }, /// File-backed memory mapping. FileBacked(u64), + /// virtio vhost user queue with queue id + VvuQueue(u8), } #[sorted] diff --git a/vfio_sys/Cargo.toml b/vfio_sys/Cargo.toml index 866eeb7b5c..8890f55d06 100644 --- a/vfio_sys/Cargo.toml +++ b/vfio_sys/Cargo.toml @@ -6,3 +6,4 @@ edition = "2021" [dependencies] base = { path = "../base" } +data_model = { path = "../common/data_model" } diff --git a/vfio_sys/bindgen.sh b/vfio_sys/bindgen.sh index b44022009b..12333801e3 100755 --- a/vfio_sys/bindgen.sh +++ b/vfio_sys/bindgen.sh @@ -17,12 +17,33 @@ fix_vfio_type() { } VFIO_EXTRA="// Added by vfio_sys/bindgen.sh +use data_model::DataInit; + #[repr(C)] #[derive(Debug, Default)] pub struct vfio_region_info_with_cap { pub region_info: vfio_region_info, pub cap_info: __IncompleteArrayField, -}" +} + +// vfio_iommu_type1_info_cap_iova_range minus the incomplete iova_ranges +// array, so that Copy/DataInit can be implemented. +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct vfio_iommu_type1_info_cap_iova_range_header { + pub header: vfio_info_cap_header, + pub nr_iovas: u32, + pub reserved: u32, +} + +// Safe because it only has data and no implicit padding. +unsafe impl DataInit for vfio_info_cap_header {} + +// Safe because it only has data and no implicit padding. 
+unsafe impl DataInit for vfio_iommu_type1_info_cap_iova_range_header {} + +// Safe because it only has data and no implicit padding. +unsafe impl DataInit for vfio_iova_range {}" bindgen_generate \ --raw-line "${VFIO_EXTRA}" \ diff --git a/vfio_sys/src/vfio.rs b/vfio_sys/src/vfio.rs index 3062a610a3..401722f4ca 100644 --- a/vfio_sys/src/vfio.rs +++ b/vfio_sys/src/vfio.rs @@ -6,6 +6,8 @@ #![allow(dead_code)] // Added by vfio_sys/bindgen.sh +use data_model::DataInit; + #[repr(C)] #[derive(Debug, Default)] pub struct vfio_region_info_with_cap { @@ -13,6 +15,25 @@ pub struct vfio_region_info_with_cap { pub cap_info: __IncompleteArrayField, } +// vfio_iommu_type1_info_cap_iova_range minus the incomplete iova_ranges +// array, so that Copy/DataInit can be implemented. +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct vfio_iommu_type1_info_cap_iova_range_header { + pub header: vfio_info_cap_header, + pub nr_iovas: u32, + pub reserved: u32, +} + +// Safe because it only has data and no implicit padding. +unsafe impl DataInit for vfio_info_cap_header {} + +// Safe because it only has data and no implicit padding. +unsafe impl DataInit for vfio_iommu_type1_info_cap_iova_range_header {} + +// Safe because it only has data and no implicit padding. +unsafe impl DataInit for vfio_iova_range {} + #[repr(C)] #[derive(Default)] pub struct __IncompleteArrayField(::std::marker::PhantomData, [T; 0]);
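The `DataInit` impls added here are sound only because each struct is plain old data with no implicit padding; padding bytes are uninitialized when a struct is reinterpreted as raw bytes. A small demonstration of what that caveat guards against (struct names are illustrative):

```rust
use std::mem::size_of;

#[repr(C)]
struct Padded {
    a: u8,
    // 3 bytes of implicit padding here so `b` lands on a 4-byte boundary;
    // viewing this struct as raw bytes would expose uninitialized memory.
    b: u32,
}

#[repr(C)]
struct NoPadding {
    a: u32,
    b: u16,
    c: u16, // explicit field where trailing padding would otherwise go
}

fn main() {
    assert_eq!(size_of::<Padded>(), 8); // 1 + 3 (hidden padding) + 4
    assert_eq!(size_of::<NoPadding>(), 8); // every byte is a named field
}
```

`vfio_info_cap_header` (u16, u16, u32) and `vfio_iova_range` (u64, u64) fall in the second category, which is what makes the `unsafe impl`s above defensible.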