From 6aed5cc351af1aad6b5ec9a0045ffc73b630e56b Mon Sep 17 00:00:00 2001
From: Dmytro Maluka
Date: Mon, 19 Aug 2024 17:27:11 +0000
Subject: [PATCH] x86_64: support --unprotected-vm-with-firmware

As pKVM and pvmfw development for x86 is ongoing, add initial support
for running VMs with pvmfw as the VM bootloader. For now only support
the --unprotected-vm-with-firmware development flag, for running
regular non-protected VMs with pvmfw, which doesn't require pKVM
support.

Similarly to ARM, add a dedicated memory region for pvmfw to the x86-64
memory map. This region is located right below the PCI MMIO memory
hole. Its size is 4MB, like on ARM.

Similarly to ARM, the pvmfw entry point is at the beginning of the
pvmfw image, the normal x86-64 boot protocol is kept as is, and
additionally the VM payload (i.e. kernel) entry point is passed to
pvmfw in the %rdi register. This ABI is subject to change; the final
ABI for x86 pvmfw is not defined yet.

BUG=b:354676864
TEST=1. Run a VM with "--unprotected-vm-with-firmware pvmfw_test.bin"
where pvmfw_test.bin begins with the "jmp *%rdi" instruction
(2 bytes: 0xff 0xe7), i.e. pvmfw immediately jumping to the VM entry
point. Result: VM runs as usual.
2. Run a VM with "--unprotected-vm-with-firmware pvmfw_random.bin"
where pvmfw_random.bin contains just random data. Result: VM crashes.

Change-Id: Ib0ee998a99a0cf352a97222769b87be615419187
Reviewed-on: https://chromium-review.googlesource.com/c/crosvm/crosvm/+/5797353
Commit-Queue: Dmytro Maluka
Reviewed-by: Daniel Verkamp
---
 docs/book/src/appendix/memory_layout.md | 2 +
 hypervisor/src/kvm/x86_64.rs | 6 +-
 vm_memory/src/guest_memory.rs | 1 -
 x86_64/src/lib.rs | 217 +++++++++++++++++++++---
 4 files changed, 197 insertions(+), 29 deletions(-)

diff --git a/docs/book/src/appendix/memory_layout.md b/docs/book/src/appendix/memory_layout.md
index 343414b418..254535e1b9 100644
--- a/docs/book/src/appendix/memory_layout.md
+++ b/docs/book/src/appendix/memory_layout.md
@@ -19,6 +19,7 @@ see the source. All addresses are in hexadecimal.
 | [`KERNEL_START_OFFSET`] | `20_0000` | | | Linux kernel image load address |
 | [`initrd_start`] | after kernel | | | Initial RAM disk for Linux kernel (optional) |
 | [`END_ADDR_BEFORE_32BITS`] | after initrd | `D000_0000` | ~3.24 GiB | RAM (\<4G) |
+| [`PROTECTED_VM_FW_START`] | `CFC0_0000` | `D000_0000` | 4 MiB | pVM firmware (if running a protected VM) |
 | [`END_ADDR_BEFORE_32BITS`] | `D000_0000` | `F400_0000` | 576 MiB | Low (\<4G) MMIO allocation area |
 | [`PCIE_CFG_MMIO_START`] | `F400_0000` | `F800_0000` | 64 MiB | PCIe enhanced config (ECAM) |
 | [`RESERVED_MEM_SIZE`] | `F800_0000` | `1_0000_0000` | 128 MiB | LAPIC/IOAPIC/HPET/… |
@@ -38,6 +39,7 @@ see the source. All addresses are in hexadecimal.
 [`acpi_hi_rsdp_window_base`]: https://crsrc.org/o/src/platform/crosvm/x86_64/src/lib.rs;l=357?q=ACPI_HI_RSDP_WINDOW_BASE
 [`kernel_start_offset`]: https://crsrc.org/o/src/platform/crosvm/x86_64/src/lib.rs;l=341?q=KERNEL_START_OFFSET
 [`initrd_start`]: https://crsrc.org/o/src/platform/crosvm/x86_64/src/lib.rs;l=1633?q=initrd_start
+[`protected_vm_fw_start`]: https://crsrc.org/o/src/platform/crosvm/x86_64/src/lib.rs;l=394?q=PROTECTED_VM_FW_START
 [`end_addr_before_32bits`]: https://crsrc.org/o/src/platform/crosvm/x86_64/src/lib.rs;l=230?q=END_ADDR_BEFORE_32BITS
 [`pcie_cfg_mmio_start`]: https://crsrc.org/o/src/platform/crosvm/x86_64/src/lib.rs;l=400?q=PCIE_CFG_MMIO_START
 [`reserved_mem_size`]: https://crsrc.org/o/src/platform/crosvm/x86_64/src/lib.rs;l=395?q=RESERVED_MEM_SIZE
diff --git a/hypervisor/src/kvm/x86_64.rs b/hypervisor/src/kvm/x86_64.rs
index 80fa96455d..a8c8978980 100644
--- a/hypervisor/src/kvm/x86_64.rs
+++ b/hypervisor/src/kvm/x86_64.rs
@@ -158,11 +158,11 @@ impl Kvm {
     // The x86 machine type is always 0. Protected VMs are not supported.
     pub fn get_vm_type(&self, protection_type: ProtectionType) -> Result {
-        if protection_type == ProtectionType::Unprotected {
-            Ok(0)
-        } else {
+        if protection_type.isolates_memory() {
             error!("Protected mode is not supported on x86_64.");
             Err(Error::new(libc::EINVAL))
+        } else {
+            Ok(0)
         }
     }
diff --git a/vm_memory/src/guest_memory.rs b/vm_memory/src/guest_memory.rs
index ead7bd217e..a6b30e93f4 100644
--- a/vm_memory/src/guest_memory.rs
+++ b/vm_memory/src/guest_memory.rs
@@ -118,7 +118,6 @@ pub enum MemoryRegionPurpose {
     // General purpose guest memory
     #[default]
     GuestMemoryRegion,
-    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
     ProtectedFirmwareRegion,
     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
     StaticSwiotlbRegion,
diff --git a/x86_64/src/lib.rs b/x86_64/src/lib.rs
index 37e9532beb..afbbcde0fc 100644
--- a/x86_64/src/lib.rs
+++ b/x86_64/src/lib.rs
@@ -137,6 +137,7 @@ use vm_memory::GuestAddress;
 use vm_memory::GuestMemory;
 use vm_memory::GuestMemoryError;
 use vm_memory::MemoryRegionOptions;
+use vm_memory::MemoryRegionPurpose;
 use zerocopy::AsBytes;
 use zerocopy::FromBytes;
 use zerocopy::FromZeroes;
@@ -227,6 +228,8 @@ pub enum Error {
     LoadBios(io::Error),
     #[error("error loading kernel bzImage: {0}")]
     LoadBzImage(bzimage::Error),
+    #[error("error loading custom pVM firmware: {0}")]
+    LoadCustomPvmFw(arch::LoadImageError),
     #[error("error loading initrd: {0}")]
     LoadInitrd(arch::LoadImageError),
     #[error("error loading Kernel: {0}")]
@@ -235,6 +238,10 @@ pub enum Error {
     LoadPflash(io::Error),
     #[error("error translating address: Page not present")]
     PageNotPresent,
+    #[error("pci mmio overlaps with pVM firmware memory")]
+    PciMmioOverlapPvmFw,
+    #[error("pVM firmware not supported when bios is used on x86_64")]
+    PvmFwBiosUnsupported,
     #[error("error reading guest memory {0}")]
     ReadingGuestMemory(vm_memory::GuestMemoryError),
     #[error("single register read not supported on x86_64")]
@@ -346,6 +353,7 @@ const FIRST_ADDR_PAST_20BITS: u64 = 1 << 20;
 const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32;
 // Make sure it align to 256MB for MTRR convenient
 const MEM_32BIT_GAP_SIZE: u64 = 768 * MB;
+const END_ADDR_BEFORE_32BITS: u64 = FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE;
 // Reserved memory for nand_bios/LAPIC/IOAPIC/HPET/.....
 const RESERVED_MEM_SIZE: u64 = 0x800_0000;
 const PCI_MMIO_END: u64 = FIRST_ADDR_PAST_32BITS - RESERVED_MEM_SIZE - 1;
@@ -379,6 +387,12 @@ pub const X86_64_SCI_IRQ: u32 = 5;
 pub const X86_64_IRQ_BASE: u32 = 9;
 const ACPI_HI_RSDP_WINDOW_BASE: u64 = 0x000E_0000;

+// pVM firmware memory. Should be within the low 4GB, so that it is identity-mapped
+// by setup_page_tables() when a protected VM boots in long mode, since the pVM firmware is
+// the VM entry point.
+const PROTECTED_VM_FW_MAX_SIZE: u64 = 0x40_0000;
+const PROTECTED_VM_FW_START: u64 = END_ADDR_BEFORE_32BITS - PROTECTED_VM_FW_MAX_SIZE;
+
 #[derive(Debug, PartialEq, Eq)]
 pub enum CpuManufacturer {
     Intel,
@@ -396,11 +410,17 @@ struct LowMemoryLayout {
     pci_mmio: AddressRange,
     // the pcie cfg mmio range
     pcie_cfg_mmio: AddressRange,
+    // the pVM firmware memory (if running a protected VM)
+    pvmfw_mem: Option,
 }

 static LOW_MEMORY_LAYOUT: OnceCell = OnceCell::new();

-pub fn init_low_memory_layout(pcie_ecam: Option, pci_low_start: Option) {
+pub fn init_low_memory_layout(
+    pcie_ecam: Option,
+    pci_low_start: Option,
+    has_protected_vm_firmware: bool,
+) -> Result<()> {
     LOW_MEMORY_LAYOUT.get_or_init(|| {
         const DEFAULT_PCIE_CFG_MMIO: AddressRange = AddressRange {
             start: DEFAULT_PCIE_CFG_MMIO_START,
@@ -423,11 +443,32 @@
             }
         };

+        let pvmfw_mem = if has_protected_vm_firmware {
+            Some(AddressRange {
+                start: PROTECTED_VM_FW_START,
+                end: PROTECTED_VM_FW_START + PROTECTED_VM_FW_MAX_SIZE - 1,
+            })
+        } else {
+            None
+        };
+
         LowMemoryLayout {
             pci_mmio,
             pcie_cfg_mmio,
+            pvmfw_mem,
         }
     });
+
+    if has_protected_vm_firmware {
+        let pci_mmio = read_pci_mmio_before_32bit();
+        let pvmfw_mem = read_pvmfw_mem().unwrap();
+
+        if !pci_mmio.intersect(pvmfw_mem).is_empty() {
+            return Err(Error::PciMmioOverlapPvmFw);
+        }
+    }
+
+    Ok(())
 }

 pub fn read_pci_mmio_before_32bit() -> AddressRange {
@@ -436,6 +477,18 @@
 pub fn read_pcie_cfg_mmio() -> AddressRange {
     LOW_MEMORY_LAYOUT.get().unwrap().pcie_cfg_mmio
 }
+fn read_pvmfw_mem() -> Option {
+    LOW_MEMORY_LAYOUT.get().unwrap().pvmfw_mem
+}
+
+fn max_ram_end_before_32bit(has_protected_vm_firmware: bool) -> u64 {
+    let pci_start = read_pci_mmio_before_32bit().start;
+    if has_protected_vm_firmware {
+        pci_start.min(PROTECTED_VM_FW_START)
+    } else {
+        pci_start
+    }
+}

 /// The x86 reset vector for i386+ and x86_64 puts the processor into an "unreal mode" where it
 /// can access the last 1 MB of the 32-bit address space in 16-bit mode, and starts the instruction
@@ -609,6 +662,7 @@ fn generate_e820_memory_map(
     ram_below_1m: AddressRange,
     ram_below_4g: AddressRange,
     ram_above_4g: AddressRange,
+    has_protected_vm_firmware: bool,
 ) -> Result> {
     let mut e820_entries = Vec::new();

@@ -618,6 +672,14 @@
         add_e820_entry(&mut e820_entries, ram_above_4g, E820Type::Ram)?
     }

+    if has_protected_vm_firmware {
+        // After the pVM firmware jumped to the guest, the pVM firmware itself
+        // is no longer running, so its memory is reusable by the guest OS.
+        // So add this memory as RAM rather than Reserved.
+        let pvmfw_range = read_pvmfw_mem().unwrap();
+        add_e820_entry(&mut e820_entries, pvmfw_range, E820Type::Ram)?;
+    }
+
     let pcie_cfg_mmio_range = read_pcie_cfg_mmio();
     add_e820_entry(&mut e820_entries, pcie_cfg_mmio_range, E820Type::Reserved)?;

@@ -647,23 +709,41 @@ pub fn arch_memory_regions(
     size: u64,
     bios_size: Option,
+    has_protected_vm_firmware: bool,
 ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)> {
+    let mut mem_size = size;
+    let mut regions = Vec::new();
+
+    if has_protected_vm_firmware {
+        regions.push((
+            GuestAddress(PROTECTED_VM_FW_START),
+            PROTECTED_VM_FW_MAX_SIZE,
+            MemoryRegionOptions::new().purpose(MemoryRegionPurpose::ProtectedFirmwareRegion),
+        ));
+
+        // pVM firmware memory is a part of normal guest memory, since it is reusable
+        // by the guest OS once the pVM firmware jumped to the guest. So count its size
+        // as a part of the total guest memory size.
+        if mem_size > PROTECTED_VM_FW_MAX_SIZE {
+            mem_size -= PROTECTED_VM_FW_MAX_SIZE;
+        }
+    }
+
     let mem_start = START_OF_RAM_32BITS;
-    let mem_end = GuestAddress(size + mem_start);
+    let mem_end = GuestAddress(mem_size + mem_start);

     let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS);
-    let end_32bit_gap_start = GuestAddress(read_pci_mmio_before_32bit().start);
+    let max_end_32bits = GuestAddress(max_ram_end_before_32bit(has_protected_vm_firmware));

-    let mut regions = Vec::new();
-    if mem_end <= end_32bit_gap_start {
-        regions.push((GuestAddress(mem_start), size, Default::default()));
+    if mem_end <= max_end_32bits {
+        regions.push((GuestAddress(mem_start), mem_size, Default::default()));
         if let Some(bios_size) = bios_size {
             regions.push((bios_start(bios_size), bios_size, Default::default()));
         }
     } else {
         regions.push((
             GuestAddress(mem_start),
-            end_32bit_gap_start.offset() - mem_start,
+            max_end_32bits.offset() - mem_start,
             Default::default(),
         ));
         if let Some(bios_size) = bios_size {
@@ -671,7 +751,7 @@
         }
         regions.push((
             first_addr_past_32bits,
-            mem_end.offset_from(end_32bit_gap_start),
+            mem_end.offset_from(max_end_32bits),
             Default::default(),
         ));
     }
@@ -686,14 +766,24 @@
         components: &VmComponents,
         _hypervisor: &impl Hypervisor,
     ) -> std::result::Result, Self::Error> {
-        init_low_memory_layout(components.pcie_ecam, components.pci_low_start);
+        let has_protected_vm_firmware = components.hv_cfg.protection_type.runs_firmware();
+
+        init_low_memory_layout(
+            components.pcie_ecam,
+            components.pci_low_start,
+            has_protected_vm_firmware,
+        )?;

         let bios_size = match &components.vm_image {
             VmImage::Bios(bios_file) => Some(bios_file.metadata().map_err(Error::LoadBios)?.len()),
             VmImage::Kernel(_) => None,
         };

-        Ok(arch_memory_regions(components.memory_size, bios_size))
+        Ok(arch_memory_regions(
+            components.memory_size,
+            bios_size,
+            has_protected_vm_firmware,
+        ))
     }

     fn get_system_allocator_config(vm: &V) -> SystemAllocatorConfig {
@@ -734,7 +824,7 @@
         V: VmX86_64,
         Vcpu: VcpuX86_64,
     {
-        if components.hv_cfg.protection_type != ProtectionType::Unprotected {
+        if components.hv_cfg.protection_type.isolates_memory() {
             return Err(Error::UnsupportedProtectionType);
         }

@@ -874,8 +964,14 @@
             let (host_tube, device_tube) = Tube::pair()
                 .context("create tube")
                 .map_err(Error::SetupCmos)?;
-            Self::setup_legacy_cmos_device(&io_bus, irq_chip, device_tube, components.memory_size)
-                .map_err(Error::SetupCmos)?;
+            Self::setup_legacy_cmos_device(
+                &io_bus,
+                irq_chip,
+                device_tube,
+                components.memory_size,
+                components.hv_cfg.protection_type.runs_firmware(),
+            )
+            .map_err(Error::SetupCmos)?;
             Some(host_tube)
         } else {
             None
@@ -1012,8 +1108,14 @@
         let mut vcpu_init = vec![VcpuInitX86_64::default(); vcpu_count];
         let mut msrs = BTreeMap::new();

+        let protection_type = components.hv_cfg.protection_type;
+
         match components.vm_image {
             VmImage::Bios(ref mut bios) => {
+                if protection_type.runs_firmware() {
+                    return Err(Error::PvmFwBiosUnsupported);
+                }
+
                 // Allow a bios to hardcode CMDLINE_OFFSET and read the kernel command line from it.
                 Self::load_cmdline(
                     &mem,
@@ -1040,9 +1142,28 @@
                     params,
                     dump_device_tree_blob,
                     device_tree_overlays,
+                    protection_type.runs_firmware(),
                 )?;

-                vcpu_init[0].regs.rip = kernel_entry.offset();
+                if protection_type.needs_firmware_loaded() {
+                    arch::load_image(
+                        &mem,
+                        &mut components
+                            .pvm_fw
+                            .expect("pvmfw must be available if ProtectionType loads it"),
+                        GuestAddress(PROTECTED_VM_FW_START),
+                        PROTECTED_VM_FW_MAX_SIZE,
+                    )
+                    .map_err(Error::LoadCustomPvmFw)?;
+                }
+
+                let entry_addr = if protection_type.runs_firmware() {
+                    PROTECTED_VM_FW_START
+                } else {
+                    kernel_entry.offset()
+                };
+
+                vcpu_init[0].regs.rip = entry_addr;

                 match kernel_type {
                     KernelType::BzImage | KernelType::Elf => {
@@ -1053,6 +1174,14 @@
                     }
                 }

+                if protection_type.runs_firmware() {
+                    // Pass pVM payload entry address to pVM firmware.
+                    // NOTE: this ABI is subject to change. Possibly we will pass
+                    // all the needed info (payload entry, start and size) in in-memory
+                    // structures (e.g. DTB) instead.
+                    vcpu_init[0].regs.rdi = kernel_entry.offset();
+                }
+
                 match cpu_mode {
                     CpuMode::LongMode => {
                         regs::set_long_mode_msrs(&mut msrs);
@@ -1477,6 +1606,7 @@
         params: boot_params,
         dump_device_tree_blob: Option,
         device_tree_overlays: Vec,
+        has_protected_vm_firmware: bool,
     ) -> Result<()> {
         // Some guest kernels expect a typical PC memory layout where the region between 640 KB and
         // 1 MB is reserved for device memory/ROMs and get confused if there is a RAM region
@@ -1491,17 +1621,32 @@
         // GuestMemory::end_addr() returns the first address past the end, so subtract 1 to get the
         // inclusive end.
         let guest_mem_end = mem.end_addr().offset() - 1;
+
+        // Find the end of the part of guest memory below 4G that is not pVM firmware memory.
+        // This part of guest memory includes just one region, so just find the end of this region.
+        let max_ram_end_below_4g = max_ram_end_before_32bit(has_protected_vm_firmware) - 1;
+        let guest_mem_end_below_4g = mem
+            .regions()
+            .map(|r| r.guest_addr.offset() + r.size as u64 - 1)
+            .find(|&addr| addr <= max_ram_end_below_4g)
+            .expect("no memory region below 4G");
+
         let ram_below_4g = AddressRange {
             start: FIRST_ADDR_PAST_20BITS,
-            end: guest_mem_end.min(read_pci_mmio_before_32bit().start - 1),
+            end: guest_mem_end_below_4g,
         };
-
         let ram_above_4g = AddressRange {
             start: FIRST_ADDR_PAST_32BITS,
             end: guest_mem_end,
         };

-        let e820_entries = generate_e820_memory_map(mem, ram_below_1m, ram_below_4g, ram_above_4g)?;
+        let e820_entries = generate_e820_memory_map(
+            mem,
+            ram_below_1m,
+            ram_below_4g,
+            ram_above_4g,
+            has_protected_vm_firmware,
+        )?;

         let kernel_max_cmdline_len = if params.hdr.cmdline_size == 0 {
             // Old kernels have a maximum length of 255 bytes, not including the NUL.
@@ -1549,7 +1694,9 @@ impl X8664arch {
             &mut initrd_file,
             GuestAddress(kernel_end),
             GuestAddress(initrd_addr_max),
-            None,
+            Some(|region| {
+                region.options.purpose != MemoryRegionPurpose::ProtectedFirmwareRegion
+            }),
             base::pagesize() as u64,
         )
         .map_err(Error::LoadInitrd)?;
@@ -1705,8 +1852,9 @@
         irq_chip: &mut dyn IrqChipX86_64,
         vm_control: Tube,
         mem_size: u64,
+        has_protected_vm_firmware: bool,
     ) -> anyhow::Result<()> {
-        let mem_regions = arch_memory_regions(mem_size, None);
+        let mem_regions = arch_memory_regions(mem_size, None, has_protected_vm_firmware);

         let mem_below_4g = mem_regions
             .iter()
@@ -2207,13 +2355,17 @@
     fn setup() {
         let pcie_ecam = Some(AddressRange::from_start_and_size(3 * GB, 256 * MB).unwrap());
         let pci_start = Some(2 * GB);
-        init_low_memory_layout(pcie_ecam, pci_start);
+        init_low_memory_layout(pcie_ecam, pci_start, false).expect("init_low_memory_layout");
     }

     #[test]
     fn regions_lt_4gb_nobios() {
         setup();
-        let regions = arch_memory_regions(512 * MB, /* bios_size */ None);
+        let regions = arch_memory_regions(
+            512 * MB,
+            /* bios_size */ None,
+            /* has_protected_vm_firmware */ false,
+        );
         assert_eq!(1, regions.len());
         assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
         assert_eq!(1u64 << 29, regions[0].1);
@@ -2223,7 +2375,9 @@
     fn regions_gt_4gb_nobios() {
         setup();
         let size = 4 * GB + 0x8000;
-        let regions = arch_memory_regions(size, /* bios_size */ None);
+        let regions = arch_memory_regions(
+            size, /* bios_size */ None, /* has_protected_vm_firmware */ false,
+        );
         assert_eq!(2, regions.len());
         assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
         assert_eq!(GuestAddress(4 * GB), regions[1].0);
@@ -2234,7 +2388,11 @@
     fn regions_lt_4gb_bios() {
         setup();
         let bios_len = 1 * MB;
-        let regions = arch_memory_regions(512 * MB, Some(bios_len));
+        let regions = arch_memory_regions(
+            512 * MB,
+            Some(bios_len),
+            /* has_protected_vm_firmware */ false,
+        );
         assert_eq!(2, regions.len());
         assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
         assert_eq!(512 * MB, regions[0].1);
@@ -2249,7 +2407,11 @@
     fn regions_gt_4gb_bios() {
         setup();
         let bios_len = 1 * MB;
-        let regions = arch_memory_regions(4 * GB + 0x8000, Some(bios_len));
+        let regions = arch_memory_regions(
+            4 * GB + 0x8000,
+            Some(bios_len),
+            /* has_protected_vm_firmware */ false,
+        );
         assert_eq!(3, regions.len());
         assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
         assert_eq!(
@@ -2267,6 +2429,7 @@
         let regions = arch_memory_regions(
             TEST_MEMORY_SIZE - START_OF_RAM_32BITS,
             /* bios_size */ None,
+            /* has_protected_vm_firmware */ false,
         );
         dbg!(&regions);
         assert_eq!(1, regions.len());
@@ -2279,7 +2442,11 @@
         setup();
         // Test with exact size of 4GB - the overhead.
         let bios_len = 1 * MB;
-        let regions = arch_memory_regions(TEST_MEMORY_SIZE - START_OF_RAM_32BITS, Some(bios_len));
+        let regions = arch_memory_regions(
+            TEST_MEMORY_SIZE - START_OF_RAM_32BITS,
+            Some(bios_len),
+            /* has_protected_vm_firmware */ false,
+        );
         assert_eq!(2, regions.len());
         assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
         assert_eq!(TEST_MEMORY_SIZE - START_OF_RAM_32BITS, regions[0].1);
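
Note on reproducing TEST= step 1: the minimal pvmfw_test.bin is just a blob whose first two bytes
encode "jmp *%rdi" (0xff 0xe7), so the firmware immediately jumps to the payload entry point that
crosvm passes in %rdi under the interim ABI described above. The Rust sketch below shows one way to
generate such a blob; the output file name comes from the TEST= line, while the NOP padding and its
size are illustrative choices, not required by this change.

    // Sketch: write a firmware image whose entry point is `jmp *%rdi` (0xff 0xe7).
    use std::fs;
    use std::io;

    fn main() -> io::Result<()> {
        // 0xff 0xe7 encodes `jmp *%rdi`: jump straight to the VM payload entry point.
        let mut image = vec![0xffu8, 0xe7];
        // Pad with NOPs (0x90); the exact image size is not significant for this test.
        image.resize(4096, 0x90);
        fs::write("pvmfw_test.bin", &image)
    }

Running a VM with "--unprotected-vm-with-firmware pvmfw_test.bin" built this way should behave as in
TEST= step 1, i.e. the guest boots as usual.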