crosvm: Add plumbing for split-irqchip interrupts

Devices use irqfd to inject interrupts, we listen to them in the main
thread and activate userspace pic/ioapic accordingly.

BUG=chromium:908689
TEST=launch linux guest with `--split-irqchip` flag

Change-Id: If30d17ce7ec9e26dba782c89cc1b9b2ff897a70d
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/crosvm/+/1945798
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Stephen Barber <smbarber@chromium.org>
Reviewed-by: Daniel Verkamp <dverkamp@chromium.org>
Commit-Queue: Zhuocheng Ding <zhuocheng.ding@intel.corp-partner.google.com>
This commit is contained in:
Zhuocheng Ding 2019-12-02 15:50:28 +08:00 committed by Commit Bot
parent 2f7dabbd6a
commit b9f4c9bca3
7 changed files with 219 additions and 48 deletions

View file

@ -245,9 +245,14 @@ impl arch::LinuxArch for AArch64 {
let pci_devices = create_devices(&mem, &mut vm, &mut resources, &exit_evt)
.map_err(|e| Error::CreateDevices(Box::new(e)))?;
let (pci, pci_irqs, pid_debug_label_map) =
arch::generate_pci_root(pci_devices, &mut mmio_bus, &mut resources, &mut vm)
.map_err(Error::CreatePciRoot)?;
let (pci, pci_irqs, pid_debug_label_map) = arch::generate_pci_root(
pci_devices,
&mut None,
&mut mmio_bus,
&mut resources,
&mut vm,
)
.map_err(Error::CreatePciRoot)?;
let pci_bus = Arc::new(Mutex::new(PciConfigMmio::new(pci)));
// ARM doesn't really use the io bus like x86, so just create an empty bus.
@ -317,6 +322,7 @@ impl arch::LinuxArch for AArch64 {
vcpu_affinity,
irq_chip,
split_irqchip: None,
gsi_relay: None,
io_bus,
mmio_bus,
pid_debug_label_map,

View file

@ -15,6 +15,7 @@ use std::os::unix::io::AsRawFd;
use std::path::PathBuf;
use std::sync::Arc;
use devices::split_irqchip_common::GsiRelay;
use devices::virtio::VirtioDevice;
use devices::{
Bus, BusDevice, BusError, PciDevice, PciDeviceError, PciInterruptPin, PciRoot, ProxyDevice,
@ -62,6 +63,7 @@ pub struct RunnableLinuxVm {
pub vcpu_affinity: Vec<usize>,
pub irq_chip: Option<File>,
pub split_irqchip: Option<(Arc<Mutex<devices::Pic>>, Arc<Mutex<devices::Ioapic>>)>,
pub gsi_relay: Option<Arc<GsiRelay>>,
pub io_bus: Bus,
pub mmio_bus: Bus,
pub pid_debug_label_map: BTreeMap<u32, String>,
@ -118,6 +120,8 @@ pub enum DeviceRegistrationError {
CreatePipe(sys_util::Error),
// Unable to create serial device from serial parameters
CreateSerialDevice(devices::SerialError),
/// Could not clone an event fd.
EventFdClone(sys_util::Error),
/// Could not create an event fd.
EventFdCreate(sys_util::Error),
/// Could not add a device to the mmio bus.
@ -149,6 +153,7 @@ impl Display for DeviceRegistrationError {
CreatePipe(e) => write!(f, "failed to create pipe: {}", e),
CreateSerialDevice(e) => write!(f, "failed to create serial device: {}", e),
Cmdline(e) => write!(f, "unable to add device to kernel command line: {}", e),
EventFdClone(e) => write!(f, "failed to clone eventfd: {}", e),
EventFdCreate(e) => write!(f, "failed to create eventfd: {}", e),
MmioInsert(e) => write!(f, "failed to add to mmio bus: {}", e),
RegisterIoevent(e) => write!(f, "failed to register ioevent to VM: {}", e),
@ -166,6 +171,7 @@ impl Display for DeviceRegistrationError {
/// Creates a root PCI device for use by this Vm.
pub fn generate_pci_root(
devices: Vec<(Box<dyn PciDevice>, Option<Minijail>)>,
gsi_relay: &mut Option<GsiRelay>,
mmio_bus: &mut Bus,
resources: &mut SystemAllocator,
vm: &mut Vm,
@ -191,10 +197,22 @@ pub fn generate_pci_root(
1 => PciInterruptPin::IntB,
2 => PciInterruptPin::IntC,
3 => PciInterruptPin::IntD,
_ => panic!(""), // Obviously not possible, but the compiler is not smart enough.
_ => unreachable!(), // Obviously not possible, but the compiler is not smart enough.
};
vm.register_irqfd_resample(&irqfd, &irq_resample_fd, irq_num)
.map_err(DeviceRegistrationError::RegisterIrqfd)?;
if let Some(relay) = gsi_relay {
relay.register_irqfd_resample(
irqfd
.try_clone()
.map_err(DeviceRegistrationError::EventFdClone)?,
irq_resample_fd
.try_clone()
.map_err(DeviceRegistrationError::EventFdClone)?,
irq_num as usize,
);
} else {
vm.register_irqfd_resample(&irqfd, &irq_resample_fd, irq_num)
.map_err(DeviceRegistrationError::RegisterIrqfd)?;
}
keep_fds.push(irqfd.as_raw_fd());
keep_fds.push(irq_resample_fd.as_raw_fd());
device.assign_irq(irqfd, irq_resample_fd, irq_num, pci_irq_pin);

View file

@ -10,6 +10,7 @@ use crate::BusDevice;
use bit_field::*;
use kvm::Vm;
use msg_socket::{MsgReceiver, MsgSender};
use std::sync::Arc;
use sys_util::{error, warn, EventFd, Result};
use vm_control::{VmIrqRequest, VmIrqRequestSocket, VmIrqResponse};
@ -89,6 +90,7 @@ pub struct Ioapic {
redirect_table: [RedirectionTableEntry; kvm::NUM_IOAPIC_PINS],
// IOREGSEL is technically 32 bits, but only bottom 8 are writable: all others are fixed to 0.
ioregsel: u8,
relay: Arc<GsiRelay>,
irqfd: Vec<EventFd>,
socket: VmIrqRequestSocket,
}
@ -166,11 +168,16 @@ impl Ioapic {
current_interrupt_level_bitmap: 0,
redirect_table: entries,
ioregsel: 0,
relay: Default::default(),
irqfd,
socket,
})
}
/// Replaces the `GsiRelay` held by this IOAPIC with `relay`.
///
/// `end_of_interrupt` looks up the relay's `irqfd_resample` entry for a pin when it handles an
/// EOI, so the relay passed here determines which resample eventfds get written.
pub fn register_relay(&mut self, relay: Arc<GsiRelay>) {
self.relay = relay;
}
// The ioapic must be informed about EOIs in order to avoid sending multiple interrupts of the
// same type at the same time.
pub fn end_of_interrupt(&mut self, vector: u8) {
@ -183,6 +190,12 @@ impl Ioapic {
if self.redirect_table[i].get_vector() == vector
&& self.redirect_table[i].get_trigger_mode() == TriggerMode::Level
{
if self.relay.irqfd_resample[i].is_some() {
self.service_irq(i, false);
}
if let Some(resample_evt) = &self.relay.irqfd_resample[i] {
resample_evt.write(1).unwrap();
}
self.redirect_table[i].set_remote_irr(false);
}
// There is an inherent race condition in hardware if the OS is finished processing an

View file

@ -12,7 +12,9 @@
// For the purposes of both using more descriptive terms and avoiding terms with lots of charged
// emotional context, this file refers to them instead as "primary" and "secondary" PICs.
use crate::split_irqchip_common::GsiRelay;
use crate::BusDevice;
use std::sync::Arc;
use sys_util::{debug, warn};
#[repr(usize)]
@ -30,7 +32,7 @@ enum PicInitState {
Icw4 = 3,
}
#[derive(Debug, Default, Clone, Copy, PartialEq)]
#[derive(Default)]
struct PicState {
last_irr: u8, // Edge detection.
irr: u8, // Interrupt Request Register.
@ -53,6 +55,8 @@ struct PicState {
elcr: u8,
elcr_mask: u8,
init_state: Option<PicInitState>,
is_primary: bool,
relay: Arc<GsiRelay>,
}
pub struct Pic {
@ -176,12 +180,18 @@ impl Pic {
// that should be masked here. In this case, bits 8 - 8 = 0 and 13 - 8 = 5.
secondary_pic.elcr_mask = !((1 << 0) | (1 << 5));
primary_pic.is_primary = true;
Pic {
interrupt_request: false,
pics: [primary_pic, secondary_pic],
}
}
/// Hands the same `GsiRelay` to both PIC halves.
///
/// The primary PIC receives a new reference to the shared relay and the secondary PIC takes
/// ownership of the `Arc` that was passed in.
pub fn register_relay(&mut self, relay: Arc<GsiRelay>) {
    let [primary, secondary] = &mut self.pics;
    primary.relay = Arc::clone(&relay);
    secondary.relay = relay;
}
pub fn service_irq(&mut self, irq: u8, level: bool) -> bool {
assert!(irq <= 15, "Unexpectedly high value irq: {} vs 15", irq);
@ -391,6 +401,11 @@ impl Pic {
/// Clears the in-service bit for `irq` on `pic` and signals the matching resample eventfd.
///
/// `irq` is the line number local to this PIC and must be in `0..=7`; larger values trip the
/// assertion. After the ISR bit is cleared, `set_irq_internal` is called with `false` for the
/// same line. If the relay has a resample eventfd registered for the line, it is written to.
///
/// # Panics
///
/// Panics if `irq > 7` or if writing to the resample eventfd fails.
fn clear_isr(pic: &mut PicState, irq: u8) {
    assert!(irq <= 7, "Unexpectedly high value for irq: {} vs 7", irq);
    pic.isr &= !(1 << irq);
    Pic::set_irq_internal(pic, irq, false);

    // The relay tables are indexed by the global line number: lines on the non-primary PIC are
    // offset by 8 from the local number.
    let line_offset: u8 = if pic.is_primary { 0 } else { 8 };
    let gsi = usize::from(irq + line_offset);
    if let Some(resample_evt) = pic.relay.irqfd_resample[gsi].as_ref() {
        resample_evt.write(1).unwrap();
    }
}
fn update_irq(&mut self) -> bool {
@ -1088,26 +1103,6 @@ mod tests {
assert_eq!(data.pic.pics[PicSelect::Primary as usize].priority_add, 6);
}
/// Verify that no-op doesn't change state.
///
/// The test raises and then lowers IRQ 5, snapshots the primary PIC state, writes `0x40` to the
/// primary command port, and checks that the state still compares equal to the snapshot.
#[test]
fn no_op_ocw2() {
let mut data = set_up();
icw_init_both_with_icw4(&mut data.pic, FULLY_NESTED_NO_AUTO_EOI);
// TODO(mutexlox): Verify APIC interaction when it is implemented.
data.pic.service_irq(/*irq=*/ 5, /*level=*/ true);
// NOTE(review): the expected vector 0x08 + 5 presumably comes from the base programmed by
// `icw_init_both_with_icw4` — confirm against that helper.
assert_eq!(data.pic.get_external_interrupt(), Some(0x08 + 5));
data.pic.service_irq(/*irq=*/ 5, /*level=*/ false);
// Snapshot taken after the interrupt has been raised, fetched and lowered.
let orig = data.pic.pics[PicSelect::Primary as usize].clone();
// Run a no-op.
data.pic.write(PIC_PRIMARY_COMMAND, &[0x40]);
// Nothing should have changed.
assert_eq!(orig, data.pic.pics[PicSelect::Primary as usize]);
}
/// Tests cascade IRQ that happens on secondary PIC.
#[test]
fn cascade_irq() {

View file

@ -5,6 +5,7 @@
// Common constants and types used for Split IRQ chip devices (e.g. PIC, PIT, IOAPIC).
use bit_field::*;
use sys_util::EventFd;
#[bitfield]
#[derive(Clone, Copy, Debug, PartialEq)]
@ -58,3 +59,36 @@ pub struct MsiDataMessage {
trigger: TriggerMode,
reserved2: BitField16,
}
/// Acts as a relay of interrupt signals between devices and IRQ chips.
///
/// Both tables are indexed by GSI number and have `kvm::NUM_IOAPIC_PINS` slots. A slot is `None`
/// until an eventfd is registered for that GSI.
#[derive(Default)]
pub struct GsiRelay {
/// Eventfd registered for each GSI through `register_irqfd` or `register_irqfd_resample`.
pub irqfd: [Option<EventFd>; kvm::NUM_IOAPIC_PINS],
/// Resample eventfd for each GSI; only filled in by `register_irqfd_resample`.
pub irqfd_resample: [Option<EventFd>; kvm::NUM_IOAPIC_PINS],
}
impl GsiRelay {
    /// Creates a relay in which no GSI has an eventfd registered.
    pub fn new() -> GsiRelay {
        Self::default()
    }

    /// Records `evt` as the eventfd for `gsi`.
    ///
    /// A `gsi` that is not below `kvm::NUM_IOAPIC_PINS` is silently ignored and `evt` is dropped.
    pub fn register_irqfd(&mut self, evt: EventFd, gsi: usize) {
        // The table has exactly one slot per valid GSI, so a missing slot means an invalid GSI.
        if let Some(slot) = self.irqfd.get_mut(gsi) {
            *slot = Some(evt);
        }
    }

    /// Records `evt` and `resample_evt` as the eventfd and resample eventfd for `gsi`.
    ///
    /// A `gsi` that is not below `kvm::NUM_IOAPIC_PINS` is silently ignored and both eventfds are
    /// dropped.
    pub fn register_irqfd_resample(&mut self, evt: EventFd, resample_evt: EventFd, gsi: usize) {
        if gsi < kvm::NUM_IOAPIC_PINS {
            self.irqfd[gsi] = Some(evt);
            self.irqfd_resample[gsi] = Some(resample_evt);
        }
    }
}

View file

@ -1699,6 +1699,7 @@ fn run_control(
Suspend,
ChildSignal,
CheckAvailableMemory,
IrqFd { gsi: usize },
LowMemory,
LowmemTimer,
VmControlServer,
@ -1749,6 +1750,16 @@ fn run_control(
.add(&freemem_timer, Token::CheckAvailableMemory)
.map_err(Error::PollContextAdd)?;
if let Some(gsi_relay) = &linux.gsi_relay {
for (gsi, evt) in gsi_relay.irqfd.into_iter().enumerate() {
if let Some(evt) = evt {
poll_ctx
.add(evt, Token::IrqFd { gsi })
.map_err(Error::PollContextAdd)?;
}
}
}
// Used to add jitter to timer values so that we don't have a thundering herd problem when
// multiple VMs are running.
let mut simple_rng = SimpleRng::new(
@ -1787,6 +1798,7 @@ fn run_control(
}
vcpu_thread_barrier.wait();
let mut ioapic_delayed = Vec::<usize>::default();
'poll: loop {
let events = {
match poll_ctx.wait() {
@ -1798,6 +1810,26 @@ fn run_control(
}
};
ioapic_delayed.retain(|&gsi| {
if let Some((_, ioapic)) = &linux.split_irqchip {
if let Ok(mut ioapic) = ioapic.try_lock() {
// The unwrap will never fail because gsi_relay is Some iff split_irqchip is
// Some.
if linux.gsi_relay.as_ref().unwrap().irqfd_resample[gsi].is_some() {
ioapic.service_irq(gsi, true);
} else {
ioapic.service_irq(gsi, true);
ioapic.service_irq(gsi, false);
}
false
} else {
true
}
} else {
true
}
});
let mut vm_control_indices_to_remove = Vec::new();
for event in events.iter_readable() {
match event.token() {
@ -1861,6 +1893,47 @@ fn run_control(
}
}
}
Token::IrqFd { gsi } => {
if let Some((pic, ioapic)) = &linux.split_irqchip {
// This will never fail because gsi_relay is Some iff split_irqchip is
// Some.
let gsi_relay = linux.gsi_relay.as_ref().unwrap();
if let Some(eventfd) = &gsi_relay.irqfd[gsi] {
eventfd.read().unwrap();
} else {
warn!(
"irqfd {} not found in GSI relay, should be impossible.",
gsi
);
}
let mut pic = pic.lock();
if gsi_relay.irqfd_resample[gsi].is_some() {
pic.service_irq(gsi as u8, true);
} else {
pic.service_irq(gsi as u8, true);
pic.service_irq(gsi as u8, false);
}
if let Err(e) = vcpu_handles[0].kill(SIGRTMIN() + 0) {
warn!("PIC: failed to kick vCPU0: {}", e);
}
// When IOAPIC is configuring its redirection table, we should first
// process its AddMsiRoute request, otherwise we would deadlock.
if let Ok(mut ioapic) = ioapic.try_lock() {
if gsi_relay.irqfd_resample[gsi].is_some() {
ioapic.service_irq(gsi, true);
} else {
ioapic.service_irq(gsi, true);
ioapic.service_irq(gsi, false);
}
} else {
ioapic_delayed.push(gsi);
}
} else {
panic!("split irqchip not found, should be impossible.");
}
}
Token::LowMemory => {
if let Some(low_mem) = &low_mem {
let old_balloon_memory = current_balloon_memory;
@ -2020,6 +2093,7 @@ fn run_control(
Token::Suspend => {}
Token::ChildSignal => {}
Token::CheckAvailableMemory => {}
Token::IrqFd { gsi: _ } => {}
Token::LowMemory => {}
Token::LowmemTimer => {}
Token::VmControlServer => {}

View file

@ -55,6 +55,7 @@ use std::sync::Arc;
use crate::bootparam::boot_params;
use arch::{RunnableLinuxVm, VmComponents, VmImage};
use devices::split_irqchip_common::GsiRelay;
use devices::{
get_serial_tty_string, Ioapic, PciConfigIo, PciDevice, PciInterruptPin, Pic, SerialParameters,
IOAPIC_BASE_ADDRESS, IOAPIC_MEM_LENGTH_BYTES,
@ -88,6 +89,7 @@ pub enum Error {
CreateVcpu(sys_util::Error),
CreateVm(sys_util::Error),
E820Configuration,
EnableSplitIrqchip(sys_util::Error),
KernelOffsetPastEnd,
LoadBios(io::Error),
LoadBzImage(bzimage::Error),
@ -136,6 +138,7 @@ impl Display for Error {
CreateVcpu(e) => write!(f, "failed to create VCPU: {}", e),
CreateVm(e) => write!(f, "failed to create VM: {}", e),
E820Configuration => write!(f, "invalid e820 setup params"),
EnableSplitIrqchip(e) => write!(f, "failed to enable split irqchip: {}", e),
KernelOffsetPastEnd => write!(f, "the kernel extends past the end of RAM"),
LoadBios(e) => write!(f, "error loading bios: {}", e),
LoadBzImage(e) => write!(f, "error loading kernel bzImage: {}", e),
@ -369,7 +372,8 @@ impl arch::LinuxArch for X8664arch {
let exit_evt = EventFd::new().map_err(Error::CreateEventFd)?;
let split_irqchip = if split_irqchip {
let (split_irqchip, mut gsi_relay) = if split_irqchip {
let gsi_relay = GsiRelay::new();
let pic = Arc::new(Mutex::new(Pic::new()));
let ioapic = Arc::new(Mutex::new(
Ioapic::new(&mut vm, ioapic_device_socket).map_err(Error::CreateIoapicDevice)?,
@ -382,15 +386,20 @@ impl arch::LinuxArch for X8664arch {
false,
)
.unwrap();
Some((pic, ioapic))
(Some((pic, ioapic)), Some(gsi_relay))
} else {
None
(None, None)
};
let pci_devices = create_devices(&mem, &mut vm, &mut resources, &exit_evt)
.map_err(|e| Error::CreateDevices(Box::new(e)))?;
let (pci, pci_irqs, pid_debug_label_map) =
arch::generate_pci_root(pci_devices, &mut mmio_bus, &mut resources, &mut vm)
.map_err(Error::CreatePciRoot)?;
let (pci, pci_irqs, pid_debug_label_map) = arch::generate_pci_root(
pci_devices,
&mut gsi_relay,
&mut mmio_bus,
&mut resources,
&mut vm,
)
.map_err(Error::CreatePciRoot)?;
let pci_bus = Arc::new(Mutex::new(PciConfigIo::new(pci)));
// Event used to notify crosvm that guest OS is trying to suspend.
@ -400,15 +409,20 @@ impl arch::LinuxArch for X8664arch {
let mut io_bus = Self::setup_io_bus(
&mut vm,
split_irqchip.is_some(),
&mut gsi_relay,
exit_evt.try_clone().map_err(Error::CloneEventFd)?,
Some(pci_bus.clone()),
components.memory_size,
suspend_evt.try_clone().map_err(Error::CloneEventFd)?,
)?;
let stdio_serial_num =
Self::setup_serial_devices(&mut vm, &mut io_bus, serial_parameters, serial_jail)?;
let stdio_serial_num = Self::setup_serial_devices(
&mut vm,
&mut io_bus,
&mut gsi_relay,
serial_parameters,
serial_jail,
)?;
let ramoops_region = match components.pstore {
Some(pstore) => Some(
@ -418,7 +432,7 @@ impl arch::LinuxArch for X8664arch {
None => None,
};
if let Some((pic, _)) = &split_irqchip {
let gsi_relay = if let Some((pic, ioapic)) = &split_irqchip {
io_bus.insert(pic.clone(), 0x20, 0x2, true).unwrap();
io_bus.insert(pic.clone(), 0xa0, 0x2, true).unwrap();
io_bus.insert(pic.clone(), 0x4d0, 0x2, true).unwrap();
@ -427,7 +441,15 @@ impl arch::LinuxArch for X8664arch {
while irq_num < kvm::NUM_IOAPIC_PINS as u32 {
irq_num = resources.allocate_irq().unwrap();
}
}
// This will never fail because gsi_relay is Some iff split_irqchip is Some.
let gsi_relay = Arc::new(gsi_relay.unwrap());
pic.lock().register_relay(gsi_relay.clone());
ioapic.lock().register_relay(gsi_relay.clone());
Some(gsi_relay)
} else {
None
};
match components.vm_image {
VmImage::Bios(ref mut bios) => Self::load_bios(&mem, bios)?,
@ -483,6 +505,7 @@ impl arch::LinuxArch for X8664arch {
vcpu_affinity,
irq_chip,
split_irqchip,
gsi_relay,
io_bus,
mmio_bus,
pid_debug_label_map,
@ -638,6 +661,8 @@ impl X8664arch {
vm.create_pit().map_err(Error::CreatePit)?;
vm.create_irq_chip().map_err(Error::CreateIrqChip)?;
} else {
vm.enable_split_irqchip()
.map_err(Error::EnableSplitIrqchip)?;
for i in 0..kvm::NUM_IOAPIC_PINS {
// Add dummy MSI routes to replace the default IRQChip routes.
let route = IrqRoute {
@ -719,13 +744,13 @@ impl X8664arch {
/// # Arguments
///
/// * - `vm` the vm object
/// * - `split_irqchip`: whether to use a split IRQ chip (i.e. userspace PIT/PIC/IOAPIC)
/// * - `gsi_relay`: only valid for split IRQ chip (i.e. userspace PIT/PIC/IOAPIC)
/// * - `exit_evt` - the event fd object which should receive exit events
/// * - `mem_size` - the size in bytes of physical ram for the guest
/// * - `suspend_evt` - the event fd object which is used to suspend the vm
fn setup_io_bus(
vm: &mut Vm,
split_irqchip: bool,
_vm: &mut Vm,
gsi_relay: &mut Option<GsiRelay>,
exit_evt: EventFd,
pci: Option<Arc<Mutex<devices::PciConfigIo>>>,
mem_size: u64,
@ -758,7 +783,7 @@ impl X8664arch {
exit_evt.try_clone().map_err(Error::CloneEventFd)?,
)));
if split_irqchip {
if let Some(gsi_relay) = gsi_relay {
let pit_evt = EventFd::new().map_err(Error::CreateEventFd)?;
let pit = Arc::new(Mutex::new(
devices::Pit::new(
@ -770,8 +795,7 @@ impl X8664arch {
io_bus.insert(pit.clone(), 0x040, 0x8, true).unwrap();
io_bus.insert(pit.clone(), 0x061, 0x1, true).unwrap();
io_bus.insert(i8042, 0x062, 0x3, true).unwrap();
vm.register_irqfd(&pit_evt, 0)
.map_err(Error::RegisterIrqfd)?;
gsi_relay.register_irqfd(pit_evt, 0);
} else {
io_bus
.insert(nul_device.clone(), 0x040, 0x8, false)
@ -816,10 +840,12 @@ impl X8664arch {
///
/// * - `vm` the vm object
/// * - `io_bus` the I/O bus to add the devices to
/// * - `gsi_relay`: only valid for split IRQ chip (i.e. userspace PIT/PIC/IOAPIC)
/// * - `serial_parameters` - definitions for how the serial devices should be configured
fn setup_serial_devices(
vm: &mut Vm,
io_bus: &mut devices::Bus,
gsi_relay: &mut Option<GsiRelay>,
serial_parameters: &BTreeMap<u8, SerialParameters>,
serial_jail: Option<Minijail>,
) -> Result<Option<u8>> {
@ -835,10 +861,15 @@ impl X8664arch {
)
.map_err(Error::CreateSerialDevices)?;
vm.register_irqfd(&com_evt_1_3, X86_64_SERIAL_1_3_IRQ)
.map_err(Error::RegisterIrqfd)?;
vm.register_irqfd(&com_evt_2_4, X86_64_SERIAL_2_4_IRQ)
.map_err(Error::RegisterIrqfd)?;
if let Some(gsi_relay) = gsi_relay {
gsi_relay.register_irqfd(com_evt_1_3, X86_64_SERIAL_1_3_IRQ as usize);
gsi_relay.register_irqfd(com_evt_2_4, X86_64_SERIAL_2_4_IRQ as usize);
} else {
vm.register_irqfd(&com_evt_1_3, X86_64_SERIAL_1_3_IRQ)
.map_err(Error::RegisterIrqfd)?;
vm.register_irqfd(&com_evt_2_4, X86_64_SERIAL_2_4_IRQ)
.map_err(Error::RegisterIrqfd)?;
}
Ok(stdio_serial_num)
}