Add io_uring interfaces

Provide a low-level interface to operating the new io_uring interface.

This is an unsafe interface with the basic operations of setting up the
ring, adding to it, removing from it, and polling it.

This will be followed by a safe interface layer on top of this code,
then by additions to the asynchronous executor that allow for
asynchronously doing multiple operations to files from one context.

Change-Id: I71f7ffb04ce8cb4da470deda9aee768ab95d3d98
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/crosvm/+/2124009
Reviewed-by: Dylan Reid <dgreid@chromium.org>
Reviewed-by: Zach Reizner <zachr@chromium.org>
Tested-by: Dylan Reid <dgreid@chromium.org>
Tested-by: kokoro <noreply+kokoro@google.com>
Commit-Queue: Dylan Reid <dgreid@chromium.org>
This commit is contained in:
Dylan Reid 2020-03-13 13:40:40 -07:00 committed by Commit Bot
parent 252d5b3cf3
commit a9a1d02277
5 changed files with 1353 additions and 0 deletions

15
io_uring/Cargo.toml Normal file
View file

@ -0,0 +1,15 @@
[package]
name = "io_uring"
version = "0.1.0"
authors = ["The Chromium OS Authors"]
edition = "2018"
[dependencies]
libc = "*"
syscall_defines = { path = "../syscall_defines" }
sys_util = { path = "../sys_util" }
[dev-dependencies]
tempfile = { path = "../tempfile" }
[workspace]

516
io_uring/src/bindings.rs Normal file
View file

@ -0,0 +1,516 @@
/* automatically generated by rust-bindgen
*
* bindgen --with-derive-default include/uapi/linux/io_uring.h
*/
#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
#![allow(dead_code)]
#[repr(C)]
#[derive(Default)]
pub struct __IncompleteArrayField<T>(::std::marker::PhantomData<T>);
impl<T> __IncompleteArrayField<T> {
#[inline]
pub fn new() -> Self {
__IncompleteArrayField(::std::marker::PhantomData)
}
#[inline]
pub unsafe fn as_ptr(&self) -> *const T {
::std::mem::transmute(self)
}
#[inline]
pub unsafe fn as_mut_ptr(&mut self) -> *mut T {
::std::mem::transmute(self)
}
#[inline]
pub unsafe fn as_slice(&self, len: usize) -> &[T] {
::std::slice::from_raw_parts(self.as_ptr(), len)
}
#[inline]
pub unsafe fn as_mut_slice(&mut self, len: usize) -> &mut [T] {
::std::slice::from_raw_parts_mut(self.as_mut_ptr(), len)
}
}
impl<T> ::std::fmt::Debug for __IncompleteArrayField<T> {
fn fmt(&self, fmt: &mut ::std::fmt::Formatter) -> ::std::fmt::Result {
fmt.write_str("__IncompleteArrayField")
}
}
impl<T> ::std::clone::Clone for __IncompleteArrayField<T> {
#[inline]
fn clone(&self) -> Self {
Self::new()
}
}
impl<T> ::std::marker::Copy for __IncompleteArrayField<T> {}
pub const NR_OPEN: ::std::os::raw::c_uint = 1024;
pub const NGROUPS_MAX: ::std::os::raw::c_uint = 65536;
pub const ARG_MAX: ::std::os::raw::c_uint = 131072;
pub const LINK_MAX: ::std::os::raw::c_uint = 127;
pub const MAX_CANON: ::std::os::raw::c_uint = 255;
pub const MAX_INPUT: ::std::os::raw::c_uint = 255;
pub const NAME_MAX: ::std::os::raw::c_uint = 255;
pub const PATH_MAX: ::std::os::raw::c_uint = 4096;
pub const PIPE_BUF: ::std::os::raw::c_uint = 4096;
pub const XATTR_NAME_MAX: ::std::os::raw::c_uint = 255;
pub const XATTR_SIZE_MAX: ::std::os::raw::c_uint = 65536;
pub const XATTR_LIST_MAX: ::std::os::raw::c_uint = 65536;
pub const RTSIG_MAX: ::std::os::raw::c_uint = 32;
pub const _IOC_NRBITS: ::std::os::raw::c_uint = 8;
pub const _IOC_TYPEBITS: ::std::os::raw::c_uint = 8;
pub const _IOC_SIZEBITS: ::std::os::raw::c_uint = 14;
pub const _IOC_DIRBITS: ::std::os::raw::c_uint = 2;
pub const _IOC_NRMASK: ::std::os::raw::c_uint = 255;
pub const _IOC_TYPEMASK: ::std::os::raw::c_uint = 255;
pub const _IOC_SIZEMASK: ::std::os::raw::c_uint = 16383;
pub const _IOC_DIRMASK: ::std::os::raw::c_uint = 3;
pub const _IOC_NRSHIFT: ::std::os::raw::c_uint = 0;
pub const _IOC_TYPESHIFT: ::std::os::raw::c_uint = 8;
pub const _IOC_SIZESHIFT: ::std::os::raw::c_uint = 16;
pub const _IOC_DIRSHIFT: ::std::os::raw::c_uint = 30;
pub const _IOC_NONE: ::std::os::raw::c_uint = 0;
pub const _IOC_WRITE: ::std::os::raw::c_uint = 1;
pub const _IOC_READ: ::std::os::raw::c_uint = 2;
pub const IOC_IN: ::std::os::raw::c_uint = 1073741824;
pub const IOC_OUT: ::std::os::raw::c_uint = 2147483648;
pub const IOC_INOUT: ::std::os::raw::c_uint = 3221225472;
pub const IOCSIZE_MASK: ::std::os::raw::c_uint = 1073676288;
pub const IOCSIZE_SHIFT: ::std::os::raw::c_uint = 16;
pub const __BITS_PER_LONG: ::std::os::raw::c_uint = 64;
pub const __FD_SETSIZE: ::std::os::raw::c_uint = 1024;
pub const MS_RDONLY: ::std::os::raw::c_uint = 1;
pub const MS_NOSUID: ::std::os::raw::c_uint = 2;
pub const MS_NODEV: ::std::os::raw::c_uint = 4;
pub const MS_NOEXEC: ::std::os::raw::c_uint = 8;
pub const MS_SYNCHRONOUS: ::std::os::raw::c_uint = 16;
pub const MS_REMOUNT: ::std::os::raw::c_uint = 32;
pub const MS_MANDLOCK: ::std::os::raw::c_uint = 64;
pub const MS_DIRSYNC: ::std::os::raw::c_uint = 128;
pub const MS_NOATIME: ::std::os::raw::c_uint = 1024;
pub const MS_NODIRATIME: ::std::os::raw::c_uint = 2048;
pub const MS_BIND: ::std::os::raw::c_uint = 4096;
pub const MS_MOVE: ::std::os::raw::c_uint = 8192;
pub const MS_REC: ::std::os::raw::c_uint = 16384;
pub const MS_VERBOSE: ::std::os::raw::c_uint = 32768;
pub const MS_SILENT: ::std::os::raw::c_uint = 32768;
pub const MS_POSIXACL: ::std::os::raw::c_uint = 65536;
pub const MS_UNBINDABLE: ::std::os::raw::c_uint = 131072;
pub const MS_PRIVATE: ::std::os::raw::c_uint = 262144;
pub const MS_SLAVE: ::std::os::raw::c_uint = 524288;
pub const MS_SHARED: ::std::os::raw::c_uint = 1048576;
pub const MS_RELATIME: ::std::os::raw::c_uint = 2097152;
pub const MS_KERNMOUNT: ::std::os::raw::c_uint = 4194304;
pub const MS_I_VERSION: ::std::os::raw::c_uint = 8388608;
pub const MS_STRICTATIME: ::std::os::raw::c_uint = 16777216;
pub const MS_LAZYTIME: ::std::os::raw::c_uint = 33554432;
pub const MS_SUBMOUNT: ::std::os::raw::c_uint = 67108864;
pub const MS_NOREMOTELOCK: ::std::os::raw::c_uint = 134217728;
pub const MS_NOSEC: ::std::os::raw::c_uint = 268435456;
pub const MS_BORN: ::std::os::raw::c_uint = 536870912;
pub const MS_ACTIVE: ::std::os::raw::c_uint = 1073741824;
pub const MS_NOUSER: ::std::os::raw::c_uint = 2147483648;
pub const MS_RMT_MASK: ::std::os::raw::c_uint = 41943121;
pub const MS_MGC_VAL: ::std::os::raw::c_uint = 3236757504;
pub const MS_MGC_MSK: ::std::os::raw::c_uint = 4294901760;
pub const OPEN_TREE_CLONE: ::std::os::raw::c_uint = 1;
pub const MOVE_MOUNT_F_SYMLINKS: ::std::os::raw::c_uint = 1;
pub const MOVE_MOUNT_F_AUTOMOUNTS: ::std::os::raw::c_uint = 2;
pub const MOVE_MOUNT_F_EMPTY_PATH: ::std::os::raw::c_uint = 4;
pub const MOVE_MOUNT_T_SYMLINKS: ::std::os::raw::c_uint = 16;
pub const MOVE_MOUNT_T_AUTOMOUNTS: ::std::os::raw::c_uint = 32;
pub const MOVE_MOUNT_T_EMPTY_PATH: ::std::os::raw::c_uint = 64;
pub const MOVE_MOUNT__MASK: ::std::os::raw::c_uint = 119;
pub const FSOPEN_CLOEXEC: ::std::os::raw::c_uint = 1;
pub const FSPICK_CLOEXEC: ::std::os::raw::c_uint = 1;
pub const FSPICK_SYMLINK_NOFOLLOW: ::std::os::raw::c_uint = 2;
pub const FSPICK_NO_AUTOMOUNT: ::std::os::raw::c_uint = 4;
pub const FSPICK_EMPTY_PATH: ::std::os::raw::c_uint = 8;
pub const FSMOUNT_CLOEXEC: ::std::os::raw::c_uint = 1;
pub const MOUNT_ATTR_RDONLY: ::std::os::raw::c_uint = 1;
pub const MOUNT_ATTR_NOSUID: ::std::os::raw::c_uint = 2;
pub const MOUNT_ATTR_NODEV: ::std::os::raw::c_uint = 4;
pub const MOUNT_ATTR_NOEXEC: ::std::os::raw::c_uint = 8;
pub const MOUNT_ATTR__ATIME: ::std::os::raw::c_uint = 112;
pub const MOUNT_ATTR_RELATIME: ::std::os::raw::c_uint = 0;
pub const MOUNT_ATTR_NOATIME: ::std::os::raw::c_uint = 16;
pub const MOUNT_ATTR_STRICTATIME: ::std::os::raw::c_uint = 32;
pub const MOUNT_ATTR_NODIRATIME: ::std::os::raw::c_uint = 128;
pub const INR_OPEN_CUR: ::std::os::raw::c_uint = 1024;
pub const INR_OPEN_MAX: ::std::os::raw::c_uint = 4096;
pub const BLOCK_SIZE_BITS: ::std::os::raw::c_uint = 10;
pub const BLOCK_SIZE: ::std::os::raw::c_uint = 1024;
pub const SEEK_SET: ::std::os::raw::c_uint = 0;
pub const SEEK_CUR: ::std::os::raw::c_uint = 1;
pub const SEEK_END: ::std::os::raw::c_uint = 2;
pub const SEEK_DATA: ::std::os::raw::c_uint = 3;
pub const SEEK_HOLE: ::std::os::raw::c_uint = 4;
pub const SEEK_MAX: ::std::os::raw::c_uint = 4;
pub const RENAME_NOREPLACE: ::std::os::raw::c_uint = 1;
pub const RENAME_EXCHANGE: ::std::os::raw::c_uint = 2;
pub const RENAME_WHITEOUT: ::std::os::raw::c_uint = 4;
pub const FILE_DEDUPE_RANGE_SAME: ::std::os::raw::c_uint = 0;
pub const FILE_DEDUPE_RANGE_DIFFERS: ::std::os::raw::c_uint = 1;
pub const NR_FILE: ::std::os::raw::c_uint = 8192;
pub const FS_XFLAG_REALTIME: ::std::os::raw::c_uint = 1;
pub const FS_XFLAG_PREALLOC: ::std::os::raw::c_uint = 2;
pub const FS_XFLAG_IMMUTABLE: ::std::os::raw::c_uint = 8;
pub const FS_XFLAG_APPEND: ::std::os::raw::c_uint = 16;
pub const FS_XFLAG_SYNC: ::std::os::raw::c_uint = 32;
pub const FS_XFLAG_NOATIME: ::std::os::raw::c_uint = 64;
pub const FS_XFLAG_NODUMP: ::std::os::raw::c_uint = 128;
pub const FS_XFLAG_RTINHERIT: ::std::os::raw::c_uint = 256;
pub const FS_XFLAG_PROJINHERIT: ::std::os::raw::c_uint = 512;
pub const FS_XFLAG_NOSYMLINKS: ::std::os::raw::c_uint = 1024;
pub const FS_XFLAG_EXTSIZE: ::std::os::raw::c_uint = 2048;
pub const FS_XFLAG_EXTSZINHERIT: ::std::os::raw::c_uint = 4096;
pub const FS_XFLAG_NODEFRAG: ::std::os::raw::c_uint = 8192;
pub const FS_XFLAG_FILESTREAM: ::std::os::raw::c_uint = 16384;
pub const FS_XFLAG_DAX: ::std::os::raw::c_uint = 32768;
pub const FS_XFLAG_COWEXTSIZE: ::std::os::raw::c_uint = 65536;
pub const FS_XFLAG_HASATTR: ::std::os::raw::c_uint = 2147483648;
pub const BMAP_IOCTL: ::std::os::raw::c_uint = 1;
pub const FSLABEL_MAX: ::std::os::raw::c_uint = 256;
pub const FS_KEY_DESCRIPTOR_SIZE: ::std::os::raw::c_uint = 8;
pub const FS_POLICY_FLAGS_PAD_4: ::std::os::raw::c_uint = 0;
pub const FS_POLICY_FLAGS_PAD_8: ::std::os::raw::c_uint = 1;
pub const FS_POLICY_FLAGS_PAD_16: ::std::os::raw::c_uint = 2;
pub const FS_POLICY_FLAGS_PAD_32: ::std::os::raw::c_uint = 3;
pub const FS_POLICY_FLAGS_PAD_MASK: ::std::os::raw::c_uint = 3;
pub const FS_POLICY_FLAG_DIRECT_KEY: ::std::os::raw::c_uint = 4;
pub const FS_POLICY_FLAGS_VALID: ::std::os::raw::c_uint = 7;
pub const FS_ENCRYPTION_MODE_INVALID: ::std::os::raw::c_uint = 0;
pub const FS_ENCRYPTION_MODE_AES_256_XTS: ::std::os::raw::c_uint = 1;
pub const FS_ENCRYPTION_MODE_AES_256_GCM: ::std::os::raw::c_uint = 2;
pub const FS_ENCRYPTION_MODE_AES_256_CBC: ::std::os::raw::c_uint = 3;
pub const FS_ENCRYPTION_MODE_AES_256_CTS: ::std::os::raw::c_uint = 4;
pub const FS_ENCRYPTION_MODE_AES_128_CBC: ::std::os::raw::c_uint = 5;
pub const FS_ENCRYPTION_MODE_AES_128_CTS: ::std::os::raw::c_uint = 6;
pub const FS_ENCRYPTION_MODE_SPECK128_256_XTS: ::std::os::raw::c_uint = 7;
pub const FS_ENCRYPTION_MODE_SPECK128_256_CTS: ::std::os::raw::c_uint = 8;
pub const FS_ENCRYPTION_MODE_ADIANTUM: ::std::os::raw::c_uint = 9;
pub const FS_KEY_DESC_PREFIX: &'static [u8; 9usize] = b"fscrypt:\0";
pub const FS_KEY_DESC_PREFIX_SIZE: ::std::os::raw::c_uint = 8;
pub const FS_MAX_KEY_SIZE: ::std::os::raw::c_uint = 64;
pub const FS_SECRM_FL: ::std::os::raw::c_uint = 1;
pub const FS_UNRM_FL: ::std::os::raw::c_uint = 2;
pub const FS_COMPR_FL: ::std::os::raw::c_uint = 4;
pub const FS_SYNC_FL: ::std::os::raw::c_uint = 8;
pub const FS_IMMUTABLE_FL: ::std::os::raw::c_uint = 16;
pub const FS_APPEND_FL: ::std::os::raw::c_uint = 32;
pub const FS_NODUMP_FL: ::std::os::raw::c_uint = 64;
pub const FS_NOATIME_FL: ::std::os::raw::c_uint = 128;
pub const FS_DIRTY_FL: ::std::os::raw::c_uint = 256;
pub const FS_COMPRBLK_FL: ::std::os::raw::c_uint = 512;
pub const FS_NOCOMP_FL: ::std::os::raw::c_uint = 1024;
pub const FS_ENCRYPT_FL: ::std::os::raw::c_uint = 2048;
pub const FS_BTREE_FL: ::std::os::raw::c_uint = 4096;
pub const FS_INDEX_FL: ::std::os::raw::c_uint = 4096;
pub const FS_IMAGIC_FL: ::std::os::raw::c_uint = 8192;
pub const FS_JOURNAL_DATA_FL: ::std::os::raw::c_uint = 16384;
pub const FS_NOTAIL_FL: ::std::os::raw::c_uint = 32768;
pub const FS_DIRSYNC_FL: ::std::os::raw::c_uint = 65536;
pub const FS_TOPDIR_FL: ::std::os::raw::c_uint = 131072;
pub const FS_HUGE_FILE_FL: ::std::os::raw::c_uint = 262144;
pub const FS_EXTENT_FL: ::std::os::raw::c_uint = 524288;
pub const FS_EA_INODE_FL: ::std::os::raw::c_uint = 2097152;
pub const FS_EOFBLOCKS_FL: ::std::os::raw::c_uint = 4194304;
pub const FS_NOCOW_FL: ::std::os::raw::c_uint = 8388608;
pub const FS_INLINE_DATA_FL: ::std::os::raw::c_uint = 268435456;
pub const FS_PROJINHERIT_FL: ::std::os::raw::c_uint = 536870912;
pub const FS_RESERVED_FL: ::std::os::raw::c_uint = 2147483648;
pub const FS_FL_USER_VISIBLE: ::std::os::raw::c_uint = 253951;
pub const FS_FL_USER_MODIFIABLE: ::std::os::raw::c_uint = 229631;
pub const SYNC_FILE_RANGE_WAIT_BEFORE: ::std::os::raw::c_uint = 1;
pub const SYNC_FILE_RANGE_WRITE: ::std::os::raw::c_uint = 2;
pub const SYNC_FILE_RANGE_WAIT_AFTER: ::std::os::raw::c_uint = 4;
pub const SYNC_FILE_RANGE_WRITE_AND_WAIT: ::std::os::raw::c_uint = 7;
pub const IORING_SETUP_IOPOLL: ::std::os::raw::c_uint = 1;
pub const IORING_SETUP_SQPOLL: ::std::os::raw::c_uint = 2;
pub const IORING_SETUP_SQ_AFF: ::std::os::raw::c_uint = 4;
pub const IORING_SETUP_CQSIZE: ::std::os::raw::c_uint = 8;
pub const IORING_SETUP_CLAMP: ::std::os::raw::c_uint = 16;
pub const IORING_SETUP_ATTACH_WQ: ::std::os::raw::c_uint = 32;
pub const IORING_FSYNC_DATASYNC: ::std::os::raw::c_uint = 1;
pub const IORING_TIMEOUT_ABS: ::std::os::raw::c_uint = 1;
pub const IORING_OFF_SQ_RING: ::std::os::raw::c_uint = 0;
pub const IORING_OFF_CQ_RING: ::std::os::raw::c_uint = 134217728;
pub const IORING_OFF_SQES: ::std::os::raw::c_uint = 268435456;
pub const IORING_SQ_NEED_WAKEUP: ::std::os::raw::c_uint = 1;
pub const IORING_ENTER_GETEVENTS: ::std::os::raw::c_uint = 1;
pub const IORING_ENTER_SQ_WAKEUP: ::std::os::raw::c_uint = 2;
pub const IORING_FEAT_SINGLE_MMAP: ::std::os::raw::c_uint = 1;
pub const IORING_FEAT_NODROP: ::std::os::raw::c_uint = 2;
pub const IORING_FEAT_SUBMIT_STABLE: ::std::os::raw::c_uint = 4;
pub const IORING_FEAT_RW_CUR_POS: ::std::os::raw::c_uint = 8;
pub const IORING_FEAT_CUR_PERSONALITY: ::std::os::raw::c_uint = 16;
pub const IORING_REGISTER_BUFFERS: ::std::os::raw::c_uint = 0;
pub const IORING_UNREGISTER_BUFFERS: ::std::os::raw::c_uint = 1;
pub const IORING_REGISTER_FILES: ::std::os::raw::c_uint = 2;
pub const IORING_UNREGISTER_FILES: ::std::os::raw::c_uint = 3;
pub const IORING_REGISTER_EVENTFD: ::std::os::raw::c_uint = 4;
pub const IORING_UNREGISTER_EVENTFD: ::std::os::raw::c_uint = 5;
pub const IORING_REGISTER_FILES_UPDATE: ::std::os::raw::c_uint = 6;
pub const IORING_REGISTER_EVENTFD_ASYNC: ::std::os::raw::c_uint = 7;
pub const IORING_REGISTER_PROBE: ::std::os::raw::c_uint = 8;
pub const IORING_REGISTER_PERSONALITY: ::std::os::raw::c_uint = 9;
pub const IORING_UNREGISTER_PERSONALITY: ::std::os::raw::c_uint = 10;
pub const IO_URING_OP_SUPPORTED: ::std::os::raw::c_uint = 1;
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct file_clone_range {
pub src_fd: i64,
pub src_offset: u64,
pub src_length: u64,
pub dest_offset: u64,
}
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct fstrim_range {
pub start: u64,
pub len: u64,
pub minlen: u64,
}
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct file_dedupe_range_info {
pub dest_fd: i64,
pub dest_offset: u64,
pub bytes_deduped: u64,
pub status: i32,
pub reserved: u32,
}
#[repr(C)]
#[derive(Debug, Default)]
pub struct file_dedupe_range {
pub src_offset: u64,
pub src_length: u64,
pub dest_count: u16,
pub reserved1: u16,
pub reserved2: u32,
pub info: __IncompleteArrayField<file_dedupe_range_info>,
}
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct files_stat_struct {
pub nr_files: ::std::os::raw::c_ulong,
pub nr_free_files: ::std::os::raw::c_ulong,
pub max_files: ::std::os::raw::c_ulong,
}
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct inodes_stat_t {
pub nr_inodes: ::std::os::raw::c_long,
pub nr_unused: ::std::os::raw::c_long,
pub dummy: [::std::os::raw::c_long; 5usize],
}
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct fsxattr {
pub fsx_xflags: u32,
pub fsx_extsize: u32,
pub fsx_nextents: u32,
pub fsx_projid: u32,
pub fsx_cowextsize: u32,
pub fsx_pad: [::std::os::raw::c_uchar; 8usize],
}
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct fscrypt_policy {
pub version: u8,
pub contents_encryption_mode: u8,
pub filenames_encryption_mode: u8,
pub flags: u8,
pub master_key_descriptor: [u8; 8usize],
}
#[repr(C)]
#[derive(Copy, Clone)]
pub struct fscrypt_key {
pub mode: u32,
pub raw: [u8; 64usize],
pub size: u32,
}
impl Default for fscrypt_key {
fn default() -> Self {
unsafe { ::std::mem::zeroed() }
}
}
pub type __kernel_rwf_t = ::std::os::raw::c_int;
#[repr(C)]
#[derive(Copy, Clone)]
pub struct io_uring_sqe {
pub opcode: u8,
pub flags: u8,
pub ioprio: u16,
pub fd: i32,
pub __bindgen_anon_1: io_uring_sqe__bindgen_ty_1,
pub addr: u64,
pub len: u32,
pub __bindgen_anon_2: io_uring_sqe__bindgen_ty_2,
pub user_data: u64,
pub __bindgen_anon_3: io_uring_sqe__bindgen_ty_3,
}
#[repr(C)]
#[derive(Copy, Clone)]
pub union io_uring_sqe__bindgen_ty_1 {
pub off: u64,
pub addr2: u64,
_bindgen_union_align: u64,
}
impl Default for io_uring_sqe__bindgen_ty_1 {
fn default() -> Self {
unsafe { ::std::mem::zeroed() }
}
}
#[repr(C)]
#[derive(Copy, Clone)]
pub union io_uring_sqe__bindgen_ty_2 {
pub rw_flags: __kernel_rwf_t,
pub fsync_flags: u32,
pub poll_events: u16,
pub sync_range_flags: u32,
pub msg_flags: u32,
pub timeout_flags: u32,
pub accept_flags: u32,
pub cancel_flags: u32,
pub open_flags: u32,
pub statx_flags: u32,
pub fadvise_advice: u32,
_bindgen_union_align: u32,
}
impl Default for io_uring_sqe__bindgen_ty_2 {
fn default() -> Self {
unsafe { ::std::mem::zeroed() }
}
}
#[repr(C)]
#[derive(Copy, Clone)]
pub union io_uring_sqe__bindgen_ty_3 {
pub __bindgen_anon_1: io_uring_sqe__bindgen_ty_3__bindgen_ty_1,
pub __pad2: [u64; 3usize],
_bindgen_union_align: [u64; 3usize],
}
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct io_uring_sqe__bindgen_ty_3__bindgen_ty_1 {
pub buf_index: u16,
pub personality: u16,
}
impl Default for io_uring_sqe__bindgen_ty_3 {
fn default() -> Self {
unsafe { ::std::mem::zeroed() }
}
}
impl Default for io_uring_sqe {
fn default() -> Self {
unsafe { ::std::mem::zeroed() }
}
}
pub const IOSQE_FIXED_FILE_BIT: _bindgen_ty_1 = 0;
pub const IOSQE_IO_DRAIN_BIT: _bindgen_ty_1 = 1;
pub const IOSQE_IO_LINK_BIT: _bindgen_ty_1 = 2;
pub const IOSQE_IO_HARDLINK_BIT: _bindgen_ty_1 = 3;
pub const IOSQE_ASYNC_BIT: _bindgen_ty_1 = 4;
pub type _bindgen_ty_1 = ::std::os::raw::c_uint;
pub const IORING_OP_NOP: _bindgen_ty_2 = 0;
pub const IORING_OP_READV: _bindgen_ty_2 = 1;
pub const IORING_OP_WRITEV: _bindgen_ty_2 = 2;
pub const IORING_OP_FSYNC: _bindgen_ty_2 = 3;
pub const IORING_OP_READ_FIXED: _bindgen_ty_2 = 4;
pub const IORING_OP_WRITE_FIXED: _bindgen_ty_2 = 5;
pub const IORING_OP_POLL_ADD: _bindgen_ty_2 = 6;
pub const IORING_OP_POLL_REMOVE: _bindgen_ty_2 = 7;
pub const IORING_OP_SYNC_FILE_RANGE: _bindgen_ty_2 = 8;
pub const IORING_OP_SENDMSG: _bindgen_ty_2 = 9;
pub const IORING_OP_RECVMSG: _bindgen_ty_2 = 10;
pub const IORING_OP_TIMEOUT: _bindgen_ty_2 = 11;
pub const IORING_OP_TIMEOUT_REMOVE: _bindgen_ty_2 = 12;
pub const IORING_OP_ACCEPT: _bindgen_ty_2 = 13;
pub const IORING_OP_ASYNC_CANCEL: _bindgen_ty_2 = 14;
pub const IORING_OP_LINK_TIMEOUT: _bindgen_ty_2 = 15;
pub const IORING_OP_CONNECT: _bindgen_ty_2 = 16;
pub const IORING_OP_FALLOCATE: _bindgen_ty_2 = 17;
pub const IORING_OP_OPENAT: _bindgen_ty_2 = 18;
pub const IORING_OP_CLOSE: _bindgen_ty_2 = 19;
pub const IORING_OP_FILES_UPDATE: _bindgen_ty_2 = 20;
pub const IORING_OP_STATX: _bindgen_ty_2 = 21;
pub const IORING_OP_READ: _bindgen_ty_2 = 22;
pub const IORING_OP_WRITE: _bindgen_ty_2 = 23;
pub const IORING_OP_FADVISE: _bindgen_ty_2 = 24;
pub const IORING_OP_MADVISE: _bindgen_ty_2 = 25;
pub const IORING_OP_SEND: _bindgen_ty_2 = 26;
pub const IORING_OP_RECV: _bindgen_ty_2 = 27;
pub const IORING_OP_OPENAT2: _bindgen_ty_2 = 28;
pub const IORING_OP_EPOLL_CTL: _bindgen_ty_2 = 29;
pub const IORING_OP_LAST: _bindgen_ty_2 = 30;
pub type _bindgen_ty_2 = ::std::os::raw::c_uint;
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct io_uring_cqe {
pub user_data: u64,
pub res: i32,
pub flags: u32,
}
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct io_sqring_offsets {
pub head: u32,
pub tail: u32,
pub ring_mask: u32,
pub ring_entries: u32,
pub flags: u32,
pub dropped: u32,
pub array: u32,
pub resv1: u32,
pub resv2: u64,
}
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct io_cqring_offsets {
pub head: u32,
pub tail: u32,
pub ring_mask: u32,
pub ring_entries: u32,
pub overflow: u32,
pub cqes: u32,
pub resv: [u64; 2usize],
}
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct io_uring_params {
pub sq_entries: u32,
pub cq_entries: u32,
pub flags: u32,
pub sq_thread_cpu: u32,
pub sq_thread_idle: u32,
pub features: u32,
pub wq_fd: u32,
pub resv: [u32; 3usize],
pub sq_off: io_sqring_offsets,
pub cq_off: io_cqring_offsets,
}
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct io_uring_files_update {
pub offset: u32,
pub resv: u32,
pub fds: u64,
}
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct io_uring_probe_op {
pub op: u8,
pub resv: u8,
pub flags: u16,
pub resv2: u32,
}
#[repr(C)]
#[derive(Debug, Default)]
pub struct io_uring_probe {
pub last_op: u8,
pub ops_len: u8,
pub resv: u16,
pub resv2: [u32; 3usize],
pub ops: __IncompleteArrayField<io_uring_probe_op>,
}

9
io_uring/src/lib.rs Normal file
View file

@ -0,0 +1,9 @@
// Copyright 2020 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
mod bindings;
mod syscalls;
mod uring;
pub use uring::*;

41
io_uring/src/syscalls.rs Normal file
View file

@ -0,0 +1,41 @@
// Copyright 2020 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
use std::os::unix::io::RawFd;
use std::ptr::null_mut;
use libc::{c_int, c_long, c_void};
use syscall_defines::linux::LinuxSyscall::*;
use crate::bindings::*;
/// Returns the system error as the result;
pub type Result<T> = std::result::Result<T, c_int>;
pub unsafe fn io_uring_setup(num_entries: usize, params: &io_uring_params) -> Result<RawFd> {
let ret = libc::syscall(
SYS_io_uring_setup as c_long,
num_entries as c_int,
params as *const _,
);
if ret < 0 {
return Err(*libc::__errno_location());
}
Ok(ret as RawFd)
}
pub unsafe fn io_uring_enter(fd: RawFd, to_submit: u64, to_wait: u64, flags: u32) -> Result<()> {
let ret = libc::syscall(
SYS_io_uring_enter as c_long,
fd,
to_submit as c_int,
to_wait as c_int,
flags as c_int,
null_mut::<*mut c_void>(),
);
if ret < 0 {
return Err(*libc::__errno_location());
}
Ok(())
}

772
io_uring/src/uring.rs Normal file
View file

@ -0,0 +1,772 @@
// Copyright 2020 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
use std::fmt;
use std::fs::File;
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
use std::ptr::null_mut;
use std::sync::atomic::{AtomicU32, Ordering};
use sys_util::{MemoryMapping, WatchingEvents};
use crate::bindings::*;
use crate::syscalls::*;
/// Holds per-operation, user specified data. The usage is up to the caller. The most common use is
/// for callers to identify each request.
pub type UserData = u64;
#[derive(Debug)]
pub enum Error {
/// The call to `io_uring_enter` failed with the given errno.
RingEnter(libc::c_int),
/// The call to `io_uring_setup` failed with the given errno.
Setup(libc::c_int),
/// Failed to map the completion ring.
MappingCompleteRing(sys_util::MmapError),
/// Failed to map the submit ring.
MappingSubmitRing(sys_util::MmapError),
/// Failed to map submit entries.
MappingSubmitEntries(sys_util::MmapError),
/// Too many ops are already queued.
NoSpace,
}
pub type Result<T> = std::result::Result<T, Error>;
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use self::Error::*;
match self {
RingEnter(e) => write!(f, "Failed to enter io uring {}", e),
Setup(e) => write!(f, "Failed to setup io uring {}", e),
MappingCompleteRing(e) => write!(f, "Failed to mmap completion ring {}", e),
MappingSubmitRing(e) => write!(f, "Failed to mmap submit ring {}", e),
MappingSubmitEntries(e) => write!(f, "Failed to mmap submit entries {}", e),
NoSpace => write!(
f,
"No space for more ring entries, try increasing the size passed to `new`",
),
}
}
}
/// Basic statistics about the operations that have been submitted to the uring.
#[derive(Default)]
pub struct URingStats {
total_enter_calls: u64, // Number of times the uring has been entered.
total_ops: u64, // Total ops submitted to io_uring.
total_complete: u64, // Total ops completed by io_uring.
}
/// Unsafe wrapper for the kernel's io_uring interface. Allows for queueing multiple I/O operations
/// to the kernel and asynchronously handling the completion of these operations.
/// Use the various `add_*` functions to configure operations, then call `wait` to start
/// the operations and get any completed results. Each op is given a u64 user_data argument that is
/// used to identify the result when returned in the iterator provided by `wait`.
///
/// # Example polling an FD for readable status.
///
/// ```
/// # use std::fs::File;
/// # use std::os::unix::io::AsRawFd;
/// # use std::path::Path;
/// # use sys_util::WatchingEvents;
/// # use io_uring::URingContext;
/// let f = File::open(Path::new("/dev/zero")).unwrap();
/// let mut uring = URingContext::new(16).unwrap();
/// uring
/// .add_poll_fd(f.as_raw_fd(), WatchingEvents::empty().set_read(), 454)
/// .unwrap();
/// let (user_data, res) = uring.wait().unwrap().next().unwrap();
/// assert_eq!(user_data, 454 as UserData);
/// assert_eq!(res.unwrap(), 1 as i32);
///
/// ```
pub struct URingContext {
ring_file: File, // Holds the io_uring context FD returned from io_uring_setup.
submit_ring: SubmitQueueState,
submit_queue_entries: SubmitQueueEntries,
complete_ring: CompleteQueueState,
io_vecs: Vec<libc::iovec>,
in_flight: usize, // The number of pending operations.
added: usize, // The number of ops added since the last call to `io_uring_enter`.
stats: URingStats,
}
impl URingContext {
/// Creates a `URingContext` where the underlying uring has a space for `num_entries`
/// simultaneous operations.
pub fn new(num_entries: usize) -> Result<URingContext> {
let ring_params = io_uring_params::default();
// The below unsafe block isolates the creation of the URingContext. Each step on it's own
// is unsafe. Using the uring FD for the mapping and the offsets returned by the kernel for
// base addresses maintains safety guarantees assuming the kernel API guarantees are
// trusted.
unsafe {
// Safe because the kernel is trusted to only modify params and `File` is created with
// an FD that it takes complete ownership of.
let fd = io_uring_setup(num_entries, &ring_params).map_err(Error::Setup)?;
let ring_file = File::from_raw_fd(fd);
// Mmap the submit and completion queues.
// Safe because we trust the kernel to set valid sizes in `io_uring_setup` and any error
// is checked.
let submit_ring = SubmitQueueState::new(
MemoryMapping::from_fd_offset_populate(
&ring_file,
ring_params.sq_off.array as usize
+ ring_params.sq_entries as usize * std::mem::size_of::<u32>(),
u64::from(IORING_OFF_SQ_RING),
)
.map_err(Error::MappingSubmitRing)?,
&ring_params,
);
let submit_queue_entries = SubmitQueueEntries {
mmap: MemoryMapping::from_fd_offset_populate(
&ring_file,
ring_params.sq_entries as usize * std::mem::size_of::<io_uring_sqe>(),
u64::from(IORING_OFF_SQES),
)
.map_err(Error::MappingSubmitEntries)?,
len: ring_params.sq_entries as usize,
};
let complete_ring = CompleteQueueState::new(
MemoryMapping::from_fd_offset_populate(
&ring_file,
ring_params.cq_off.cqes as usize
+ ring_params.cq_entries as usize * std::mem::size_of::<io_uring_cqe>(),
u64::from(IORING_OFF_CQ_RING),
)
.map_err(Error::MappingCompleteRing)?,
&ring_params,
);
Ok(URingContext {
ring_file,
submit_ring,
submit_queue_entries,
complete_ring,
io_vecs: vec![
libc::iovec {
iov_base: null_mut(),
iov_len: 0
};
num_entries
],
added: 0,
in_flight: 0,
stats: Default::default(),
})
}
}
// Call `f` with the next available sqe or return an error if none are available.
// After `f` returns, the sqe is appended to the kernel's queue.
fn prep_next_sqe<F>(&mut self, mut f: F) -> Result<()>
where
F: FnMut(&mut io_uring_sqe, &mut libc::iovec),
{
// Find the next free submission entry in the submit ring and fill it with an iovec.
// The below raw pointer derefs are safe because the memory the pointers use lives as long
// as the mmap in self.
let tail = self.submit_ring.pointers.tail(Ordering::Relaxed);
let next_tail = tail.wrapping_add(1);
if next_tail == self.submit_ring.pointers.head(Ordering::Acquire) {
return Err(Error::NoSpace);
}
// `tail` is the next sqe to use.
let index = (tail & self.submit_ring.ring_mask) as usize;
let sqe = self.submit_queue_entries.get_mut(index).unwrap();
f(sqe, &mut self.io_vecs[index]);
// Tells the kernel to use the new index when processing the entry at that index.
self.submit_ring.set_array_entry(index, index as u32);
// Ensure the above writes to sqe are seen before the tail is updated.
// set_tail uses Release ordering when storing to the ring.
self.submit_ring.pointers.set_tail(next_tail);
self.added += 1;
Ok(())
}
unsafe fn add_rw_op(
&mut self,
ptr: *const u8,
len: usize,
fd: RawFd,
offset: u64,
user_data: UserData,
op: u8,
) -> Result<()> {
self.prep_next_sqe(|sqe, iovec| {
iovec.iov_base = ptr as *const libc::c_void as *mut _;
iovec.iov_len = len;
sqe.opcode = op;
sqe.addr = iovec as *const _ as *const libc::c_void as u64;
sqe.len = 1;
sqe.__bindgen_anon_1.off = offset;
sqe.__bindgen_anon_3.__bindgen_anon_1.buf_index = 0;
sqe.ioprio = 0;
sqe.user_data = user_data;
sqe.flags = 0;
sqe.fd = fd;
})?;
Ok(())
}
/// Asynchronously writes to `fd` from the address given in `ptr`.
/// # Safety
/// `add_write` will write up to `len` bytes of data from the address given by `ptr`. This is
/// only safe if the caller guarantees that the memory lives until the transaction is complete
/// and that completion has been returned from the `wait` function. In addition there must not
/// be other references to the data pointed to by `ptr` until the operation completes. Ensure
/// that the fd remains open until the op completes as well.
pub unsafe fn add_write(
&mut self,
ptr: *const u8,
len: usize,
fd: RawFd,
offset: u64,
user_data: UserData,
) -> Result<()> {
self.add_rw_op(ptr, len, fd, offset, user_data, IORING_OP_WRITEV as u8)
}
/// Asynchronously reads from `fd` to the address given in `ptr`.
/// # Safety
/// `add_read` will write up to `len` bytes of data to the address given by `ptr`. This is only
/// safe if the caller guarantees there are no other references to that memory and that the
/// memory lives until the transaction is complete and that completion has been returned from
/// the `wait` function. In addition there must not be any mutable references to the data
/// pointed to by `ptr` until the operation completes. Ensure that the fd remains open until
/// the op completes as well.
pub unsafe fn add_read(
&mut self,
ptr: *mut u8,
len: usize,
fd: RawFd,
offset: u64,
user_data: UserData,
) -> Result<()> {
self.add_rw_op(ptr, len, fd, offset, user_data, IORING_OP_READV as u8)
}
/// Syncs all completed operations, the ordering with in-flight async ops is not
/// defined.
pub fn add_fsync(&mut self, fd: RawFd, user_data: UserData) -> Result<()> {
self.prep_next_sqe(|sqe, _iovec| {
sqe.opcode = IORING_OP_FSYNC as u8;
sqe.fd = fd;
sqe.user_data = user_data;
sqe.addr = 0;
sqe.len = 0;
sqe.__bindgen_anon_1.off = 0;
sqe.__bindgen_anon_3.__bindgen_anon_1.buf_index = 0;
sqe.__bindgen_anon_2.rw_flags = 0;
sqe.ioprio = 0;
sqe.flags = 0;
})
}
/// See the usage of `fallocate`, this asynchronously performs the same operations.
pub fn add_fallocate(
&mut self,
fd: RawFd,
offset: u64,
len: usize,
mode: u64,
user_data: UserData,
) -> Result<()> {
// Note that len for fallocate in passed in the addr field of the sqe and the mode uses the
// len field.
self.prep_next_sqe(|sqe, _iovec| {
sqe.opcode = IORING_OP_FALLOCATE as u8;
sqe.fd = fd;
sqe.addr = len as u64;
sqe.len = mode as u32;
sqe.__bindgen_anon_1.off = offset;
sqe.user_data = user_data;
sqe.__bindgen_anon_3.__bindgen_anon_1.buf_index = 0;
sqe.__bindgen_anon_2.rw_flags = 0;
sqe.ioprio = 0;
sqe.flags = 0;
})
}
/// Adds an FD to be polled based on the given flags.
/// The user must keep the FD open until the operation completion is returned from
/// `wait`.
/// Note that io_uring is always a one shot poll. After the fd is returned, it must be re-added
/// to get future events.
pub fn add_poll_fd(
&mut self,
fd: RawFd,
events: WatchingEvents,
user_data: UserData,
) -> Result<()> {
self.prep_next_sqe(|sqe, _iovec| {
sqe.opcode = IORING_OP_POLL_ADD as u8;
sqe.fd = fd;
sqe.user_data = user_data;
sqe.__bindgen_anon_2.poll_events = events.get_raw() as u16;
sqe.addr = 0;
sqe.len = 0;
sqe.__bindgen_anon_1.off = 0;
sqe.__bindgen_anon_3.__bindgen_anon_1.buf_index = 0;
sqe.ioprio = 0;
sqe.flags = 0;
})
}
/// Removes an FD that was previously added with `add_poll_fd`.
pub fn remove_poll_fd(
&mut self,
fd: RawFd,
events: WatchingEvents,
user_data: UserData,
) -> Result<()> {
self.prep_next_sqe(|sqe, _iovec| {
sqe.opcode = IORING_OP_POLL_REMOVE as u8;
sqe.fd = fd;
sqe.user_data = user_data;
sqe.__bindgen_anon_2.poll_events = events.get_raw() as u16;
sqe.addr = 0;
sqe.len = 0;
sqe.__bindgen_anon_1.off = 0;
sqe.__bindgen_anon_3.__bindgen_anon_1.buf_index = 0;
sqe.ioprio = 0;
sqe.flags = 0;
})
}
/// Sends operations added with the `add_*` functions to the kernel.
pub fn submit(&mut self) -> Result<()> {
self.in_flight += self.added;
self.stats.total_ops = self.stats.total_ops.wrapping_add(self.added as u64);
if self.added > 0 {
self.stats.total_enter_calls = self.stats.total_enter_calls.wrapping_add(1);
unsafe {
// Safe because the only memory modified is in the completion queue.
io_uring_enter(self.ring_file.as_raw_fd(), self.added as u64, 1, 0)
.map_err(Error::RingEnter)?;
}
}
self.added = 0;
Ok(())
}
/// Sends operations added with the `add_*` functions to the kernel and return an iterator to any
/// completed operations. `wait` blocks until at least one completion is ready. If called
/// without any new events added, this simply waits for any existing events to complete and
/// returns as soon an one or more is ready.
pub fn wait<'a>(
&'a mut self,
) -> Result<impl Iterator<Item = (UserData, std::io::Result<i32>)> + 'a> {
let completed = self.complete_ring.num_completed();
self.stats.total_complete = self.stats.total_complete.wrapping_add(completed as u64);
self.in_flight -= completed;
self.in_flight += self.added;
self.stats.total_ops = self.stats.total_ops.wrapping_add(self.added as u64);
if self.in_flight > 0 {
unsafe {
self.stats.total_enter_calls = self.stats.total_enter_calls.wrapping_add(1);
// Safe because the only memory modified is in the completion queue.
io_uring_enter(
self.ring_file.as_raw_fd(),
self.added as u64,
1,
IORING_ENTER_GETEVENTS,
)
.map_err(Error::RingEnter)?;
}
}
self.added = 0;
// The CompletionQueue will iterate all completed ops.
Ok(&mut self.complete_ring)
}
}
impl AsRawFd for URingContext {
fn as_raw_fd(&self) -> RawFd {
self.ring_file.as_raw_fd()
}
}
struct SubmitQueueEntries {
mmap: MemoryMapping,
len: usize,
}
impl SubmitQueueEntries {
fn get_mut(&mut self, index: usize) -> Option<&mut io_uring_sqe> {
if index >= self.len {
return None;
}
unsafe {
// Safe because the mut borrow of self resticts to one mutable reference at a time and
// we trust that the kernel has returned enough memory in io_uring_setup and mmap.
Some(&mut *(self.mmap.as_ptr() as *mut io_uring_sqe).add(index))
}
}
}
struct SubmitQueueState {
_mmap: MemoryMapping,
pointers: QueuePointers,
ring_mask: u32,
array: *mut u32,
}
impl SubmitQueueState {
// # Safety
// Safe iff `mmap` is created by mapping from a uring FD at the SQ_RING offset and params is
// the params struct passed to io_uring_setup.
unsafe fn new(mmap: MemoryMapping, params: &io_uring_params) -> SubmitQueueState {
let ptr = mmap.as_ptr();
// Transmutes are safe because a u32 is atomic on all supported architectures and the
// pointer will live until after self is dropped because the mmap is owned.
let head = ptr.add(params.sq_off.head as usize) as *const AtomicU32;
let tail = ptr.add(params.sq_off.tail as usize) as *const AtomicU32;
// This offset is guaranteed to be within the mmap so unwrap the result.
let ring_mask = mmap.read_obj(params.sq_off.ring_mask as usize).unwrap();
let array = ptr.add(params.sq_off.array as usize) as *mut u32;
SubmitQueueState {
_mmap: mmap,
pointers: QueuePointers { head, tail },
ring_mask,
array,
}
}
// Sets the kernel's array entry at the given `index` to `value`.
fn set_array_entry(&self, index: usize, value: u32) {
// Safe because self being constructed from the correct mmap guaratees that the memory is
// valid to written.
unsafe {
std::ptr::write_volatile(self.array.add(index), value as u32);
}
}
}
struct CompleteQueueState {
mmap: MemoryMapping,
pointers: QueuePointers,
ring_mask: u32,
cqes_offset: u32,
completed: usize,
}
impl CompleteQueueState {
/// # Safety
/// Safe iff `mmap` is created by mapping from a uring FD at the CQ_RING offset and params is
/// the params struct passed to io_uring_setup.
unsafe fn new(mmap: MemoryMapping, params: &io_uring_params) -> CompleteQueueState {
let ptr = mmap.as_ptr();
let head = ptr.add(params.cq_off.head as usize) as *const AtomicU32;
let tail = ptr.add(params.cq_off.tail as usize) as *const AtomicU32;
let ring_mask = mmap.read_obj(params.cq_off.ring_mask as usize).unwrap();
CompleteQueueState {
mmap,
pointers: QueuePointers { head, tail },
ring_mask,
cqes_offset: params.cq_off.cqes,
completed: 0,
}
}
fn get_cqe(&self, head: u32) -> &io_uring_cqe {
unsafe {
// Safe because we trust that the kernel has returned enough memory in io_uring_setup
// and mmap and index is checked within range by the ring_mask.
let cqes = (self.mmap.as_ptr() as *const u8).add(self.cqes_offset as usize)
as *const io_uring_cqe;
let index = head & self.ring_mask;
&*cqes.add(index as usize)
}
}
fn num_completed(&mut self) -> usize {
std::mem::replace(&mut self.completed, 0)
}
}
// Return the completed ops with their result.
impl Iterator for CompleteQueueState {
type Item = (UserData, std::io::Result<i32>);
fn next(&mut self) -> Option<Self::Item> {
// Safe because the pointers to the atomics are valid and the cqe must be in range
// because the kernel provided mask is applied to the index.
let head = self.pointers.head(Ordering::Relaxed);
// Synchronize the read of tail after the read of head.
if head == self.pointers.tail(Ordering::Acquire) {
return None;
}
self.completed += 1;
let cqe = self.get_cqe(head);
let user_data = cqe.user_data;
let res = cqe.res;
// Store the new head and ensure the reads above complete before the kernel sees the
// update to head, `set_head` uses `Release` ordering
let new_head = head.wrapping_add(1);
self.pointers.set_head(new_head);
let io_res = match res {
r if r < 0 => Err(std::io::Error::from_raw_os_error(r)),
r => Ok(r),
};
Some((user_data, io_res))
}
}
struct QueuePointers {
head: *const AtomicU32,
tail: *const AtomicU32,
}
impl QueuePointers {
// Loads the tail pointer atomically with the given ordering.
fn tail(&self, ordering: Ordering) -> u32 {
// Safe because self being constructed from the correct mmap guaratees that the memory is
// valid to read.
unsafe { (*self.tail).load(ordering) }
}
// Stores the new value of the tail in the submit queue. This allows the kernel to start
// processing entries that have been added up until the given tail pointer.
// Always stores with release ordering as that is the only valid way to use the pointer.
fn set_tail(&self, next_tail: u32) {
// Safe because self being constructed from the correct mmap guaratees that the memory is
// valid to read and it's used as an atomic to cover mutability concerns.
unsafe { (*self.tail).store(next_tail, Ordering::Release) }
}
// Loads the head pointer atomically with the given ordering.
fn head(&self, ordering: Ordering) -> u32 {
// Safe because self being constructed from the correct mmap guaratees that the memory is
// valid to read.
unsafe { (*self.head).load(ordering) }
}
// Stores the new value of the head in the submit queue. This allows the kernel to start
// processing entries that have been added up until the given head pointer.
// Always stores with release ordering as that is the only valid way to use the pointer.
fn set_head(&self, next_head: u32) {
// Safe because self being constructed from the correct mmap guaratees that the memory is
// valid to read and it's used as an atomic to cover mutability concerns.
unsafe { (*self.head).store(next_head, Ordering::Release) }
}
}
#[cfg(test)]
mod tests {
use std::fs::OpenOptions;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::time::Duration;
use sys_util::PollContext;
use tempfile::TempDir;
use super::*;
fn append_file_name(path: &Path, name: &str) -> PathBuf {
let mut joined = path.to_path_buf();
joined.push(name);
joined
}
#[test]
fn read_one_block_at_a_time() {
let tempdir = TempDir::new().unwrap();
let file_path = append_file_name(tempdir.path(), "test");
let queue_size = 128;
let mut uring = URingContext::new(queue_size).unwrap();
let mut buf = [0u8; 0x1000];
let f = OpenOptions::new()
.read(true)
.write(true)
.create(true)
.truncate(true)
.open(&file_path)
.unwrap();
f.set_len(0x1000 * 2).unwrap();
for i in 0..queue_size * 2 {
unsafe {
// Safe because the `wait` call waits until the kernel is done with`buf`.
let index = i as u64;
uring
.add_read(
buf.as_mut_ptr(),
buf.len(),
f.as_raw_fd(),
(index % 2) * 0x1000,
index,
)
.unwrap();
let (user_data, res) = uring.wait().unwrap().next().unwrap();
assert_eq!(user_data, i as UserData);
assert_eq!(res.unwrap(), buf.len() as i32);
}
}
}
#[test]
fn write_one_block() {
let tempdir = TempDir::new().unwrap();
let file_path = append_file_name(tempdir.path(), "test");
let mut uring = URingContext::new(16).unwrap();
let mut buf = [0u8; 4096];
let mut f = OpenOptions::new()
.read(true)
.write(true)
.create(true)
.truncate(true)
.open(&file_path)
.unwrap();
f.write(&buf).unwrap();
f.write(&buf).unwrap();
unsafe {
// Safe because the `wait` call waits until the kernel is done mutating `buf`.
uring
.add_write(buf.as_mut_ptr(), buf.len(), f.as_raw_fd(), 0, 55)
.unwrap();
let (user_data, res) = uring.wait().unwrap().next().unwrap();
assert_eq!(user_data, 55 as UserData);
assert_eq!(res.unwrap(), buf.len() as i32);
}
}
#[test]
fn write_one_submit_poll() {
let tempdir = TempDir::new().unwrap();
let file_path = append_file_name(tempdir.path(), "test");
let mut uring = URingContext::new(16).unwrap();
let mut buf = [0u8; 4096];
let mut f = OpenOptions::new()
.read(true)
.write(true)
.create(true)
.truncate(true)
.open(&file_path)
.unwrap();
f.write(&buf).unwrap();
f.write(&buf).unwrap();
let ctx: PollContext<u64> = PollContext::build_with(&[(&uring, 1)]).unwrap();
{
// Test that the uring context isn't readable before any events are complete.
let events = ctx.wait_timeout(Duration::from_millis(1)).unwrap();
assert!(events.iter_readable().next().is_none());
}
unsafe {
// Safe because the `wait` call waits until the kernel is done mutating `buf`.
uring
.add_write(buf.as_mut_ptr(), buf.len(), f.as_raw_fd(), 0, 55)
.unwrap();
uring.submit().unwrap();
// Poll for completion with epoll.
let events = ctx.wait().unwrap();
let event = events.iter_readable().next().unwrap();
assert_eq!(event.token(), 1);
let (user_data, res) = uring.wait().unwrap().next().unwrap();
assert_eq!(user_data, 55 as UserData);
assert_eq!(res.unwrap(), buf.len() as i32);
}
}
#[test]
fn fallocate_fsync() {
let tempdir = TempDir::new().unwrap();
let file_path = append_file_name(tempdir.path(), "test");
{
let buf = [0u8; 4096];
let mut f = OpenOptions::new()
.read(true)
.write(true)
.create(true)
.truncate(true)
.open(&file_path)
.unwrap();
f.write(&buf).unwrap();
}
let init_size = std::fs::metadata(&file_path).unwrap().len() as usize;
let set_size = init_size + 1024 * 1024 * 50;
let f = OpenOptions::new()
.read(true)
.write(true)
.create(true)
.open(&file_path)
.unwrap();
let mut uring = URingContext::new(16).unwrap();
uring
.add_fallocate(f.as_raw_fd(), 0, set_size, 0, 66)
.unwrap();
let (user_data, res) = uring.wait().unwrap().next().unwrap();
assert_eq!(user_data, 66 as UserData);
assert_eq!(res.unwrap(), 0 as i32);
uring.add_fsync(f.as_raw_fd(), 67).unwrap();
let (user_data, res) = uring.wait().unwrap().next().unwrap();
assert_eq!(user_data, 67 as UserData);
assert_eq!(res.unwrap(), 0 as i32);
uring
.add_fallocate(
f.as_raw_fd(),
init_size as u64,
set_size - init_size,
(libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE) as u64,
68,
)
.unwrap();
let (user_data, res) = uring.wait().unwrap().next().unwrap();
assert_eq!(user_data, 68 as UserData);
assert_eq!(res.unwrap(), 0 as i32);
drop(f); // Close to ensure directory entires for metadata are updated.
let new_size = std::fs::metadata(&file_path).unwrap().len() as usize;
assert_eq!(new_size, set_size);
}
#[test]
fn dev_zero_readable() {
let f = File::open(Path::new("/dev/zero")).unwrap();
let mut uring = URingContext::new(16).unwrap();
uring
.add_poll_fd(f.as_raw_fd(), WatchingEvents::empty().set_read(), 454)
.unwrap();
let (user_data, res) = uring.wait().unwrap().next().unwrap();
assert_eq!(user_data, 454 as UserData);
assert_eq!(res.unwrap(), 1 as i32);
}
}