From 13b958b967b0453cd6c1770c86a1028fb4a0d02d Mon Sep 17 00:00:00 2001 From: Zihan Chen Date: Thu, 14 Nov 2024 20:35:36 -0800 Subject: [PATCH] disk: Add seekable zstd disk support A raw disk image can be compressed as a seekable zstd and attached transparently to a VM as a block device. TESTED=can ro mount and read seekable compressed debian rootfs BUG=b:377945783 Change-Id: Iba1950dbfc0ba99b0581e842964848d5a447b824 Reviewed-on: https://chromium-review.googlesource.com/c/crosvm/crosvm/+/6024317 Commit-Queue: Zihan Chen Reviewed-by: Daniel Verkamp Auto-Submit: Zihan Chen --- Cargo.lock | 30 +++ Cargo.toml | 6 + disk/Cargo.toml | 5 +- disk/src/disk.rs | 33 +++- disk/src/zstd.rs | 483 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 554 insertions(+), 3 deletions(-) create mode 100644 disk/src/zstd.rs diff --git a/Cargo.lock b/Cargo.lock index a82ae81dcd..50498b3893 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1094,6 +1094,7 @@ dependencies = [ name = "disk" version = "0.1.0" dependencies = [ + "anyhow", "async-trait", "base", "cfg-if", @@ -1113,6 +1114,7 @@ dependencies = [ "vm_memory", "winapi", "zerocopy", + "zstd", ] [[package]] @@ -3655,3 +3657,31 @@ name = "zeroize" version = "1.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c394b5bd0c6f669e7275d9c20aa90ae064cb22e75a1cad54e1b34088034b149f" + +[[package]] +name = "zstd" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.13+zstd.1.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml index ae3f39c98a..f94461df05 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -139,6 +139,11 @@ balloon = ["devices/balloon", "vm_control/balloon"] ## concatenate large file system images into a single disk image. composite-disk = ["protos/composite-disk", "protobuf", "disk/composite-disk"] +## Enables support for using a seekable zstd archive of a raw disk image as a read-only disk. +## See [Format Specs](https://github.com/facebook/zstd/tree/v1.5.6/contrib/seekable_format) for +## more information. +zstd-disk = ["disk/zstd-disk"] + ## Enables virtiofs uid-gid mapping from the host side through command line when user-namespace ## isn't available for non-root users. This format is supported only for vhost-user-fs. fs_runtime_ugid_map = ["devices/fs_runtime_ugid_map"] @@ -402,6 +407,7 @@ all-default = [ "vtpm", "wl-dmabuf", "x", + "zstd-disk" ] ## All features that are compiled and tested for aarch64 diff --git a/disk/Cargo.toml b/disk/Cargo.toml index 39f4b47eb4..79237ecff6 100644 --- a/disk/Cargo.toml +++ b/disk/Cargo.toml @@ -11,8 +11,10 @@ path = "src/disk.rs" android-sparse = [] composite-disk = ["crc32fast", "protos", "protobuf", "uuid"] qcow = [] +zstd-disk = ["zstd"] [dependencies] +anyhow = "*" async-trait = "0.1.36" base = { path = "../base" } cfg-if = "1.0.0" @@ -23,12 +25,13 @@ libc = "0.2" protobuf = { version = "3.2", optional = true } protos = { path = "../protos", features = ["composite-disk"], optional = true } remain = "0.2" -serde = { version = "1", features = [ "derive" ] } +serde = { version = "1", features = ["derive"] } sync = { path = "../common/sync" } thiserror = "1" uuid = { version = "1", features = ["v4"], optional = true } vm_memory = { path = "../vm_memory" } zerocopy = { version = "0.7", features = ["derive"] } +zstd = { version = "0.13", optional = true } [target.'cfg(windows)'.dependencies] 
winapi = "0.3" diff --git a/disk/src/disk.rs b/disk/src/disk.rs index 324dcdc2db..25b3273faa 100644 --- a/disk/src/disk.rs +++ b/disk/src/disk.rs @@ -65,6 +65,17 @@ use android_sparse::AndroidSparse; use android_sparse::SPARSE_HEADER_MAGIC; use sys::read_from_disk; +#[cfg(feature = "zstd")] +mod zstd; +#[cfg(feature = "zstd")] +use zstd::ZstdDisk; +#[cfg(feature = "zstd")] +use zstd::ZSTD_FRAME_MAGIC; +#[cfg(feature = "zstd")] +use zstd::ZSTD_SKIPPABLE_MAGIC_HIGH; +#[cfg(feature = "zstd")] +use zstd::ZSTD_SKIPPABLE_MAGIC_LOW; + /// Nesting depth limit for disk formats that can open other disk files. const MAX_NESTING_DEPTH: u32 = 10; @@ -80,6 +91,9 @@ pub enum Error { #[cfg(feature = "composite-disk")] #[error("failure in composite disk: {0}")] CreateCompositeDisk(composite::Error), + #[cfg(feature = "zstd")] + #[error("failure in zstd disk: {0}")] + CreateZstdDisk(anyhow::Error), #[error("failure creating single file disk: {0}")] CreateSingleFileDisk(cros_async::AsyncError), #[error("failed to set O_DIRECT on disk image: {0}")] @@ -201,6 +215,7 @@ pub enum ImageType { Qcow2, CompositeDisk, AndroidSparse, + Zstd, } /// Detect the type of an image file by checking for a valid header of the supported formats. @@ -239,8 +254,12 @@ pub fn detect_image_type(file: &File, overlapped_mode: bool) -> Result>::try_into(v).ok()) + { #[cfg(feature = "qcow")] if magic4 == QCOW_MAGIC.to_be_bytes() { return Ok(ImageType::Qcow2); @@ -249,6 +268,13 @@ pub fn detect_image_type(file: &File, overlapped_mode: bool) -> Result= ZSTD_SKIPPABLE_MAGIC_LOW + && u32::from_le_bytes(magic4) <= ZSTD_SKIPPABLE_MAGIC_HIGH) + { + return Ok(ImageType::Zstd); + } } Ok(ImageType::Raw) @@ -306,6 +332,9 @@ pub fn open_disk_file(params: DiskFileParams) -> Result> { Box::new(AndroidSparse::from_file(raw_image).map_err(Error::CreateAndroidSparseDisk)?) as Box } + #[cfg(feature = "zstd")] + ImageType::Zstd => Box::new(ZstdDisk::from_file(raw_image).map_err(Error::CreateZstdDisk)?) 
+ as Box, #[allow(unreachable_patterns)] _ => return Err(Error::UnknownType), }) diff --git a/disk/src/zstd.rs b/disk/src/zstd.rs new file mode 100644 index 0000000000..c48c64fa67 --- /dev/null +++ b/disk/src/zstd.rs @@ -0,0 +1,483 @@ +// Copyright 2024 The ChromiumOS Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +//! Use seekable zstd archive of raw disk image as read only disk + +use std::cmp::min; +use std::fs::File; +use std::io; +use std::io::ErrorKind; +use std::io::Read; +use std::io::Seek; +use std::sync::Arc; + +use anyhow::bail; +use anyhow::Context; +use async_trait::async_trait; +use base::AsRawDescriptor; +use base::FileAllocate; +use base::FileReadWriteAtVolatile; +use base::FileSetLen; +use base::RawDescriptor; +use base::VolatileSlice; +use cros_async::BackingMemory; +use cros_async::Executor; +use cros_async::IoSource; + +use crate::AsyncDisk; +use crate::DiskFile; +use crate::DiskGetLen; +use crate::Error as DiskError; +use crate::Result as DiskResult; +use crate::ToAsyncDisk; + +// Zstandard frame magic +pub const ZSTD_FRAME_MAGIC: u32 = 0xFD2FB528; + +// Skippable frame magic can be anything between [0x184D2A50, 0x184D2A5F] +pub const ZSTD_SKIPPABLE_MAGIC_LOW: u32 = 0x184D2A50; +pub const ZSTD_SKIPPABLE_MAGIC_HIGH: u32 = 0x184D2A5F; +pub const ZSTD_SEEK_TABLE_MAGIC: u32 = 0x8F92EAB1; + +pub const ZSTD_DEFAULT_FRAME_SIZE: usize = 128 << 10; // 128KB + +#[derive(Clone, Debug)] +pub struct ZstdSeekTable { + // Cumulative sum of decompressed sizes of all frames before the indexed frame. + // The last element is the total decompressed size of the zstd archive. + cumulative_decompressed_sizes: Vec, + // Cumulative sum of compressed sizes of all frames before the indexed frame. + // The last element is the total compressed size of the zstd archive. 
+ cumulative_compressed_sizes: Vec, +} + +impl ZstdSeekTable { + /// Read seek table entries from seek_table_entries + pub fn from_footer( + seek_table_entries: &[u8], + num_frames: u32, + checksum_flag: bool, + ) -> anyhow::Result { + let mut cumulative_decompressed_size: u64 = 0; + let mut cumulative_compressed_size: u64 = 0; + let mut cumulative_decompressed_sizes = Vec::with_capacity(num_frames as usize + 1); + let mut cumulative_compressed_sizes = Vec::with_capacity(num_frames as usize + 1); + let mut offset = 0; + cumulative_decompressed_sizes.push(0); + cumulative_compressed_sizes.push(0); + for _ in 0..num_frames { + let compressed_size = u32::from_le_bytes( + seek_table_entries + .get(offset..offset + 4) + .context("failed to parse seektable entry")? + .try_into()?, + ); + let decompressed_size = u32::from_le_bytes( + seek_table_entries + .get(offset + 4..offset + 8) + .context("failed to parse seektable entry")? + .try_into()?, + ); + cumulative_decompressed_size += decompressed_size as u64; + cumulative_compressed_size += compressed_size as u64; + cumulative_decompressed_sizes.push(cumulative_decompressed_size); + cumulative_compressed_sizes.push(cumulative_compressed_size); + offset += 8 + (checksum_flag as usize * 4); + } + cumulative_decompressed_sizes.push(cumulative_decompressed_size); + cumulative_compressed_sizes.push(cumulative_compressed_size); + + Ok(ZstdSeekTable { + cumulative_decompressed_sizes, + cumulative_compressed_sizes, + }) + } + + /// Returns the index of the frame that contains the given decompressed offset. 
+ pub fn find_frame_index(&self, decompressed_offset: u64) -> Option { + if self.cumulative_decompressed_sizes.is_empty() + || decompressed_offset >= *self.cumulative_decompressed_sizes.last().unwrap() + { + return None; + } + self.cumulative_decompressed_sizes + .partition_point(|&size| size <= decompressed_offset) + .checked_sub(1) + } +} + +#[derive(Debug)] +pub struct ZstdDisk { + file: File, + seek_table: ZstdSeekTable, +} + +impl ZstdDisk { + pub fn from_file(mut file: File) -> anyhow::Result { + // Verify file is large enough to contain a seek table (17 bytes) + if file.metadata()?.len() < 17 { + return Err(anyhow::anyhow!("File too small to contain zstd seek table")); + } + + // Read last 9 bytes as seek table footer + let mut seektable_footer = [0u8; 9]; + file.seek(std::io::SeekFrom::End(-9))?; + file.read_exact(&mut seektable_footer)?; + + // Verify last 4 bytes of footer is seek table magic + if u32::from_le_bytes(seektable_footer[5..9].try_into()?) != ZSTD_SEEK_TABLE_MAGIC { + return Err(anyhow::anyhow!("Invalid zstd seek table magic")); + } + + // Get number of frame from seek table + let num_frames = u32::from_le_bytes(seektable_footer[0..4].try_into()?); + + // Read flags from seek table descriptor + let checksum_flag = (seektable_footer[4] >> 7) & 1 != 0; + if (seektable_footer[4] & 0x7C) != 0 { + bail!( + "This zstd seekable decoder cannot parse seek table with non-zero reserved flags" + ); + } + + let seek_table_entries_size = num_frames * (8 + (checksum_flag as u32 * 4)); + + // Seek to the beginning of the seek table + file.seek(std::io::SeekFrom::End( + -(9 + seek_table_entries_size as i64), + ))?; + + // Return new ZstdDisk + let mut seek_table_entries: Vec = vec![0u8; seek_table_entries_size as usize]; + file.read_exact(&mut seek_table_entries)?; + + let seek_table = + ZstdSeekTable::from_footer(&seek_table_entries, num_frames, checksum_flag)?; + + Ok(ZstdDisk { file, seek_table }) + } +} + +impl DiskGetLen for ZstdDisk { + fn get_len(&self) 
-> std::io::Result { + self.seek_table + .cumulative_decompressed_sizes + .last() + .copied() + .ok_or(io::ErrorKind::InvalidData.into()) + } +} + +impl FileSetLen for ZstdDisk { + fn set_len(&self, _len: u64) -> std::io::Result<()> { + Err(io::Error::new( + io::ErrorKind::PermissionDenied, + "unsupported operation", + )) + } +} + +impl AsRawDescriptor for ZstdDisk { + fn as_raw_descriptor(&self) -> RawDescriptor { + self.file.as_raw_descriptor() + } +} + +struct CompressedReadInstruction { + frame_index: usize, + read_offset: u64, + read_size: u64, +} + +fn compresed_frame_read_instruction( + seek_table: &ZstdSeekTable, + offset: u64, +) -> anyhow::Result { + let frame_index = seek_table + .find_frame_index(offset) + .with_context(|| format!("no frame for offset {}", offset))?; + let compressed_offset = seek_table.cumulative_compressed_sizes[frame_index]; + let next_compressed_offset = seek_table + .cumulative_compressed_sizes + .get(frame_index + 1) + .context("Offset out of range (next_compressed_offset overflow)")?; + let compressed_size = next_compressed_offset - compressed_offset; + Ok(CompressedReadInstruction { + frame_index, + read_offset: compressed_offset, + read_size: compressed_size, + }) +} + +impl FileReadWriteAtVolatile for ZstdDisk { + fn read_at_volatile(&self, slice: VolatileSlice, offset: u64) -> io::Result { + let read_instruction = compresed_frame_read_instruction(&self.seek_table, offset) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + let mut compressed_data = vec![0u8; read_instruction.read_size as usize]; + + let compressed_frame_slice = VolatileSlice::new(compressed_data.as_mut_slice()); + + self.file + .read_at_volatile(compressed_frame_slice, read_instruction.read_offset) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + + let mut decompressor: zstd::bulk::Decompressor<'_> = zstd::bulk::Decompressor::new()?; + let mut decompressed_data = Vec::with_capacity(ZSTD_DEFAULT_FRAME_SIZE); + let decoded_size = + 
decompressor.decompress_to_buffer(&compressed_data, &mut decompressed_data)?; + + let decompressed_offset_in_frame = + offset - self.seek_table.cumulative_decompressed_sizes[read_instruction.frame_index]; + + if decompressed_offset_in_frame >= decoded_size as u64 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "BUG: Frame offset larger than decoded size", + )); + } + + let read_len = min( + slice.size() as u64, + (decoded_size as u64) - decompressed_offset_in_frame, + ) as usize; + let data_to_copy = &decompressed_data[decompressed_offset_in_frame as usize..][..read_len]; + slice + .sub_slice(0, read_len) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))? + .copy_from(data_to_copy); + Ok(data_to_copy.len()) + } + + fn write_at_volatile(&self, _slice: VolatileSlice, _offset: u64) -> io::Result { + Err(io::Error::new( + io::ErrorKind::PermissionDenied, + "unsupported operation", + )) + } +} + +pub struct AsyncZstdDisk { + inner: IoSource, + seek_table: ZstdSeekTable, +} + +impl ToAsyncDisk for ZstdDisk { + fn to_async_disk(self: Box, ex: &Executor) -> DiskResult> { + Ok(Box::new(AsyncZstdDisk { + inner: ex.async_from(self.file).map_err(DiskError::ToAsync)?, + seek_table: self.seek_table, + })) + } +} + +impl DiskGetLen for AsyncZstdDisk { + fn get_len(&self) -> io::Result { + self.seek_table + .cumulative_decompressed_sizes + .last() + .copied() + .ok_or(io::ErrorKind::InvalidData.into()) + } +} + +impl FileSetLen for AsyncZstdDisk { + fn set_len(&self, _len: u64) -> io::Result<()> { + Err(io::Error::new( + io::ErrorKind::PermissionDenied, + "unsupported operation", + )) + } +} + +impl FileAllocate for AsyncZstdDisk { + fn allocate(&self, _offset: u64, _length: u64) -> io::Result<()> { + Err(io::Error::new( + io::ErrorKind::PermissionDenied, + "unsupported operation", + )) + } +} + +#[async_trait(?Send)] +impl AsyncDisk for AsyncZstdDisk { + async fn flush(&self) -> DiskResult<()> { + // zstd is read-only, nothing to flush. 
+ Ok(()) + } + + async fn fsync(&self) -> DiskResult<()> { + // Do nothing because it's read-only. + Ok(()) + } + + async fn fdatasync(&self) -> DiskResult<()> { + // Do nothing because it's read-only. + Ok(()) + } + + /// Reads data from `file_offset` of decompressed disk image till the end of current + /// zstd frame and write them into memory `mem` at `mem_offsets`. This function should + /// function the same as running `preadv()` on decompressed zstd image and reading into + /// the array of `iovec`s specified with `mem` and `mem_offsets`. + async fn read_to_mem<'a>( + &'a self, + file_offset: u64, + mem: Arc, + mem_offsets: cros_async::MemRegionIter<'a>, + ) -> DiskResult { + let read_instruction = compresed_frame_read_instruction(&self.seek_table, file_offset) + .map_err(|e| DiskError::ReadingData(io::Error::new(io::ErrorKind::InvalidData, e)))?; + + let compressed_data = vec![0u8; read_instruction.read_size as usize]; + + let (compressed_read_size, compressed_data) = self + .inner + .read_to_vec(Some(read_instruction.read_offset), compressed_data) + .await + .map_err(|e| DiskError::ReadingData(io::Error::new(ErrorKind::Other, e)))?; + + if compressed_read_size != read_instruction.read_size as usize { + return Err(DiskError::ReadingData(io::Error::new( + ErrorKind::UnexpectedEof, + "Read from compressed data result in wrong length", + ))); + } + + let mut decompressor: zstd::bulk::Decompressor<'_> = + zstd::bulk::Decompressor::new().map_err(DiskError::ReadingData)?; + let mut decompressed_data = Vec::with_capacity(ZSTD_DEFAULT_FRAME_SIZE); + let decoded_size = decompressor + .decompress_to_buffer(&compressed_data, &mut decompressed_data) + .map_err(DiskError::ReadingData)?; + + let decompressed_offset_in_frame = file_offset + - self.seek_table.cumulative_decompressed_sizes[read_instruction.frame_index]; + + if decompressed_offset_in_frame as usize > decoded_size { + return Err(DiskError::ReadingData(io::Error::new( + ErrorKind::InvalidData, + "BUG: Frame 
offset larger than decoded size", + ))); + } + + // Copy the decompressed data to the provided memory regions. + let mut total_copied = 0; + for mem_region in mem_offsets { + let src_slice = + &decompressed_data[decompressed_offset_in_frame as usize + total_copied..]; + let dst_slice = mem + .get_volatile_slice(mem_region) + .map_err(DiskError::GuestMemory)?; + + let to_copy = min(src_slice.len(), dst_slice.size()); + + if to_copy > 0 { + dst_slice + .sub_slice(0, to_copy) + .map_err(|e| DiskError::ReadingData(io::Error::new(ErrorKind::Other, e)))? + .copy_from(&src_slice[..to_copy]); + + total_copied += to_copy; + + // if fully copied destination buffers, break the loop. + if total_copied == dst_slice.size() { + break; + } + } + } + + Ok(total_copied) + } + + async fn write_from_mem<'a>( + &'a self, + _file_offset: u64, + _mem: Arc, + _mem_offsets: cros_async::MemRegionIter<'a>, + ) -> DiskResult { + Err(DiskError::UnsupportedOperation) + } + + async fn punch_hole(&self, _file_offset: u64, _length: u64) -> DiskResult<()> { + Err(DiskError::UnsupportedOperation) + } + + async fn write_zeroes_at(&self, _file_offset: u64, _length: u64) -> DiskResult<()> { + Err(DiskError::UnsupportedOperation) + } +} + +impl DiskFile for ZstdDisk {} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_find_frame_index_empty() { + let seek_table = ZstdSeekTable { + cumulative_decompressed_sizes: vec![0], + cumulative_compressed_sizes: vec![0], + }; + assert_eq!(seek_table.find_frame_index(0), None); + assert_eq!(seek_table.find_frame_index(5), None); + } + + #[test] + fn test_find_frame_index_single_frame() { + let seek_table = ZstdSeekTable { + cumulative_decompressed_sizes: vec![0, 100], + cumulative_compressed_sizes: vec![0, 50], + }; + assert_eq!(seek_table.find_frame_index(0), Some(0)); + assert_eq!(seek_table.find_frame_index(50), Some(0)); + assert_eq!(seek_table.find_frame_index(99), Some(0)); + assert_eq!(seek_table.find_frame_index(100), None); + } + + #[test] 
+ fn test_find_frame_index_multiple_frames() { + let seek_table = ZstdSeekTable { + cumulative_decompressed_sizes: vec![0, 100, 300, 500], + cumulative_compressed_sizes: vec![0, 50, 120, 200], + }; + assert_eq!(seek_table.find_frame_index(0), Some(0)); + assert_eq!(seek_table.find_frame_index(99), Some(0)); + assert_eq!(seek_table.find_frame_index(100), Some(1)); + assert_eq!(seek_table.find_frame_index(299), Some(1)); + assert_eq!(seek_table.find_frame_index(300), Some(2)); + assert_eq!(seek_table.find_frame_index(499), Some(2)); + assert_eq!(seek_table.find_frame_index(500), None); + assert_eq!(seek_table.find_frame_index(1000), None); + } + + #[test] + fn test_find_frame_index_with_skippable_frames() { + let seek_table = ZstdSeekTable { + cumulative_decompressed_sizes: vec![0, 100, 100, 100, 300], + cumulative_compressed_sizes: vec![0, 50, 60, 70, 150], + }; + assert_eq!(seek_table.find_frame_index(0), Some(0)); + assert_eq!(seek_table.find_frame_index(99), Some(0)); + // Correctly skips the skippable frames. + assert_eq!(seek_table.find_frame_index(100), Some(3)); + assert_eq!(seek_table.find_frame_index(299), Some(3)); + assert_eq!(seek_table.find_frame_index(300), None); + } + + #[test] + fn test_find_frame_index_with_last_skippable_frame() { + let seek_table = ZstdSeekTable { + cumulative_decompressed_sizes: vec![0, 20, 40, 40, 60, 60, 80, 80], + cumulative_compressed_sizes: vec![0, 10, 20, 30, 40, 50, 60, 70], + }; + assert_eq!(seek_table.find_frame_index(0), Some(0)); + assert_eq!(seek_table.find_frame_index(20), Some(1)); + assert_eq!(seek_table.find_frame_index(21), Some(1)); + assert_eq!(seek_table.find_frame_index(79), Some(5)); + assert_eq!(seek_table.find_frame_index(80), None); + assert_eq!(seek_table.find_frame_index(300), None); + } +}