From 8704d227508f448ccd06ffd011aa0aa7f71abf1b Mon Sep 17 00:00:00 2001 From: Zixuan Chen Date: Thu, 13 Jul 2023 13:33:23 +0800 Subject: [PATCH] feat: compact bytes init --- crates/compact-bytes/Cargo.toml | 10 ++ crates/compact-bytes/README.md | 59 ++++++++++++ crates/compact-bytes/src/lib.rs | 161 ++++++++++++++++++++++++++++++++ 3 files changed, 230 insertions(+) create mode 100644 crates/compact-bytes/Cargo.toml create mode 100644 crates/compact-bytes/README.md create mode 100644 crates/compact-bytes/src/lib.rs diff --git a/crates/compact-bytes/Cargo.toml b/crates/compact-bytes/Cargo.toml new file mode 100644 index 00000000..58488aa1 --- /dev/null +++ b/crates/compact-bytes/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "compact-bytes" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +append-only-bytes = "0.1.8" +fxhash = "0.2.1" diff --git a/crates/compact-bytes/README.md b/crates/compact-bytes/README.md new file mode 100644 index 00000000..e099c712 --- /dev/null +++ b/crates/compact-bytes/README.md @@ -0,0 +1,59 @@ +# compact-bytes + +It's an append-only bytes arena. Appending new bytes returns a pointer to a +slice of the append-only bytes. It will try to reuse the allocated old bytes to +reduce memory usage, if possible. + +# Example + +```rust +use compact_bytes::CompactBytes; + +let mut arena = CompactBytes::new(); +let bytes1 = arena.alloc(b"hello"); +let bytes2 = arena.alloc(b"world"); +assert_eq!(bytes1.as_bytes(), b"hello"); +assert_eq!(bytes2.as_bytes(), b"world"); + +// bytes3 will be a pointer to the same bytes as bytes1 +let bytes3 = arena.alloc(b"hello"); +assert_eq!(bytes3.as_bytes(), b"hello"); +assert_eq!(bytes3.start(), bytes1.start()); +assert_eq!(bytes3.start(), 0); +assert_eq!(bytes3.end(), 5); + +// Allocating short bytes will not reuse the old bytes.
+// This makes merging neighboring slices easier, so that the serialized +// bytes are more compact. +let mut bytes4 = arena.alloc(b"h"); +assert_eq!(bytes4.start(), 10); +let bytes5 = arena.alloc(b"e"); +assert_eq!(bytes5.start(), 11); +// bytes4 and bytes5 can be merged +assert_eq!(bytes4.can_merge(&bytes5), true); +assert!(bytes4.try_merge(&bytes5).is_ok()); +``` + +In advance mode (`alloc_advance`), it will try to reuse the old bytes as much as possible, +so it breaks the bytes into small pieces to reuse them. + +```rust +use compact_bytes::CompactBytes; +use append_only_bytes::BytesSlice; + +let mut arena = CompactBytes::new(); +let bytes1 = arena.alloc(b"hello"); +// it breaks the bytes into 3 pieces "hi ", "hello", " world" +let bytes2: Vec<BytesSlice> = arena.alloc_advance(b"hi hello world"); +``` + +Or you can use `append` to not reuse the old bytes at all. + +```rust +use compact_bytes::CompactBytes; + +let mut arena = CompactBytes::new(); +let bytes1 = arena.alloc(b"hello"); +let bytes2 = arena.append(b"hello"); +assert_ne!(bytes1.start(), bytes2.start()); +``` diff --git a/crates/compact-bytes/src/lib.rs b/crates/compact-bytes/src/lib.rs new file mode 100644 index 00000000..5ff510b5 --- /dev/null +++ b/crates/compact-bytes/src/lib.rs @@ -0,0 +1,161 @@ +#![doc = include_str!("../README.md")] + +use append_only_bytes::{AppendOnlyBytes, BytesSlice}; +use fxhash::FxHashMap; + +// One entry in the hashtable will take 16 bytes. And we need one entry for every position in the document. +// So the size of the hashtable will be 16 * document_size.
+/// An append-only byte arena that tries to hand out slices of bytes it
+/// already stores instead of storing the same content twice.
+///
+/// Every 4-byte window of the stored bytes is indexed in `map`; that index is
+/// what lets [`CompactBytes::alloc`] and [`CompactBytes::alloc_advance`] find
+/// earlier occurrences of their input.
+pub struct CompactBytes {
+    bytes: AppendOnlyBytes,
+    /// Maps a 4-byte window (encoded by `to_key`) to the start position of
+    /// its most recently indexed occurrence in `bytes`.
+    map: FxHashMap<u32, u32>,
+}
+
+impl CompactBytes {
+    /// Creates an empty arena.
+    pub fn new() -> Self {
+        CompactBytes {
+            bytes: AppendOnlyBytes::new(),
+            map: FxHashMap::default(),
+        }
+    }
+
+    /// Creates an arena whose initial content is `bytes`.
+    pub fn from_bytes(bytes: &[u8]) -> Self {
+        let mut compact_bytes = CompactBytes::new();
+        compact_bytes.append(bytes);
+        compact_bytes
+    }
+
+    /// Returns a slice holding `bytes`, reusing stored bytes when possible.
+    ///
+    /// Stored bytes are reused only when the whole input is found in one
+    /// place; otherwise the input is appended. Inputs shorter than 4 bytes are
+    /// always appended, because lookups work on 4-byte windows.
+    pub fn alloc(&mut self, bytes: &[u8]) -> BytesSlice {
+        match self.lookup(bytes) {
+            Some((position, length)) if length == bytes.len() => {
+                self.bytes.slice(position..position + length)
+            }
+            _ => self.append(bytes),
+        }
+    }
+
+    /// Returns slices that, concatenated in order, equal `bytes`, reusing
+    /// stored bytes piece by piece.
+    ///
+    /// Each matched piece becomes a slice of the existing bytes. Bytes with no
+    /// match are appended one at a time, and ranges that end up adjacent are
+    /// merged into a single slice. An empty input yields an empty `Vec`.
+    pub fn alloc_advance(&mut self, bytes: &[u8]) -> Vec<BytesSlice> {
+        /// Pushes `new`, or extends the last range when `new` starts where it ends.
+        fn push_merged(ranges: &mut Vec<(usize, usize)>, new: (usize, usize)) {
+            if let Some(last) = ranges.last_mut() {
+                if last.1 == new.0 {
+                    last.1 = new.1;
+                    return;
+                }
+            }
+
+            ranges.push(new);
+        }
+
+        // `(from, to)` ranges into `self.bytes`, in input order.
+        let mut ranges: Vec<(usize, usize)> = Vec::new();
+        let mut index = 0;
+        while index < bytes.len() {
+            match self.lookup(&bytes[index..]) {
+                Some((pos, len)) => {
+                    push_merged(&mut ranges, (pos, pos + len));
+                    index += len;
+                }
+                None => {
+                    let old_len = self.bytes.len();
+                    self.bytes.push(bytes[index]);
+                    // Keep the index covering every stored position; without
+                    // this, bytes added here could never be reused.
+                    self.append_new_entries_to_map(old_len);
+                    push_merged(&mut ranges, (old_len, old_len + 1));
+                    index += 1;
+                }
+            }
+        }
+
+        ranges
+            .into_iter()
+            .map(|(from, to)| self.bytes.slice(from..to))
+            .collect()
+    }
+
+    /// Appends `bytes` without trying to reuse stored bytes and returns the
+    /// slice covering the newly appended range.
+    pub fn append(&mut self, bytes: &[u8]) -> BytesSlice {
+        let old_len = self.bytes.len();
+        self.bytes.push_slice(bytes);
+        self.append_new_entries_to_map(old_len);
+        self.bytes.slice(old_len..old_len + bytes.len())
+    }
+
+    /// Indexes every 4-byte window that became complete since the document had
+    /// length `old_len`.
+    fn append_new_entries_to_map(&mut self, old_len: usize) {
+        // if old doc = "", append "0123", then we need to add "0123" entry to the map
+        // if old doc = "0123", append "x", then we need to add "123x" entry to the map
+        // if old doc = "0123", append "xyz", then we need to add "123x", "23xy", "3xyz" entries to the map
+        for i in old_len.saturating_sub(3)..self.bytes.len().saturating_sub(3) {
+            // Positions are stored as u32. Past that range, stop indexing:
+            // a truncated position would make `lookup` return unrelated bytes.
+            let pos = match u32::try_from(i) {
+                Ok(pos) => pos,
+                Err(_) => break,
+            };
+            let key = to_key(&self.bytes[i..i + 4]);
+            self.map.insert(key, pos);
+        }
+    }
+
+    /// Finds a stored occurrence of a prefix of `bytes`.
+    ///
+    /// Returns `Some((position, length))`, where `position` is the most
+    /// recently indexed occurrence of the first four bytes and `length` (at
+    /// least 4) is how far the match extends from there. Returns `None` when
+    /// `bytes` is shorter than 4 bytes or its first four bytes are not indexed.
+    fn lookup(&self, bytes: &[u8]) -> Option<(usize, usize)> {
+        if bytes.len() < 4 {
+            return None;
+        }
+
+        let pos = self.map.get(&to_key(bytes)).copied()? as usize;
+        let mut len = 4;
+        while pos + len < self.bytes.len()
+            && len < bytes.len()
+            && self.bytes[pos + len] == bytes[len]
+        {
+            len += 1;
+        }
+
+        Some((pos, len))
+    }
+}
+
+impl Default for CompactBytes {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Converts the first 4 bytes into a big-endian `u32`.
+///
+/// Panics if `bytes` has fewer than 4 elements; callers check the length first.
+fn to_key(bytes: &[u8]) -> u32 {
+    u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]])
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn it_works() {
+        let mut bytes = CompactBytes::new();
+        let a = bytes.alloc(b"12345");
+        let b = bytes.alloc(b"12345");
+        assert_eq!(b.start(), 0);
+        assert_eq!(b.end(), 5);
+        let b = bytes.alloc(b"2345");
+        assert_eq!(b.start(), 1);
+        assert_eq!(b.end(), 5);
+        let b = bytes.alloc(b"23456");
+        assert_eq!(b.start(), 5);
+        assert_eq!(b.end(), 10);
+        assert_eq!(a.as_bytes(), b"12345");
+    }
+
+    #[test]
+    fn advance() {
+        let mut bytes = CompactBytes::new();
+        bytes.append(b"123456789");
+        let ans = bytes.alloc_advance(b"haha12345567891234");
+        assert_eq!(ans[0].len(), 4);
+        assert_eq!(ans[0].start(), 9);
+        assert_eq!(ans[1].len(), 5);
+        assert_eq!(ans[1].start(), 0);
+        assert_eq!(ans[2].len(), 5);
+        assert_eq!(ans[2].start(), 4);
+        assert_eq!(ans[3].len(), 4);
+        assert_eq!(ans[3].start(), 0);
+    }
+
+    #[test]
+    fn advance_indexes_new_bytes() {
+        let mut bytes = CompactBytes::new();
+        let ans = bytes.alloc_advance(b"abcdef");
+        assert_eq!(ans.len(), 1);
+        assert_eq!(ans[0].start(), 0);
+        assert_eq!(ans[0].len(), 6);
+        // Bytes stored through `alloc_advance` must be reusable afterwards.
+        let again = bytes.alloc(b"abcdef");
+        assert_eq!(again.start(), 0);
+        assert_eq!(again.end(), 6);
+    }
+}