From f604a89fc398986e118f239cec5ce8704ccb4c23 Mon Sep 17 00:00:00 2001 From: Zixuan Chen Date: Thu, 13 Jul 2023 15:33:49 +0800 Subject: [PATCH] refactor(bytes): refine interface --- Cargo.lock | 8 ++++ crates/compact-bytes/README.md | 3 +- crates/compact-bytes/src/lib.rs | 72 ++++++++++++++++++++++++--------- 3 files changed, 62 insertions(+), 21 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 472a8227..4e74374c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -291,6 +291,14 @@ dependencies = [ "termcolor", ] +[[package]] +name = "compact-bytes" +version = "0.1.0" +dependencies = [ + "append-only-bytes", + "fxhash", +] + [[package]] name = "console_error_panic_hook" version = "0.1.7" diff --git a/crates/compact-bytes/README.md b/crates/compact-bytes/README.md index e099c712..3a3c12b6 100644 --- a/crates/compact-bytes/README.md +++ b/crates/compact-bytes/README.md @@ -39,12 +39,11 @@ So it will break the bytes into small pieces to reuse them. ```rust use compact_bytes::CompactBytes; -use append_only_bytes::BytesSlice; let mut arena = CompactBytes::new(); let bytes1 = arena.alloc(b"hello"); // it breaks the bytes into 3 pieces "hi ", "hello", " world" -let bytes2: Vec<BytesSlice> = arena.alloc_advance(b"hi hello world"); +let bytes2: Vec<Range<usize>> = arena.alloc_advance(b"hi hello world"); ``` Or you can use `append` to not reuse the old bytes at all. diff --git a/crates/compact-bytes/src/lib.rs b/crates/compact-bytes/src/lib.rs index 5ff510b5..587fa92c 100644 --- a/crates/compact-bytes/src/lib.rs +++ b/crates/compact-bytes/src/lib.rs @@ -1,10 +1,12 @@ #![doc = include_str!("../README.md")] +use std::ops::Range; + use append_only_bytes::{AppendOnlyBytes, BytesSlice}; use fxhash::FxHashMap; -// One entry in the hashtable will take 16bytes. And we need one entry for every position in the document. -// So the size of the hashtable will be 16 * document_size. +// One entry in the hashtable will take 16 ~ 32 bytes. And we need one entry for every position in the document.
+// So the size of the hashtable will be (16 ~ 32) * document_size. pub struct CompactBytes { bytes: AppendOnlyBytes, /// map 4 bytes to position in the document @@ -34,14 +36,18 @@ impl CompactBytes { self.append(bytes) } - pub fn alloc_advance(&mut self, bytes: &[u8]) -> Vec<BytesSlice> { - // ans is Vec<(from_index, to_index)> - let mut ans: Vec<(usize, usize)> = vec![]; + pub fn as_bytes(&self) -> &[u8] { + self.bytes.as_bytes() + } - fn push(ans: &mut Vec<(usize, usize)>, new: (usize, usize)) { + pub fn alloc_advance(&mut self, bytes: &[u8]) -> Vec<Range<usize>> { + let old_len = self.bytes.len(); + let mut ans: Vec<Range<usize>> = vec![]; + // this push will try to merge the new range with the last range in the ans + fn push_with_merge(ans: &mut Vec<Range<usize>>, new: Range<usize>) { if let Some(last) = ans.last_mut() { - if last.1 == new.0 { - last.1 = new.1; + if last.end == new.start { + last.end = new.end; return; } } @@ -53,20 +59,20 @@ impl CompactBytes { while index < bytes.len() { match self.lookup(&bytes[index..]) { Some((pos, len)) => { - push(&mut ans, (pos, pos + len)); + push_with_merge(&mut ans, pos..pos + len); index += len; } None => { - push(&mut ans, (self.bytes.len(), self.bytes.len() + 1)); + push_with_merge(&mut ans, self.bytes.len()..self.bytes.len() + 1); self.bytes.push(bytes[index]); index += 1; } } } - ans.into_iter() - .map(|(from, to)| self.bytes.slice(from..to)) - .collect() + self.append_new_entries_to_map(old_len); + + ans } pub fn append(&mut self, bytes: &[u8]) -> BytesSlice { @@ -81,8 +87,16 @@ impl CompactBytes { // if old doc = "", append "0123", then we need to add "0123" entry to the map // if old doc = "0123", append "x", then we need to add "123x" entry to the map // if old doc = "0123", append "xyz", then we need to add "123x", "23xy", "3xyz" entries to the map + let mut key = 0; + let mut is_first = true; for i in old_len.saturating_sub(3)..self.bytes.len().saturating_sub(3) { - let key = to_key(&self.bytes[i..i + 4]); + if is_first { + key = to_key(&self.bytes[i..i + 4]);
+ is_first = false; + } else { + key = (key << 8) | self.bytes[i + 3] as u32; + } + self.map.insert(key, i as u32); } } @@ -97,7 +111,7 @@ impl CompactBytes { let key = to_key(bytes); match self.map.get(&key).copied() { Some(pos) => { - let mut pos = pos as usize; + let pos = pos as usize; let mut len = 4; while pos + len < self.bytes.len() && len < bytes.len() @@ -149,13 +163,33 @@ mod tests { let mut bytes = CompactBytes::new(); bytes.append(b"123456789"); let ans = bytes.alloc_advance(b"haha12345567891234"); + assert_eq!(ans.len(), 4); assert_eq!(ans[0].len(), 4); - assert_eq!(ans[0].start(), 9); + assert_eq!(ans[0].start, 9); assert_eq!(ans[1].len(), 5); - assert_eq!(ans[1].start(), 0); + assert_eq!(ans[1].start, 0); assert_eq!(ans[2].len(), 5); - assert_eq!(ans[2].start(), 4); + assert_eq!(ans[2].start, 4); assert_eq!(ans[3].len(), 4); - assert_eq!(ans[3].start(), 0); + assert_eq!(ans[3].start, 0); + } + + #[test] + fn advance_alloc_should_be_indexed_as_well() { + let mut bytes = CompactBytes::new(); + bytes.alloc_advance(b"1234"); + let a = bytes.alloc(b"1234"); + assert_eq!(a.start(), 0); + } + + #[test] + fn advance_should_use_longer_match() { + let mut bytes = CompactBytes::new(); + bytes.append(b"1234kk 123456 1234xyz"); + let ans = bytes.alloc_advance(b"012345678"); + assert_eq!(ans.len(), 3); + assert_eq!(ans[0].len(), 1); + assert_eq!(ans[1].len(), 6); + assert_eq!(ans[2].len(), 2); } }