diff --git a/Cargo.lock b/Cargo.lock index 4e74374c..0f583b42 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -297,6 +297,7 @@ version = "0.1.0" dependencies = [ "append-only-bytes", "fxhash", + "linked-hash-map", ] [[package]] @@ -792,6 +793,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "linked-hash-map" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" + [[package]] name = "lock_api" version = "0.4.9" diff --git a/crates/compact-bytes/Cargo.toml b/crates/compact-bytes/Cargo.toml index 58488aa1..d007639c 100644 --- a/crates/compact-bytes/Cargo.toml +++ b/crates/compact-bytes/Cargo.toml @@ -8,3 +8,4 @@ edition = "2021" [dependencies] append-only-bytes = "0.1.8" fxhash = "0.2.1" +linked-hash-map = "0.5.6" diff --git a/crates/compact-bytes/README.md b/crates/compact-bytes/README.md index 3a3c12b6..dc8ed108 100644 --- a/crates/compact-bytes/README.md +++ b/crates/compact-bytes/README.md @@ -39,6 +39,7 @@ So it will break the bytes into small pieces to reuse them. ```rust use compact_bytes::CompactBytes; +use std::ops::Range; let mut arena = CompactBytes::new(); let bytes1 = arena.alloc(b"hello"); diff --git a/crates/compact-bytes/src/lib.rs b/crates/compact-bytes/src/lib.rs index 587fa92c..e6b2bac1 100644 --- a/crates/compact-bytes/src/lib.rs +++ b/crates/compact-bytes/src/lib.rs @@ -1,23 +1,61 @@ #![doc = include_str!("../README.md")] -use std::ops::Range; +use std::{hash::BuildHasherDefault, ops::Range}; use append_only_bytes::{AppendOnlyBytes, BytesSlice}; -use fxhash::FxHashMap; +use linked_hash_map::LinkedHashMap; -// One entry in the hashtable will take 16 ~ 32 bytes. And we need one entry for every position in the document. -// So the size of the hashtable will be (16 ~ 32) * document_size. 
+const DEFAULT_CAPACITY: usize = 2 * 1024; +const NUM_POS_PER_ENTRY: usize = 4; + +type Hasher = BuildHasherDefault; + +/// # Memory Usage +/// +/// One entry in the hash table will take 36 bytes. In the worst case we need one entry for every position in the document. +/// So the size of the hash table will be up to (36 ~ 72) * document_size. +/// +/// However, you can set the maximum size of the hash table to reduce the memory usage. +/// It will drop the old entries when the size of the hash table exceeds the maximum size. +/// +/// By default the maximum size of the hash table is 2 * 1024, which means the memory usage will be about 72 * 2 * 1024 = 144KB. +/// It can fit in the L2 cache of most CPUs. This behavior is subject to change in the future as we do more optimization. +/// pub struct CompactBytes { bytes: AppendOnlyBytes, - /// map 4 bytes to position in the document - map: FxHashMap, + /// Map 4 bytes to positions in the document. + /// The actual position is value - 1, and 0 means the slot is empty. + map: LinkedHashMap, + capacity: usize, } impl CompactBytes { pub fn new() -> Self { CompactBytes { bytes: AppendOnlyBytes::new(), - map: FxHashMap::default(), + map: LinkedHashMap::with_hasher(Default::default()), + capacity: DEFAULT_CAPACITY, + } + } + + /// Set the maximum number of entries in the hash table. + /// When the number of entries exceeds the maximum, the old entries are dropped after new bytes are recorded. + /// When it's zero, the old entries are never dropped. 
+ pub fn set_capacity(&mut self, capacity: usize) { + self.capacity = capacity; + } + + pub fn capacity(&self) -> usize { + self.capacity + } + + fn drop_old_entry_if_reach_maximum_capacity(&mut self) { + if self.capacity == 0 { + return; + } + + while self.map.len() > self.capacity { + self.map.pop_front(); } } @@ -70,20 +108,19 @@ impl CompactBytes { } } - self.append_new_entries_to_map(old_len); - + self.record_new_prefix(old_len); ans } pub fn append(&mut self, bytes: &[u8]) -> BytesSlice { let old_len = self.bytes.len(); self.bytes.push_slice(bytes); - self.append_new_entries_to_map(old_len); + self.record_new_prefix(old_len); self.bytes.slice(old_len..old_len + bytes.len()) } /// Append the entries just created to the map - fn append_new_entries_to_map(&mut self, old_len: usize) { + fn record_new_prefix(&mut self, old_len: usize) { // if old doc = "", append "0123", then we need to add "0123" entry to the map // if old doc = "0123", append "x", then we need to add "123x" entry to the map // if old doc = "0123", append "xyz", then we need to add "123x", "23xy", "3xyz" entries to the map @@ -97,30 +134,53 @@ impl CompactBytes { key = (key << 8) | self.bytes[i + 3] as u32; } - self.map.insert(key, i as u32); + // Overwrite the smallest stored value in the entry (an empty slot, stored as 0, if any) with the current position + 1 + let entry = self.map.entry(key).or_insert([0; NUM_POS_PER_ENTRY]); + entry + .iter_mut() + .min() + .map(|min| *min = i as u32 + 1) + .unwrap(); } + + self.drop_old_entry_if_reach_maximum_capacity() } - /// given bytes, find the position with the longest match in the document + /// Given bytes, find the position with the longest match in the document. + /// It needs an exclusive reference because the lookup refreshes the LRU order of the entry. + /// /// return Option<(position, length)> - fn lookup(&self, bytes: &[u8]) -> Option<(usize, usize)> { + fn lookup(&mut self, bytes: &[u8]) -> Option<(usize, usize)> { if bytes.len() < 4 { return None; } let key = to_key(bytes); - match self.map.get(&key).copied() { - Some(pos) => { - let 
pos = pos as usize; - let mut len = 4; - while pos + len < self.bytes.len() - && len < bytes.len() - && self.bytes[pos + len] == bytes[len] - { - len += 1; + match self.map.get_refresh(&key).copied() { + Some(poses) => { + let mut max_len = 0; + let mut ans_pos = 0; + for &pos in poses.iter() { + if pos == 0 { + continue; + } + + let pos = pos as usize - 1; + let mut len = 4; + while pos + len < self.bytes.len() + && len < bytes.len() + && self.bytes[pos + len] == bytes[len] + { + len += 1; + } + + if len > max_len { + max_len = len; + ans_pos = pos; + } } - Some((pos, len)) + Some((ans_pos, max_len)) } None => None, } @@ -133,7 +193,7 @@ impl Default for CompactBytes { } } -/// Convert the first 4 btyes into u32 +/// Interpret the first 4 bytes as a big-endian u32 fn to_key(bytes: &[u8]) -> u32 { u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) }