feat: support setting capacity

Zixuan Chen 2023-07-13 16:57:41 +08:00
parent f604a89fc3
commit 346117ff54
4 changed files with 94 additions and 25 deletions

Cargo.lock (generated)

@@ -297,6 +297,7 @@ version = "0.1.0"
dependencies = [
"append-only-bytes",
"fxhash",
"linked-hash-map",
]
[[package]]
@@ -792,6 +793,12 @@ dependencies = [
"winapi",
]
[[package]]
name = "linked-hash-map"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
[[package]]
name = "lock_api"
version = "0.4.9"

Cargo.toml

@@ -8,3 +8,4 @@ edition = "2021"
[dependencies]
append-only-bytes = "0.1.8"
fxhash = "0.2.1"
linked-hash-map = "0.5.6"

README.md

@@ -39,6 +39,7 @@ So it will break the bytes into small pieces to reuse them.
```rust
use compact_bytes::CompactBytes;
use std::ops::Range;
let mut arena = CompactBytes::new();
let bytes1 = arena.alloc(b"hello");

src/lib.rs

@@ -1,23 +1,61 @@
#![doc = include_str!("../README.md")]
use std::ops::Range;
use std::{hash::BuildHasherDefault, ops::Range};
use append_only_bytes::{AppendOnlyBytes, BytesSlice};
use fxhash::FxHashMap;
use linked_hash_map::LinkedHashMap;
// One entry in the hashtable will take 16 ~ 32 bytes. And we need one entry for every position in the document.
// So the size of the hashtable will be (16 ~ 32) * document_size.
const DEFAULT_CAPACITY: usize = 2 * 1024;
const NUM_POS_PER_ENTRY: usize = 4;
type Hasher = BuildHasherDefault<fxhash::FxHasher32>;
/// # Memory Usage
///
/// One entry in the hash table takes about 36 bytes, and we need one entry for every position in the document.
/// So the hash table will take (36 ~ 72) * document_size bytes; the upper bound accounts for the table's spare capacity.
///
/// However, you can set the maximum size of the hash table to reduce the memory usage.
/// It will drop the oldest entries when the size of the hash table reaches the maximum size.
///
/// By default the maximum size of the hash table is 2 * 1024 entries, so the memory usage is at most 72 * 2 * 1024 = 144KB.
/// That fits in the L2 cache of most CPUs. This behavior is subject to change in the future as we do more optimization.
///
pub struct CompactBytes {
bytes: AppendOnlyBytes,
/// map 4 bytes to position in the document
map: FxHashMap<u32, u32>,
/// Map 4 bytes to positions in the document.
/// The actual position is the stored value - 1; 0 means the slot is empty.
map: LinkedHashMap<u32, [u32; NUM_POS_PER_ENTRY], Hasher>,
capacity: usize,
}
impl CompactBytes {
pub fn new() -> Self {
CompactBytes {
bytes: AppendOnlyBytes::new(),
map: FxHashMap::default(),
map: LinkedHashMap::with_hasher(Default::default()),
capacity: DEFAULT_CAPACITY,
}
}
/// Set the maximum size of the hash table.
/// When the size of the hash table reaches the maximum, the oldest entries are dropped.
/// When the capacity is zero, entries are never dropped.
pub fn set_capacity(&mut self, capacity: usize) {
self.capacity = capacity;
}
pub fn capacity(&self) -> usize {
self.capacity
}
fn drop_old_entry_if_reach_maximum_capacity(&mut self) {
if self.capacity == 0 {
return;
}
while self.map.len() > self.capacity {
// `pop_front` removes the least recently used entry
self.map.pop_front();
}
}
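A short usage sketch of the capacity API added above (the entry count is a hypothetical choice):

```rust
use compact_bytes::CompactBytes;

let mut arena = CompactBytes::new();
// Cap the deduplication table at 1024 entries; once the cap is reached,
// the least recently used entries are evicted first.
arena.set_capacity(1024);
assert_eq!(arena.capacity(), 1024);
// A capacity of 0 disables eviction entirely.
arena.set_capacity(0);
```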
@@ -70,20 +108,19 @@ impl CompactBytes {
}
}
self.append_new_entries_to_map(old_len);
self.record_new_prefix(old_len);
ans
}
pub fn append(&mut self, bytes: &[u8]) -> BytesSlice {
let old_len = self.bytes.len();
self.bytes.push_slice(bytes);
self.append_new_entries_to_map(old_len);
self.record_new_prefix(old_len);
self.bytes.slice(old_len..old_len + bytes.len())
}
/// Append the entries just created to the map
fn append_new_entries_to_map(&mut self, old_len: usize) {
fn record_new_prefix(&mut self, old_len: usize) {
// if old doc = "", append "0123", then we need to add "0123" entry to the map
// if old doc = "0123", append "x", then we need to add "123x" entry to the map
// if old doc = "0123", append "xyz", then we need to add "123x", "23xy", "3xyz" entries to the map
@@ -97,30 +134,53 @@ impl CompactBytes {
key = (key << 8) | self.bytes[i + 3] as u32;
}
self.map.insert(key, i as u32);
// Overwrite the smallest slot: an empty one (0) if any, otherwise the
// oldest position. Positions are stored as `pos + 1` so 0 can mean "empty".
let entry = self.map.entry(key).or_insert([0; NUM_POS_PER_ENTRY]);
if let Some(min) = entry.iter_mut().min() {
*min = i as u32 + 1;
}
}
self.drop_old_entry_if_reach_maximum_capacity()
}
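The slot rule above keeps up to NUM_POS_PER_ENTRY candidate positions per key. A standalone sketch of the same overwrite logic (hypothetical helper mirroring the code above):

```rust
// Positions are stored as `pos + 1`, so 0 marks an empty slot. The smallest
// slot (an empty one if any, otherwise the oldest position) is overwritten.
fn record_pos(entry: &mut [u32; 4], pos: usize) {
    if let Some(min) = entry.iter_mut().min() {
        *min = pos as u32 + 1;
    }
}

fn main() {
    let mut entry = [0u32; 4];
    // In the real code document positions only grow, so the smallest
    // stored position is also the oldest one.
    for pos in [10, 25, 40, 99, 150] {
        record_pos(&mut entry, pos);
    }
    // The fifth insert evicted the smallest (oldest) position, 10.
    let mut stored: Vec<u32> = entry.iter().map(|&v| v - 1).collect();
    stored.sort();
    assert_eq!(stored, vec![25, 40, 99, 150]);
}
```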
/// given bytes, find the position with the longest match in the document
/// Given bytes, find the position with the longest match in the document.
/// It needs an exclusive reference to refresh the LRU order.
///
/// Returns `Option<(position, length)>`.
fn lookup(&self, bytes: &[u8]) -> Option<(usize, usize)> {
fn lookup(&mut self, bytes: &[u8]) -> Option<(usize, usize)> {
if bytes.len() < 4 {
return None;
}
let key = to_key(bytes);
match self.map.get(&key).copied() {
Some(pos) => {
let pos = pos as usize;
let mut len = 4;
while pos + len < self.bytes.len()
&& len < bytes.len()
&& self.bytes[pos + len] == bytes[len]
{
len += 1;
match self.map.get_refresh(&key).copied() {
Some(poses) => {
let mut max_len = 0;
let mut ans_pos = 0;
for &pos in poses.iter() {
if pos == 0 {
continue;
}
let pos = pos as usize - 1;
let mut len = 4;
while pos + len < self.bytes.len()
&& len < bytes.len()
&& self.bytes[pos + len] == bytes[len]
{
len += 1;
}
if len > max_len {
max_len = len;
ans_pos = pos;
}
}
Some((pos, len))
Some((ans_pos, max_len))
}
None => None,
}
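The candidate loop above extends each guaranteed 4-byte match while the following bytes agree and keeps the longest one. A standalone sketch of that extension step (hypothetical helper):

```rust
// Extend a match that already covers 4 bytes at `pos` in `doc`.
fn extend_match(doc: &[u8], pos: usize, needle: &[u8]) -> usize {
    let mut len = 4;
    while pos + len < doc.len() && len < needle.len() && doc[pos + len] == needle[len] {
        len += 1;
    }
    len
}

fn main() {
    let doc = b"hello world";
    // b"hello!" matches b"hello" at position 0: 5 bytes.
    assert_eq!(extend_match(doc, 0, b"hello!"), 5);
}
```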
@@ -133,7 +193,7 @@ impl Default for CompactBytes {
}
}
/// Convert the first 4 btyes into u32
/// Convert the first 4 bytes into u32
fn to_key(bytes: &[u8]) -> u32 {
u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]])
}
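`to_key` packs the first four bytes big-endian, for example:

```rust
// b"abcd" -> 0x61626364 ('a' = 0x61, 'b' = 0x62, ...).
assert_eq!(u32::from_be_bytes([b'a', b'b', b'c', b'd']), 0x61626364);
```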