feat: make capacity adjustable

2025-01-23 05:24:51 +00:00 · 2023-07-14 00:47:02 +08:00 · 2023-07-14 00:47:02 +08:00 · 92434ccdfc
commit 92434ccdfc
parent f6ebf6783d
2 changed files with 36 additions and 22 deletions
--- a/crates/compact-bytes/benches/bench.rs
+++ b/crates/compact-bytes/benches/bench.rs
@ -3,7 +3,6 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion};

 pub fn entry(c: &mut Criterion) {
    let data = include_str!("./permuted.mht");
-    let data_x4 = data.repeat(4);
    c.bench_function("compact-bytes", |b| {
        b.iter(|| {
            let mut bytes = CompactBytes::new();
@ -11,11 +10,22 @@ pub fn entry(c: &mut Criterion) {
        });
    });
    c.bench_function("compact-bytes x4", |b| {
+        let data_x4 = data.repeat(4);
        b.iter(|| {
            let mut bytes = CompactBytes::new();
            bytes.alloc_advance(black_box(data_x4.as_bytes()));
        });
    });
+
+    let mut b = c.benchmark_group("slower");
+    b.sample_size(10);
+    b.bench_function("compact-bytes x100", |b| {
+        let data_x100 = data.repeat(100);
+        b.iter(|| {
+            let mut bytes = CompactBytes::new();
+            bytes.alloc_advance(black_box(data_x100.as_bytes()));
+        });
+    });
 }

 criterion_group!(benches, entry);
--- a/crates/compact-bytes/src/lib.rs
+++ b/crates/compact-bytes/src/lib.rs
@ -5,21 +5,15 @@ use fxhash::FxHasher32;
 use std::{hash::Hasher, num::NonZeroU32, ops::Range};

 /// it must be a power of 2
-const DEFAULT_CAPACITY: usize = 1 << 17;
-const MASK: usize = DEFAULT_CAPACITY - 1;
+const DEFAULT_CAPACITY: usize = 1 << 16;
 const MAX_TRIED: usize = 4;

 /// # Memory Usage
 ///
-/// One entry in the hash table will take 36 bytes. And we need one entry for every position in the document.
-/// So the size of the hash table will be (36 ~ 72) * document_size.
-///
-/// However, you can set the maximum size of the hashtable to reduce the memory usage.
-/// It will drop the old entries when the size of the hashtable reaches the maximum size.
-///
-/// By default the maximum size of the hash table is 2 * 1024, which means the memory usage will be 72 * 2 * 1024 = 144KB.
-/// It can fit L2 cache of most CPUs. This behavior is subjected to change in the future as we do more optimization.
+/// The memory usage is capacity * 12 bytes.
+/// The default capacity is 65536 (2^16), so the default memory usage is 0.75 MiB (786,432 bytes).
 ///
+/// You can set the capacity by calling `with_capacity`. The requested capacity is raised to at least 1024 and rounded up to the next power of 2.
 pub struct CompactBytes {
    bytes: AppendOnlyBytes,
    map: Box<[Option<NonZeroU32>]>,
@ -27,6 +21,7 @@ pub struct CompactBytes {
    /// next write index fr pos_and_next
    index: usize,
    capacity: usize,
+    mask: usize,
 }

 #[derive(Debug, Default, Clone, Copy)]
@ -45,16 +40,24 @@ impl CompactBytes {
            pos_and_next: vec![Default::default(); DEFAULT_CAPACITY].into_boxed_slice(),
            index: 1,
            capacity: DEFAULT_CAPACITY,
+            mask: DEFAULT_CAPACITY - 1,
        }
    }

-    /// Set the maximum size of the hash table
-    /// When the size of the hash table reaches the maximum size, it will drop the old entries.
-    /// When it's zero, it will never drop the old entries.
-    pub fn set_capacity(&mut self, capacity: usize) {
-        self.capacity = capacity;
+    /// Creates a `CompactBytes` whose capacity is `cap` raised to at least 1024 and rounded up to the next power of 2.
+    pub fn with_capacity(cap: usize) -> Self {
+        let cap = cap.max(1024).next_power_of_two();
+        CompactBytes {
+            bytes: AppendOnlyBytes::with_capacity(cap),
+            map: vec![None; cap].into_boxed_slice(),
+            pos_and_next: vec![Default::default(); cap].into_boxed_slice(),
+            index: 1,
+            capacity: cap,
+            mask: cap - 1,
+        }
    }

+    #[inline]
    pub fn capacity(&self) -> usize {
        self.capacity
    }
@ -74,6 +77,7 @@ impl CompactBytes {
        self.append(bytes)
    }

+    #[inline]
    pub fn as_bytes(&self) -> &[u8] {
        self.bytes.as_bytes()
    }
@ -125,7 +129,7 @@ impl CompactBytes {
        // if old doc = "0123", append "x", then we need to add "123x" entry to the map
        // if old doc = "0123", append "xyz", then we need to add "123x", "23xy", "3xyz" entries to the map
        for i in old_len.saturating_sub(3)..self.bytes.len().saturating_sub(3) {
-            let key = hash(self.bytes.as_bytes(), i);
+            let key = hash(self.bytes.as_bytes(), i, self.mask);
            // Override the min position in entry with the current position
            let old = self.map[key];
            self.pos_and_next[self.index] = PosLinkList {
@ -133,7 +137,7 @@ impl CompactBytes {
                next: old,
            };
            self.map[key] = Some(NonZeroU32::new(self.index as u32).unwrap());
-            self.index = (self.index + 1) & MASK;
+            self.index = (self.index + 1) & self.mask;
            if self.index == 0 {
                self.index = 1;
            }
@ -149,7 +153,7 @@ impl CompactBytes {
            return None;
        }

-        let key = hash(bytes, 0);
+        let key = hash(bytes, 0, self.mask);
        match self.map[key] {
            Some(pointer) => {
                let mut node = self.pos_and_next[pointer.get() as usize];
@ -195,14 +199,14 @@ impl Default for CompactBytes {
    }
 }

-#[inline]
-fn hash(bytes: &[u8], n: usize) -> usize {
+#[inline(always)]
+fn hash(bytes: &[u8], n: usize, mask: usize) -> usize {
    let mut hasher = FxHasher32::default();
    hasher.write_u8(bytes[n]);
    hasher.write_u8(bytes[n + 1]);
    hasher.write_u8(bytes[n + 2]);
    hasher.write_u8(bytes[n + 3]);
-    hasher.finish() as usize & MASK
+    hasher.finish() as usize & mask
 }

 #[cfg(test)]