From 92434ccdfc839ebe52cf3b87035d5b4178ca7bc1 Mon Sep 17 00:00:00 2001 From: Zixuan Chen Date: Fri, 14 Jul 2023 00:47:02 +0800 Subject: [PATCH] feat: make capacity adjustable --- crates/compact-bytes/benches/bench.rs | 12 ++++++- crates/compact-bytes/src/lib.rs | 46 +++++++++++++++------------ 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/crates/compact-bytes/benches/bench.rs b/crates/compact-bytes/benches/bench.rs index 5ad5a529..632587d9 100644 --- a/crates/compact-bytes/benches/bench.rs +++ b/crates/compact-bytes/benches/bench.rs @@ -3,7 +3,6 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; pub fn entry(c: &mut Criterion) { let data = include_str!("./permuted.mht"); - let data_x4 = data.repeat(4); c.bench_function("compact-bytes", |b| { b.iter(|| { let mut bytes = CompactBytes::new(); @@ -11,11 +10,22 @@ pub fn entry(c: &mut Criterion) { }); }); c.bench_function("compact-bytes x4", |b| { + let data_x4 = data.repeat(4); b.iter(|| { let mut bytes = CompactBytes::new(); bytes.alloc_advance(black_box(data_x4.as_bytes())); }); }); + + let mut b = c.benchmark_group("slower"); + b.sample_size(10); + b.bench_function("compact-bytes x100", |b| { + let data_x100 = data.repeat(100); + b.iter(|| { + let mut bytes = CompactBytes::new(); + bytes.alloc_advance(black_box(data_x100.as_bytes())); + }); + }); } criterion_group!(benches, entry); diff --git a/crates/compact-bytes/src/lib.rs b/crates/compact-bytes/src/lib.rs index 28047e01..3caa1dad 100644 --- a/crates/compact-bytes/src/lib.rs +++ b/crates/compact-bytes/src/lib.rs @@ -5,21 +5,15 @@ use fxhash::FxHasher32; use std::{hash::Hasher, num::NonZeroU32, ops::Range}; /// it must be a power of 2 -const DEFAULT_CAPACITY: usize = 1 << 17; -const MASK: usize = DEFAULT_CAPACITY - 1; +const DEFAULT_CAPACITY: usize = 1 << 16; const MAX_TRIED: usize = 4; /// # Memory Usage /// -/// One entry in the hash table will take 36 bytes. And we need one entry for every position in the document. 
-/// So the size of the hash table will be (36 ~ 72) * document_size. -/// -/// However, you can set the maximum size of the hashtable to reduce the memory usage. -/// It will drop the old entries when the size of the hashtable reaches the maximum size. -/// -/// By default the maximum size of the hash table is 2 * 1024, which means the memory usage will be 72 * 2 * 1024 = 144KB. -/// It can fit L2 cache of most CPUs. This behavior is subjected to change in the future as we do more optimization. +/// The memory usage is capacity * 12 bytes. +/// The default capacity is 65536 (2^16), so the default memory usage is 0.75MB /// +/// You can set the capacity by calling `with_capacity`. The requested capacity is raised to at least 1024 and rounded up to the next power of 2. pub struct CompactBytes { bytes: AppendOnlyBytes, map: Box<[Option]>, @@ -27,6 +21,7 @@ pub struct CompactBytes { /// next write index fr pos_and_next index: usize, capacity: usize, + mask: usize, } #[derive(Debug, Default, Clone, Copy)] @@ -45,16 +40,24 @@ impl CompactBytes { pos_and_next: vec![Default::default(); DEFAULT_CAPACITY].into_boxed_slice(), index: 1, capacity: DEFAULT_CAPACITY, + mask: DEFAULT_CAPACITY - 1, } } - /// Set the maximum size of the hash table - /// When the size of the hash table reaches the maximum size, it will drop the old entries. - /// When it's zero, it will never drop the old entries. 
- pub fn set_capacity(&mut self, capacity: usize) { - self.capacity = capacity; + /// `cap` is raised to at least 1024 and then rounded up to the next power of 2 + pub fn with_capacity(cap: usize) -> Self { + let cap = cap.max(1024).next_power_of_two(); + CompactBytes { + bytes: AppendOnlyBytes::with_capacity(cap), + map: vec![None; cap].into_boxed_slice(), + pos_and_next: vec![Default::default(); cap].into_boxed_slice(), + index: 1, + capacity: cap, + mask: cap - 1, + } } + #[inline] pub fn capacity(&self) -> usize { self.capacity } @@ -74,6 +77,7 @@ impl CompactBytes { self.append(bytes) } + #[inline] pub fn as_bytes(&self) -> &[u8] { self.bytes.as_bytes() } @@ -125,7 +129,7 @@ impl CompactBytes { // if old doc = "0123", append "x", then we need to add "123x" entry to the map // if old doc = "0123", append "xyz", then we need to add "123x", "23xy", "3xyz" entries to the map for i in old_len.saturating_sub(3)..self.bytes.len().saturating_sub(3) { - let key = hash(self.bytes.as_bytes(), i); + let key = hash(self.bytes.as_bytes(), i, self.mask); // Override the min position in entry with the current position let old = self.map[key]; self.pos_and_next[self.index] = PosLinkList { @@ -133,7 +137,7 @@ impl CompactBytes { next: old, }; self.map[key] = Some(NonZeroU32::new(self.index as u32).unwrap()); - self.index = (self.index + 1) & MASK; + self.index = (self.index + 1) & self.mask; if self.index == 0 { self.index = 1; } @@ -149,7 +153,7 @@ impl CompactBytes { return None; } - let key = hash(bytes, 0); + let key = hash(bytes, 0, self.mask); match self.map[key] { Some(pointer) => { let mut node = self.pos_and_next[pointer.get() as usize]; @@ -195,14 +199,14 @@ impl Default for CompactBytes { } } -#[inline] -fn hash(bytes: &[u8], n: usize) -> usize { +#[inline(always)] +fn hash(bytes: &[u8], n: usize, mask: usize) -> usize { let mut hasher = FxHasher32::default(); hasher.write_u8(bytes[n]); hasher.write_u8(bytes[n + 1]); hasher.write_u8(bytes[n + 2]); hasher.write_u8(bytes[n + 3]); - hasher.finish() as usize & MASK + 
hasher.finish() as usize & mask } #[cfg(test)]