mirror of
https://github.com/loro-dev/loro.git
synced 2025-01-23 05:24:51 +00:00
feat: supports setting capacity
This commit is contained in:
parent
f604a89fc3
commit
346117ff54
4 changed files with 94 additions and 25 deletions
7
Cargo.lock
generated
7
Cargo.lock
generated
|
@ -297,6 +297,7 @@ version = "0.1.0"
|
|||
dependencies = [
|
||||
"append-only-bytes",
|
||||
"fxhash",
|
||||
"linked-hash-map",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -792,6 +793,12 @@ dependencies = [
|
|||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "linked-hash-map"
|
||||
version = "0.5.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.9"
|
||||
|
|
|
@ -8,3 +8,4 @@ edition = "2021"
|
|||
[dependencies]
|
||||
append-only-bytes = "0.1.8"
|
||||
fxhash = "0.2.1"
|
||||
linked-hash-map = "0.5.6"
|
||||
|
|
|
@ -39,6 +39,7 @@ So it will break the bytes into small pieces to reuse them.
|
|||
|
||||
```rust
|
||||
use compact_bytes::CompactBytes;
|
||||
use std::ops::Range;
|
||||
|
||||
let mut arena = CompactBytes::new();
|
||||
let bytes1 = arena.alloc(b"hello");
|
||||
|
|
|
@ -1,23 +1,61 @@
|
|||
#![doc = include_str!("../README.md")]
|
||||
|
||||
use std::ops::Range;
|
||||
use std::{hash::BuildHasherDefault, ops::Range};
|
||||
|
||||
use append_only_bytes::{AppendOnlyBytes, BytesSlice};
|
||||
use fxhash::FxHashMap;
|
||||
use linked_hash_map::LinkedHashMap;
|
||||
|
||||
// One entry in the hashtable will take 16 ~ 32 bytes. And we need one entry for every position in the document.
|
||||
// So the size of the hashtable will be (16 ~ 32) * document_size.
|
||||
const DEFAULT_CAPACITY: usize = 2 * 1024;
|
||||
const NUM_POS_PER_ENTRY: usize = 4;
|
||||
|
||||
type Hasher = BuildHasherDefault<fxhash::FxHasher32>;
|
||||
|
||||
/// # Memory Usage
|
||||
///
|
||||
/// One entry in the hash table will take 36 bytes. And we need one entry for every position in the document.
|
||||
/// So the size of the hash table will be (36 ~ 72) * document_size.
|
||||
///
|
||||
/// However, you can set the maximum size of the hashtable to reduce the memory usage.
|
||||
/// It will drop the old entries when the size of the hashtable reaches the maximum size.
|
||||
///
|
||||
/// By default the maximum size of the hash table is 2 * 1024, which means the memory usage will be 72 * 2 * 1024 = 144KB.
|
||||
/// It can fit in the L2 cache of most CPUs. This behavior is subject to change in the future as we do more optimization.
|
||||
///
|
||||
pub struct CompactBytes {
|
||||
bytes: AppendOnlyBytes,
|
||||
/// map 4 bytes to position in the document
|
||||
map: FxHashMap<u32, u32>,
|
||||
/// Map 4 bytes to positions in the document.
|
||||
/// The actual position is value - 1, and 0 means the position is not found.
|
||||
map: LinkedHashMap<u32, [u32; NUM_POS_PER_ENTRY], Hasher>,
|
||||
capacity: usize,
|
||||
}
|
||||
|
||||
impl CompactBytes {
|
||||
pub fn new() -> Self {
|
||||
CompactBytes {
|
||||
bytes: AppendOnlyBytes::new(),
|
||||
map: FxHashMap::default(),
|
||||
map: LinkedHashMap::with_hasher(Default::default()),
|
||||
capacity: DEFAULT_CAPACITY,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the maximum size of the hash table.
|
||||
/// When the number of entries in the hash table exceeds the maximum size, the oldest entries are dropped.
|
||||
/// When it's zero, it will never drop the old entries.
|
||||
pub fn set_capacity(&mut self, capacity: usize) {
|
||||
self.capacity = capacity;
|
||||
}
|
||||
|
||||
pub fn capacity(&self) -> usize {
|
||||
self.capacity
|
||||
}
|
||||
|
||||
fn drop_old_entry_if_reach_maximum_capacity(&mut self) {
|
||||
if self.capacity == 0 {
|
||||
return;
|
||||
}
|
||||
|
||||
while self.map.len() > self.capacity {
|
||||
self.map.pop_front();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -70,20 +108,19 @@ impl CompactBytes {
|
|||
}
|
||||
}
|
||||
|
||||
self.append_new_entries_to_map(old_len);
|
||||
|
||||
self.record_new_prefix(old_len);
|
||||
ans
|
||||
}
|
||||
|
||||
pub fn append(&mut self, bytes: &[u8]) -> BytesSlice {
|
||||
let old_len = self.bytes.len();
|
||||
self.bytes.push_slice(bytes);
|
||||
self.append_new_entries_to_map(old_len);
|
||||
self.record_new_prefix(old_len);
|
||||
self.bytes.slice(old_len..old_len + bytes.len())
|
||||
}
|
||||
|
||||
/// Append the entries just created to the map
|
||||
fn append_new_entries_to_map(&mut self, old_len: usize) {
|
||||
fn record_new_prefix(&mut self, old_len: usize) {
|
||||
// if old doc = "", append "0123", then we need to add "0123" entry to the map
|
||||
// if old doc = "0123", append "x", then we need to add "123x" entry to the map
|
||||
// if old doc = "0123", append "xyz", then we need to add "123x", "23xy", "3xyz" entries to the map
|
||||
|
@ -97,30 +134,53 @@ impl CompactBytes {
|
|||
key = (key << 8) | self.bytes[i + 3] as u32;
|
||||
}
|
||||
|
||||
self.map.insert(key, i as u32);
|
||||
// Overwrite the smallest stored value in the entry (0 marks an unused slot) with the current position + 1
|
||||
let entry = self.map.entry(key).or_insert([0; NUM_POS_PER_ENTRY]);
|
||||
entry
|
||||
.iter_mut()
|
||||
.min()
|
||||
.map(|min| *min = i as u32 + 1)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
self.drop_old_entry_if_reach_maximum_capacity()
|
||||
}
|
||||
|
||||
/// given bytes, find the position with the longest match in the document
|
||||
/// Given bytes, find the position with the longest match in the document.
|
||||
/// It needs an exclusive reference in order to refresh the LRU order of the looked-up entry.
|
||||
///
|
||||
/// Returns `Option<(position, length)>`.
|
||||
fn lookup(&self, bytes: &[u8]) -> Option<(usize, usize)> {
|
||||
fn lookup(&mut self, bytes: &[u8]) -> Option<(usize, usize)> {
|
||||
if bytes.len() < 4 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let key = to_key(bytes);
|
||||
match self.map.get(&key).copied() {
|
||||
Some(pos) => {
|
||||
let pos = pos as usize;
|
||||
let mut len = 4;
|
||||
while pos + len < self.bytes.len()
|
||||
&& len < bytes.len()
|
||||
&& self.bytes[pos + len] == bytes[len]
|
||||
{
|
||||
len += 1;
|
||||
match self.map.get_refresh(&key).copied() {
|
||||
Some(poses) => {
|
||||
let mut max_len = 0;
|
||||
let mut ans_pos = 0;
|
||||
for &pos in poses.iter() {
|
||||
if pos == 0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
let pos = pos as usize - 1;
|
||||
let mut len = 4;
|
||||
while pos + len < self.bytes.len()
|
||||
&& len < bytes.len()
|
||||
&& self.bytes[pos + len] == bytes[len]
|
||||
{
|
||||
len += 1;
|
||||
}
|
||||
|
||||
if len > max_len {
|
||||
max_len = len;
|
||||
ans_pos = pos;
|
||||
}
|
||||
}
|
||||
|
||||
Some((pos, len))
|
||||
Some((ans_pos, max_len))
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
|
@ -133,7 +193,7 @@ impl Default for CompactBytes {
|
|||
}
|
||||
}
|
||||
|
||||
/// Convert the first 4 btyes into u32
|
||||
/// Convert the first 4 bytes into u32
|
||||
fn to_key(bytes: &[u8]) -> u32 {
|
||||
u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]])
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue