refactor(bytes): refine interface

This commit is contained in:
Zixuan Chen 2023-07-13 15:33:49 +08:00
parent 8704d22750
commit f604a89fc3
3 changed files with 62 additions and 21 deletions

8
Cargo.lock generated
View file

@ -291,6 +291,14 @@ dependencies = [
"termcolor",
]
[[package]]
name = "compact-bytes"
version = "0.1.0"
dependencies = [
"append-only-bytes",
"fxhash",
]
[[package]]
name = "console_error_panic_hook"
version = "0.1.7"

View file

@ -39,12 +39,11 @@ So it will break the bytes into small pieces to reuse them.
```rust
use compact_bytes::CompactBytes;
use append_only_bytes::BytesSlice;
let mut arena = CompactBytes::new();
let bytes1 = arena.alloc(b"hello");
// it breaks the bytes into 3 pieces "hi ", "hello", " world"
let bytes2: Vec<BytesSlice> = arena.alloc_advance(b"hi hello world");
let bytes2: Vec<Range<usize>> = arena.alloc_advance(b"hi hello world");
```
Alternatively, use `append` to store the bytes without reusing any of the old bytes.

View file

@ -1,10 +1,12 @@
#![doc = include_str!("../README.md")]
use std::ops::Range;
use append_only_bytes::{AppendOnlyBytes, BytesSlice};
use fxhash::FxHashMap;
// One entry in the hashtable takes 16 bytes, and we need one entry for every position in the document,
// so the size of the hashtable will be 16 * document_size.
// One entry in the hashtable takes 16 ~ 32 bytes, and we need one entry for every position in the document,
// so the size of the hashtable will be (16 ~ 32) * document_size.
pub struct CompactBytes {
bytes: AppendOnlyBytes,
/// map 4 bytes to position in the document
@ -34,14 +36,18 @@ impl CompactBytes {
self.append(bytes)
}
pub fn alloc_advance(&mut self, bytes: &[u8]) -> Vec<BytesSlice> {
// ans is Vec<(from_index, to_index)>
let mut ans: Vec<(usize, usize)> = vec![];
/// Returns the stored bytes as a slice, as exposed by the inner `AppendOnlyBytes` buffer.
pub fn as_bytes(&self) -> &[u8] {
self.bytes.as_bytes()
}
fn push(ans: &mut Vec<(usize, usize)>, new: (usize, usize)) {
pub fn alloc_advance(&mut self, bytes: &[u8]) -> Vec<Range<usize>> {
let old_len = self.bytes.len();
let mut ans: Vec<Range<usize>> = vec![];
// Pushes `new` onto `ans`, merging it into the last range when `new` starts exactly where that range ends.
fn push_with_merge(ans: &mut Vec<Range<usize>>, new: Range<usize>) {
if let Some(last) = ans.last_mut() {
if last.1 == new.0 {
last.1 = new.1;
if last.end == new.start {
last.end = new.end;
return;
}
}
@ -53,20 +59,20 @@ impl CompactBytes {
while index < bytes.len() {
match self.lookup(&bytes[index..]) {
Some((pos, len)) => {
push(&mut ans, (pos, pos + len));
push_with_merge(&mut ans, pos..pos + len);
index += len;
}
None => {
push(&mut ans, (self.bytes.len(), self.bytes.len() + 1));
push_with_merge(&mut ans, self.bytes.len()..self.bytes.len() + 1);
self.bytes.push(bytes[index]);
index += 1;
}
}
}
ans.into_iter()
.map(|(from, to)| self.bytes.slice(from..to))
.collect()
self.append_new_entries_to_map(old_len);
ans
}
pub fn append(&mut self, bytes: &[u8]) -> BytesSlice {
@ -81,8 +87,16 @@ impl CompactBytes {
// if old doc = "", append "0123", then we need to add "0123" entry to the map
// if old doc = "0123", append "x", then we need to add "123x" entry to the map
// if old doc = "0123", append "xyz", then we need to add "123x", "23xy", "3xyz" entries to the map
let mut key = 0;
let mut is_first = true;
for i in old_len.saturating_sub(3)..self.bytes.len().saturating_sub(3) {
let key = to_key(&self.bytes[i..i + 4]);
if is_first {
key = to_key(&self.bytes[i..i + 4]);
is_first = false;
} else {
key = (key << 8) | self.bytes[i + 3] as u32;
}
self.map.insert(key, i as u32);
}
}
@ -97,7 +111,7 @@ impl CompactBytes {
let key = to_key(bytes);
match self.map.get(&key).copied() {
Some(pos) => {
let mut pos = pos as usize;
let pos = pos as usize;
let mut len = 4;
while pos + len < self.bytes.len()
&& len < bytes.len()
@ -149,13 +163,33 @@ mod tests {
let mut bytes = CompactBytes::new();
bytes.append(b"123456789");
let ans = bytes.alloc_advance(b"haha12345567891234");
assert_eq!(ans.len(), 4);
assert_eq!(ans[0].len(), 4);
assert_eq!(ans[0].start(), 9);
assert_eq!(ans[0].start, 9);
assert_eq!(ans[1].len(), 5);
assert_eq!(ans[1].start(), 0);
assert_eq!(ans[1].start, 0);
assert_eq!(ans[2].len(), 5);
assert_eq!(ans[2].start(), 4);
assert_eq!(ans[2].start, 4);
assert_eq!(ans[3].len(), 4);
assert_eq!(ans[3].start(), 0);
assert_eq!(ans[3].start, 0);
}
#[test]
fn advance_alloc_should_be_indexed_as_well() {
    // Bytes stored through `alloc_advance` are expected to be found again by a
    // later `alloc`, so the second allocation should start at offset 0.
    let mut arena = CompactBytes::new();
    arena.alloc_advance(b"1234");
    let reused = arena.alloc(b"1234");
    assert_eq!(reused.start(), 0);
}
#[test]
fn advance_should_use_longer_match() {
    // The stored document contains "1234" several times; the test expects the
    // allocation to be split into exactly three ranges of lengths 1, 6 and 2.
    let mut arena = CompactBytes::new();
    arena.append(b"1234kk 123456 1234xyz");
    let ranges = arena.alloc_advance(b"012345678");
    let lengths: Vec<usize> = ranges.iter().map(|range| range.len()).collect();
    assert_eq!(lengths, [1, 6, 2]);
}
}