feat: compact bytes init

This commit is contained in:
Zixuan Chen 2023-07-13 13:33:23 +08:00
parent 7cb6691cef
commit 8704d22750
3 changed files with 230 additions and 0 deletions

View file

@ -0,0 +1,10 @@
[package]
name = "compact-bytes"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
append-only-bytes = "0.1.8"
fxhash = "0.2.1"

View file

@ -0,0 +1,59 @@
# compact-bytes
It's an append-only bytes arena. Appending new bytes returns a pointer to a
slice of the append-only bytes. Whenever possible, it reuses previously
allocated bytes to reduce memory usage.
# Example
```rust
use compact_bytes::CompactBytes;
let mut arena = CompactBytes::new();
let bytes1 = arena.alloc(b"hello");
let bytes2 = arena.alloc(b"world");
assert_eq!(bytes1.as_bytes(), b"hello");
assert_eq!(bytes2.as_bytes(), b"world");
// bytes3 will be a pointer to the same bytes as bytes1
let bytes3 = arena.alloc(b"hello");
assert_eq!(bytes3.as_bytes(), b"hello");
assert_eq!(bytes3.start(), bytes1.start());
assert_eq!(bytes3.start(), 0);
assert_eq!(bytes3.end(), 5);
// Allocating short bytes will not reuse the old bytes.
// This makes merging neighboring slices easier, so that the bytes
// are more compact when serialized.
let mut bytes4 = arena.alloc(b"h");
assert_eq!(bytes4.start(), 10);
let bytes5 = arena.alloc(b"e");
assert_eq!(bytes5.start(), 11);
// bytes4 and bytes5 can be merged
assert_eq!(bytes4.can_merge(&bytes5), true);
assert!(bytes4.try_merge(&bytes5).is_ok());
```
In advance mode (`alloc_advance`), it will try to reuse the old bytes as much as possible.
To do so, it breaks the input into small pieces so that each piece can be reused.
```rust
use compact_bytes::CompactBytes;
use append_only_bytes::BytesSlice;
let mut arena = CompactBytes::new();
let bytes1 = arena.alloc(b"hello");
// it breaks the bytes into 3 pieces "hi ", "hello", " world"
let bytes2: Vec<BytesSlice> = arena.alloc_advance(b"hi hello world");
```
Or you can use `append` to not reuse the old bytes at all.
```rust
use compact_bytes::CompactBytes;
let mut arena = CompactBytes::new();
let bytes1 = arena.alloc(b"hello");
let bytes2 = arena.append(b"hello");
assert_ne!(bytes1.start(), bytes2.start());
```

View file

@ -0,0 +1,161 @@
#![doc = include_str!("../README.md")]
use append_only_bytes::{AppendOnlyBytes, BytesSlice};
use fxhash::FxHashMap;
// Memory note: the map holds one entry per indexed 4-byte window, i.e. roughly
// one entry for every position in the document. The original estimate is about
// 16 bytes per entry, so the hashtable takes roughly 16 * document_size bytes
// (estimate not verified here -- TODO confirm against the hash map's real layout).
/// An append-only bytes arena paired with an index of the 4-byte windows it
/// contains, used to find previously stored bytes that can be reused.
pub struct CompactBytes {
/// Backing storage; new data is only ever added at the end.
bytes: AppendOnlyBytes,
/// Maps 4 consecutive bytes (packed into a `u32` by `to_key`) to the position
/// in `bytes` where that window starts. Inserting an existing key overwrites
/// the stored position, so the most recently indexed occurrence wins.
map: FxHashMap<u32, u32>,
}
impl CompactBytes {
    /// Number of leading bytes that make up one lookup key in `map`.
    const KEY_LEN: usize = 4;

    /// Creates an empty arena with an empty lookup index.
    pub fn new() -> Self {
        CompactBytes {
            bytes: AppendOnlyBytes::new(),
            map: FxHashMap::default(),
        }
    }

    /// Creates an arena whose initial content is `bytes`, indexed for reuse.
    pub fn from_bytes(bytes: &[u8]) -> Self {
        let mut compact_bytes = CompactBytes::new();
        compact_bytes.append(bytes);
        compact_bytes
    }

    /// Returns a slice of the arena holding exactly `bytes`.
    ///
    /// If the indexed occurrence of the first four bytes can be extended to
    /// cover all of `bytes`, that existing range is returned and nothing is
    /// stored. Otherwise (partial match, no match, or fewer than four bytes)
    /// `bytes` is appended to the end of the arena.
    pub fn alloc(&mut self, bytes: &[u8]) -> BytesSlice {
        match self.lookup(bytes) {
            Some((position, length)) if length == bytes.len() => {
                self.bytes.slice(position..position + length)
            }
            _ => self.append(bytes),
        }
    }

    /// Stores `bytes` while reusing existing content as aggressively as possible.
    ///
    /// The input is scanned from left to right. Wherever the remaining input
    /// starts with an indexed 4-byte window, the matching arena range is
    /// reused; otherwise a single byte is appended and the scan moves on by
    /// one. Ranges that end up adjacent in the arena are merged. The returned
    /// slices, concatenated in order, equal `bytes`.
    pub fn alloc_advance(&mut self, bytes: &[u8]) -> Vec<BytesSlice> {
        // Each element is a half-open `(from_index, to_index)` range in the arena.
        let mut ranges: Vec<(usize, usize)> = vec![];

        /// Pushes `new`, extending the last range instead when `new` starts
        /// exactly where that range ends.
        fn push_merged(ranges: &mut Vec<(usize, usize)>, new: (usize, usize)) {
            if let Some(last) = ranges.last_mut() {
                if last.1 == new.0 {
                    last.1 = new.1;
                    return;
                }
            }
            ranges.push(new);
        }

        let mut index = 0;
        while index < bytes.len() {
            match self.lookup(&bytes[index..]) {
                Some((pos, len)) => {
                    push_merged(&mut ranges, (pos, pos + len));
                    index += len;
                }
                None => {
                    let old_len = self.bytes.len();
                    push_merged(&mut ranges, (old_len, old_len + 1));
                    self.bytes.push(bytes[index]);
                    // Index the windows completed by this byte. Without this,
                    // content stored through `alloc_advance` could never be
                    // found (and reused) by later allocations.
                    self.append_new_entries_to_map(old_len);
                    index += 1;
                }
            }
        }

        ranges
            .into_iter()
            .map(|(from, to)| self.bytes.slice(from..to))
            .collect()
    }

    /// Appends `bytes` to the end of the arena without trying to reuse
    /// existing content, indexes the new data, and returns the new range.
    pub fn append(&mut self, bytes: &[u8]) -> BytesSlice {
        let old_len = self.bytes.len();
        self.bytes.push_slice(bytes);
        self.append_new_entries_to_map(old_len);
        self.bytes.slice(old_len..old_len + bytes.len())
    }

    /// Adds to the map every 4-byte window that became complete since the
    /// arena had length `old_len`.
    ///
    /// Examples:
    /// - old doc = "", append "0123": adds "0123".
    /// - old doc = "0123", append "x": adds "123x".
    /// - old doc = "0123", append "xyz": adds "123x", "23xy", "3xyz".
    fn append_new_entries_to_map(&mut self, old_len: usize) {
        // A window starting at `i` needs `i + 4 <= len`; windows that start up
        // to 3 bytes before `old_len` were incomplete until now.
        let first = old_len.saturating_sub(Self::KEY_LEN - 1);
        let end = self.bytes.len().saturating_sub(Self::KEY_LEN - 1);
        for i in first..end {
            match u32::try_from(i) {
                Ok(pos) => {
                    let key = to_key(&self.bytes[i..i + Self::KEY_LEN]);
                    self.map.insert(key, pos);
                }
                // Positions are stored as `u32`. A truncating cast would make
                // the map point at unrelated bytes, and `lookup` trusts the
                // first four bytes of a hit, so stop indexing instead. Data
                // past this point is still stored, just never reused.
                Err(_) => break,
            }
        }
    }

    /// Finds stored bytes matching a prefix of `bytes`.
    ///
    /// Looks up the first four bytes in the map and extends the match at the
    /// indexed position byte by byte. Returns `Some((position, length))` with
    /// `length >= 4`, or `None` when `bytes` is shorter than four bytes or its
    /// first four bytes are not indexed.
    fn lookup(&self, bytes: &[u8]) -> Option<(usize, usize)> {
        if bytes.len() < Self::KEY_LEN {
            return None;
        }

        let pos = *self.map.get(&to_key(bytes))? as usize;
        // The key is the exact 4-byte window, so a hit already matches 4 bytes.
        let mut len = Self::KEY_LEN;
        while pos + len < self.bytes.len()
            && len < bytes.len()
            && self.bytes[pos + len] == bytes[len]
        {
            len += 1;
        }
        Some((pos, len))
    }
}
impl Default for CompactBytes {
fn default() -> Self {
Self::new()
}
}
/// Packs the first 4 bytes of `bytes` into a `u32`, most significant byte first.
///
/// Any bytes after the fourth are ignored.
///
/// # Panics
///
/// Panics if `bytes` holds fewer than 4 bytes.
fn to_key(bytes: &[u8]) -> u32 {
    let (b0, b1, b2, b3) = (bytes[0], bytes[1], bytes[2], bytes[3]);
    (u32::from(b0) << 24) | (u32::from(b1) << 16) | (u32::from(b2) << 8) | u32::from(b3)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `alloc` hands back an existing range only when the whole input is
    /// already stored; otherwise the input is appended at the end.
    #[test]
    fn it_works() {
        let mut arena = CompactBytes::new();
        let first = arena.alloc(b"12345");

        // Identical content: the original range is returned.
        let repeated = arena.alloc(b"12345");
        assert_eq!(repeated.start(), 0);
        assert_eq!(repeated.end(), 5);

        // A stored sub-run is found as well.
        let inner = arena.alloc(b"2345");
        assert_eq!(inner.start(), 1);
        assert_eq!(inner.end(), 5);

        // Only a prefix is stored, so the full input gets appended.
        let appended = arena.alloc(b"23456");
        assert_eq!(appended.start(), 5);
        assert_eq!(appended.end(), 10);

        assert_eq!(first.as_bytes(), b"12345");
    }

    /// `alloc_advance` splits its input into appended and reused pieces.
    #[test]
    fn advance() {
        let mut arena = CompactBytes::new();
        arena.append(b"123456789");
        let pieces = arena.alloc_advance(b"haha12345567891234");

        // (expected length, expected start) of each piece, in order.
        let expected = [(4, 9), (5, 0), (5, 4), (4, 0)];
        for (i, &(len, start)) in expected.iter().enumerate() {
            assert_eq!(pieces[i].len(), len);
            assert_eq!(pieces[i].start(), start);
        }
    }
}