index: don't store commit ids in sorted lookup table to save disk space

This reduces the index file size. In my linux mirror repo containing 1591524
commits, the initial index file shrank from 122MB to 92MB. In theory, this
makes commit id lookup slow because of additional indirection and cache miss,
but I don't see significant difference. In mid-size repo, this is actually a
bit faster thanks to smaller index reads.

Alternatively, the commit id field could be removed from the CommitGraphEntry,
but doing that would introduce indirect lookup there, and the index disk size
isn't as small as this change.

- jj-0 baseline                         122MB
- jj-1 shrink CommitLookupEntry (this)   92MB
- jj-3 shrink CommitGraphEntry           98MB

Mid-size repo, "log" with default template
```
% hyperfine --sort command --warmup 3 --runs 20 -L bin jj-0,jj-1,jj-2,jj-3 \
  -s "target/release-with-debug/{bin} -R ~/mirrors/linux debug reindex" \
  "target/release-with-debug/{bin} -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=\"\"'"
Benchmark 1: target/release-with-debug/jj-0 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""'
  Time (mean ± σ):     177.7 ms ±  12.9 ms    [User: 96.3 ms, System: 81.5 ms]
  Range (min … max):   156.8 ms … 191.2 ms    20 runs

Benchmark 2: target/release-with-debug/jj-1 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""'
  Time (mean ± σ):     169.8 ms ±  13.8 ms    [User: 93.3 ms, System: 76.6 ms]
  Range (min … max):   151.1 ms … 191.5 ms    20 runs

Benchmark 4: target/release-with-debug/jj-3 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""'
  Time (mean ± σ):     170.3 ms ±  13.4 ms    [User: 90.1 ms, System: 79.7 ms]
  Range (min … max):   154.8 ms … 186.2 ms    20 runs

Relative speed comparison
        1.05 ±  0.11  target/release-with-debug/jj-0 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""'
        1.00          target/release-with-debug/jj-1 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""'
        1.00 ±  0.11  target/release-with-debug/jj-3 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""'
```

Small repo, "log" thousands of commits with -T"commit_id.shortest()"
```
% hyperfine --sort command --warmup 3 --runs 100 -L bin jj-0,jj-1,jj-2,jj-3 \
  -s "target/release-with-debug/{bin} -R ~/mirrors/git debug reindex" \
  "target/release-with-debug/{bin} -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=\"\"'"
Benchmark 1: target/release-with-debug/jj-0 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""'
  Time (mean ± σ):     179.3 ms ±  12.8 ms    [User: 149.7 ms, System: 29.6 ms]
  Range (min … max):   155.2 ms … 191.0 ms    100 runs

Benchmark 2: target/release-with-debug/jj-1 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""'
  Time (mean ± σ):     179.1 ms ±  13.7 ms    [User: 148.5 ms, System: 30.5 ms]
  Range (min … max):   157.2 ms … 196.7 ms    100 runs

Benchmark 4: target/release-with-debug/jj-3 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""'
  Time (mean ± σ):     178.2 ms ±  13.6 ms    [User: 148.7 ms, System: 29.6 ms]
  Range (min … max):   156.5 ms … 191.7 ms    100 runs

Relative speed comparison
        1.01 ±  0.11  target/release-with-debug/jj-0 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""'
        1.01 ±  0.11  target/release-with-debug/jj-1 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""'
        1.01 ±  0.11  target/release-with-debug/jj-3 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""'
```
This commit is contained in:
Yuya Nishihara 2023-12-19 19:53:42 +09:00
parent c75230747a
commit 0c0eb37f2e
2 changed files with 22 additions and 42 deletions

View file

@ -254,8 +254,7 @@ impl MutableIndexSegment {
buf.extend_from_slice(entry.commit_id.as_bytes()); buf.extend_from_slice(entry.commit_id.as_bytes());
} }
for (commit_id, LocalPosition(pos)) in &self.commit_lookup { for LocalPosition(pos) in self.commit_lookup.values() {
buf.extend_from_slice(commit_id.as_bytes());
buf.extend(pos.to_le_bytes()); buf.extend(pos.to_le_bytes());
} }

View file

@ -73,7 +73,7 @@ impl ReadonlyIndexLoadError {
} }
/// Current format version of the index segment file. /// Current format version of the index segment file.
pub(crate) const INDEX_SEGMENT_FILE_FORMAT_VERSION: u32 = 5; pub(crate) const INDEX_SEGMENT_FILE_FORMAT_VERSION: u32 = 6;
/// If set, the value is stored in the overflow table. /// If set, the value is stored in the overflow table.
pub(crate) const OVERFLOW_FLAG: u32 = 0x8000_0000; pub(crate) const OVERFLOW_FLAG: u32 = 0x8000_0000;
@ -133,33 +133,13 @@ impl CommitGraphEntry<'_> {
u32::from_le_bytes(self.data[12..16].try_into().unwrap()) u32::from_le_bytes(self.data[12..16].try_into().unwrap())
} }
fn commit_id(&self) -> CommitId {
CommitId::from_bytes(&self.data[16..])
}
}
struct CommitLookupEntry<'a> {
data: &'a [u8],
commit_id_length: usize,
}
impl CommitLookupEntry<'_> {
fn size(commit_id_length: usize) -> usize {
commit_id_length + 4
}
fn commit_id(&self) -> CommitId { fn commit_id(&self) -> CommitId {
CommitId::from_bytes(self.commit_id_bytes()) CommitId::from_bytes(self.commit_id_bytes())
} }
// might be better to add borrowed version of CommitId // might be better to add borrowed version of CommitId
fn commit_id_bytes(&self) -> &[u8] { fn commit_id_bytes(&self) -> &[u8] {
&self.data[0..self.commit_id_length] &self.data[16..]
}
fn local_pos(&self) -> LocalPosition {
let pos = u32::from_le_bytes(self.data[self.commit_id_length..][..4].try_into().unwrap());
LocalPosition(pos)
} }
} }
@ -188,7 +168,6 @@ impl CommitLookupEntry<'_> {
/// u32: change id position in the sorted change ids table /// u32: change id position in the sorted change ids table
/// <commit id length number of bytes>: commit id /// <commit id length number of bytes>: commit id
/// for each entry, sorted by commit id: /// for each entry, sorted by commit id:
/// <commit id length number of bytes>: commit id
/// u32: local position in the graph entries table /// u32: local position in the graph entries table
/// for each entry, sorted by change id: /// for each entry, sorted by change id:
/// <change id length number of bytes>: change id /// <change id length number of bytes>: change id
@ -324,8 +303,7 @@ impl ReadonlyIndexSegment {
let commit_graph_entry_size = CommitGraphEntry::size(commit_id_length); let commit_graph_entry_size = CommitGraphEntry::size(commit_id_length);
let graph_size = (num_local_commits as usize) * commit_graph_entry_size; let graph_size = (num_local_commits as usize) * commit_graph_entry_size;
let commit_lookup_entry_size = CommitLookupEntry::size(commit_id_length); let commit_lookup_size = (num_local_commits as usize) * 4;
let commit_lookup_size = (num_local_commits as usize) * commit_lookup_entry_size;
let change_id_table_size = (num_local_change_ids as usize) * change_id_length; let change_id_table_size = (num_local_change_ids as usize) * change_id_length;
let change_pos_table_size = (num_local_change_ids as usize) * 4; let change_pos_table_size = (num_local_change_ids as usize) * 4;
let parent_overflow_size = (num_parent_overflow_entries as usize) * 4; let parent_overflow_size = (num_parent_overflow_entries as usize) * 4;
@ -389,14 +367,10 @@ impl ReadonlyIndexSegment {
} }
} }
fn commit_lookup_entry(&self, lookup_pos: u32) -> CommitLookupEntry { fn commit_lookup_pos(&self, lookup_pos: u32) -> LocalPosition {
let table = &self.data[self.commit_lookup_base..self.change_id_table_base]; let table = &self.data[self.commit_lookup_base..self.change_id_table_base];
let entry_size = CommitLookupEntry::size(self.commit_id_length); let offset = (lookup_pos as usize) * 4;
let offset = (lookup_pos as usize) * entry_size; LocalPosition(u32::from_le_bytes(table[offset..][..4].try_into().unwrap()))
CommitLookupEntry {
data: &table[offset..][..entry_size],
commit_id_length: self.commit_id_length,
}
} }
fn change_lookup_id(&self, lookup_pos: u32) -> ChangeId { fn change_lookup_id(&self, lookup_pos: u32) -> ChangeId {
@ -438,7 +412,8 @@ impl ReadonlyIndexSegment {
/// Binary searches commit id by `prefix`. Returns the lookup position. /// Binary searches commit id by `prefix`. Returns the lookup position.
fn commit_id_byte_prefix_to_lookup_pos(&self, prefix: &[u8]) -> PositionLookupResult { fn commit_id_byte_prefix_to_lookup_pos(&self, prefix: &[u8]) -> PositionLookupResult {
binary_search_pos_by(self.num_local_commits, |pos| { binary_search_pos_by(self.num_local_commits, |pos| {
let entry = self.commit_lookup_entry(pos); let local_pos = self.commit_lookup_pos(pos);
let entry = self.graph_entry(local_pos);
entry.commit_id_bytes().cmp(prefix) entry.commit_id_bytes().cmp(prefix)
}) })
} }
@ -470,11 +445,9 @@ impl IndexSegment for ReadonlyIndexSegment {
} }
fn commit_id_to_pos(&self, commit_id: &CommitId) -> Option<LocalPosition> { fn commit_id_to_pos(&self, commit_id: &CommitId) -> Option<LocalPosition> {
let lookup_pos = self self.commit_id_byte_prefix_to_lookup_pos(commit_id.as_bytes())
.commit_id_byte_prefix_to_lookup_pos(commit_id.as_bytes()) .ok()
.ok()?; .map(|pos| self.commit_lookup_pos(pos))
let entry = self.commit_lookup_entry(lookup_pos);
Some(entry.local_pos())
} }
fn resolve_neighbor_commit_ids( fn resolve_neighbor_commit_ids(
@ -482,12 +455,20 @@ impl IndexSegment for ReadonlyIndexSegment {
commit_id: &CommitId, commit_id: &CommitId,
) -> (Option<CommitId>, Option<CommitId>) { ) -> (Option<CommitId>, Option<CommitId>) {
self.commit_id_byte_prefix_to_lookup_pos(commit_id.as_bytes()) self.commit_id_byte_prefix_to_lookup_pos(commit_id.as_bytes())
.map_neighbors(|pos| self.commit_lookup_entry(pos).commit_id()) .map_neighbors(|pos| {
let local_pos = self.commit_lookup_pos(pos);
let entry = self.graph_entry(local_pos);
entry.commit_id()
})
} }
fn resolve_commit_id_prefix(&self, prefix: &HexPrefix) -> PrefixResolution<CommitId> { fn resolve_commit_id_prefix(&self, prefix: &HexPrefix) -> PrefixResolution<CommitId> {
self.commit_id_byte_prefix_to_lookup_pos(prefix.min_prefix_bytes()) self.commit_id_byte_prefix_to_lookup_pos(prefix.min_prefix_bytes())
.prefix_matches(prefix, |pos| self.commit_lookup_entry(pos).commit_id()) .prefix_matches(prefix, |pos| {
let local_pos = self.commit_lookup_pos(pos);
let entry = self.graph_entry(local_pos);
entry.commit_id()
})
.map(|(id, _)| id) .map(|(id, _)| id)
} }