From 0c0eb37f2e408dce2fe157437f604a3b3d796fa0 Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Tue, 19 Dec 2023 19:53:42 +0900 Subject: [PATCH] index: don't store commit ids in sorted lookup table to save disk space MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reduces the index file size. In my linux mirror repo containing 1591524 commits, the initial index file shrank from 122MB to 92MB. In theory, this makes commit id lookup slow because of additional indirection and cache miss, but I don't see significant difference. In mid-size repo, this is actually a bit faster thanks to smaller index reads. Alternatively, the commit id field could be removed from the CommitGraphEntry, but doing that would introduce indirect lookup there, and the index disk size isn't as small as this change. - jj-0 baseline 122MB - jj-1 shrink CommitLookupEntry (this) 92MB - jj-3 shrink CommitGraphEntry 98MB Mid-size repo, "log" with default template ``` % hyperfine --sort command --warmup 3 --runs 20 -L bin jj-0,jj-1,jj-2,jj-3 \ -s "target/release-with-debug/{bin} -R ~/mirrors/linux debug reindex" \ "target/release-with-debug/{bin} -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=\"\"'" Benchmark 1: target/release-with-debug/jj-0 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""' Time (mean ± σ): 177.7 ms ± 12.9 ms [User: 96.3 ms, System: 81.5 ms] Range (min … max): 156.8 ms … 191.2 ms 20 runs Benchmark 2: target/release-with-debug/jj-1 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""' Time (mean ± σ): 169.8 ms ± 13.8 ms [User: 93.3 ms, System: 76.6 ms] Range (min … max): 151.1 ms … 191.5 ms 20 runs Benchmark 4: target/release-with-debug/jj-3 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""' Time (mean ± σ): 170.3 ms ± 13.4 ms [User: 90.1 ms, System: 79.7 ms] Range (min … max): 154.8 ms … 186.2 ms 20 runs Relative speed comparison 1.05 ± 0.11 target/release-with-debug/jj-0 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""' 1.00 target/release-with-debug/jj-1 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""' 1.00 ± 0.11 target/release-with-debug/jj-3 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""' ``` Small repo, "log" thousands of commits with -T"commit_id.shortest()" ``` % hyperfine --sort command --warmup 3 --runs 100 -L bin jj-0,jj-1,jj-2,jj-3 \ -s "target/release-with-debug/{bin} -R ~/mirrors/git debug reindex" \ "target/release-with-debug/{bin} -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=\"\"'" Benchmark 1: target/release-with-debug/jj-0 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""' Time (mean ± σ): 179.3 ms ± 12.8 ms [User: 149.7 ms, System: 29.6 ms] Range (min … max): 155.2 ms … 191.0 ms 100 runs Benchmark 2: target/release-with-debug/jj-1 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""' Time (mean ± σ): 179.1 ms ± 13.7 ms [User: 148.5 ms, System: 30.5 ms] Range (min … max): 157.2 ms … 196.7 ms 100 runs Benchmark 4: target/release-with-debug/jj-3 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""' Time (mean ± σ): 178.2 ms ± 13.6 ms [User: 148.7 ms, System: 29.6 ms] Range (min … max): 156.5 ms … 191.7 ms 100 runs Relative speed comparison 1.01 ± 0.11 target/release-with-debug/jj-0 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""' 1.01 ± 0.11 target/release-with-debug/jj-1 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""' 1.01 ± 0.11 target/release-with-debug/jj-3 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""' ``` --- lib/src/default_index/mutable.rs | 3 +- lib/src/default_index/readonly.rs | 61 +++++++++++-------------------- 2 files changed, 22 insertions(+), 42 deletions(-) diff --git a/lib/src/default_index/mutable.rs b/lib/src/default_index/mutable.rs index d979af78d..f88c121c5 100644 --- a/lib/src/default_index/mutable.rs +++ b/lib/src/default_index/mutable.rs @@ -254,8 +254,7 @@ impl MutableIndexSegment { buf.extend_from_slice(entry.commit_id.as_bytes()); } - for (commit_id, LocalPosition(pos)) in &self.commit_lookup { - buf.extend_from_slice(commit_id.as_bytes()); + for LocalPosition(pos) in self.commit_lookup.values() { buf.extend(pos.to_le_bytes()); } diff --git a/lib/src/default_index/readonly.rs b/lib/src/default_index/readonly.rs index 3c33fca4c..d8822b3cd 100644 --- a/lib/src/default_index/readonly.rs +++ b/lib/src/default_index/readonly.rs @@ -73,7 +73,7 @@ impl ReadonlyIndexLoadError { } /// Current format version of the index segment file. -pub(crate) const INDEX_SEGMENT_FILE_FORMAT_VERSION: u32 = 5; +pub(crate) const INDEX_SEGMENT_FILE_FORMAT_VERSION: u32 = 6; /// If set, the value is stored in the overflow table. pub(crate) const OVERFLOW_FLAG: u32 = 0x8000_0000; @@ -133,33 +133,13 @@ impl CommitGraphEntry<'_> { u32::from_le_bytes(self.data[12..16].try_into().unwrap()) } - fn commit_id(&self) -> CommitId { - CommitId::from_bytes(&self.data[16..]) - } -} - -struct CommitLookupEntry<'a> { - data: &'a [u8], - commit_id_length: usize, -} - -impl CommitLookupEntry<'_> { - fn size(commit_id_length: usize) -> usize { - commit_id_length + 4 - } - fn commit_id(&self) -> CommitId { CommitId::from_bytes(self.commit_id_bytes()) } // might be better to add borrowed version of CommitId fn commit_id_bytes(&self) -> &[u8] { - &self.data[0..self.commit_id_length] - } - - fn local_pos(&self) -> LocalPosition { - let pos = u32::from_le_bytes(self.data[self.commit_id_length..][..4].try_into().unwrap()); - LocalPosition(pos) + &self.data[16..] } } @@ -188,7 +168,6 @@ impl CommitLookupEntry<'_> { /// u32: change id position in the sorted change ids table /// : commit id /// for each entry, sorted by commit id: -/// : commit id /// u32: local position in the graph entries table /// for each entry, sorted by change id: /// : change id @@ -324,8 +303,7 @@ impl ReadonlyIndexSegment { let commit_graph_entry_size = CommitGraphEntry::size(commit_id_length); let graph_size = (num_local_commits as usize) * commit_graph_entry_size; - let commit_lookup_entry_size = CommitLookupEntry::size(commit_id_length); - let commit_lookup_size = (num_local_commits as usize) * commit_lookup_entry_size; + let commit_lookup_size = (num_local_commits as usize) * 4; let change_id_table_size = (num_local_change_ids as usize) * change_id_length; let change_pos_table_size = (num_local_change_ids as usize) * 4; let parent_overflow_size = (num_parent_overflow_entries as usize) * 4; @@ -389,14 +367,10 @@ impl ReadonlyIndexSegment { } } - fn commit_lookup_entry(&self, lookup_pos: u32) -> CommitLookupEntry { + fn commit_lookup_pos(&self, lookup_pos: u32) -> LocalPosition { let table = &self.data[self.commit_lookup_base..self.change_id_table_base]; - let entry_size = CommitLookupEntry::size(self.commit_id_length); - let offset = (lookup_pos as usize) * entry_size; - CommitLookupEntry { - data: &table[offset..][..entry_size], - commit_id_length: self.commit_id_length, - } + let offset = (lookup_pos as usize) * 4; + LocalPosition(u32::from_le_bytes(table[offset..][..4].try_into().unwrap())) } fn change_lookup_id(&self, lookup_pos: u32) -> ChangeId { @@ -438,7 +412,8 @@ impl ReadonlyIndexSegment { /// Binary searches commit id by `prefix`. Returns the lookup position. fn commit_id_byte_prefix_to_lookup_pos(&self, prefix: &[u8]) -> PositionLookupResult { binary_search_pos_by(self.num_local_commits, |pos| { - let entry = self.commit_lookup_entry(pos); + let local_pos = self.commit_lookup_pos(pos); + let entry = self.graph_entry(local_pos); entry.commit_id_bytes().cmp(prefix) }) } @@ -470,11 +445,9 @@ impl IndexSegment for ReadonlyIndexSegment { } fn commit_id_to_pos(&self, commit_id: &CommitId) -> Option { - let lookup_pos = self - .commit_id_byte_prefix_to_lookup_pos(commit_id.as_bytes()) - .ok()?; - let entry = self.commit_lookup_entry(lookup_pos); - Some(entry.local_pos()) + self.commit_id_byte_prefix_to_lookup_pos(commit_id.as_bytes()) + .ok() + .map(|pos| self.commit_lookup_pos(pos)) } fn resolve_neighbor_commit_ids( @@ -482,12 +455,20 @@ impl IndexSegment for ReadonlyIndexSegment { commit_id: &CommitId, ) -> (Option, Option) { self.commit_id_byte_prefix_to_lookup_pos(commit_id.as_bytes()) - .map_neighbors(|pos| self.commit_lookup_entry(pos).commit_id()) + .map_neighbors(|pos| { + let local_pos = self.commit_lookup_pos(pos); + let entry = self.graph_entry(local_pos); + entry.commit_id() + }) } fn resolve_commit_id_prefix(&self, prefix: &HexPrefix) -> PrefixResolution { self.commit_id_byte_prefix_to_lookup_pos(prefix.min_prefix_bytes()) - .prefix_matches(prefix, |pos| self.commit_lookup_entry(pos).commit_id()) + .prefix_matches(prefix, |pos| { + let local_pos = self.commit_lookup_pos(pos); + let entry = self.graph_entry(local_pos); + entry.commit_id() + }) .map(|(id, _)| id) }