From 62ce5782b53225f499fabfd2164264695512bfde Mon Sep 17 00:00:00 2001 From: Martin von Zweigbergk Date: Tue, 16 Feb 2021 11:50:28 -0800 Subject: [PATCH] index: when writing incremental index, squash into parent file if smaller We currently write a new incremental index file every time. That means that the stack of index files quickly gets deep, which makes it slow to read the index. This commit makes it so that we squash the new index segment into its parent if the parent has fewer commits. That means we'll limit the number of files to O(log n). Writes time will also be O(log n) on average. --- lib/src/index.rs | 50 +++++++++++++++++++++++++- lib/tests/test_index.rs | 77 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 119 insertions(+), 8 deletions(-) diff --git a/lib/src/index.rs b/lib/src/index.rs index 1bbf683ac..08149277e 100644 --- a/lib/src/index.rs +++ b/lib/src/index.rs @@ -466,6 +466,46 @@ impl MutableIndex { buf } + /// If the MutableIndex has more commits than its parent ReadonlyIndex, + /// return MutableIndex with the commits from both. This is done + /// recursively, so the stack of index files has O(log n) files. + fn maybe_squash_with_ancestors(self) -> MutableIndex { + if self.parent_file.is_none() { + return self; + } + + let mut num_new_commits = self.segment_num_commits(); + let mut parent_file = self.parent_file.as_ref().unwrap().clone(); + let mut squashed; + loop { + // TODO: We should probably also squash if the parent file has less than N + // commits, regardless of how many (few) are in `self`. + if num_new_commits < parent_file.segment_num_commits() { + squashed = MutableIndex::incremental(parent_file); + break; + } + if parent_file.parent_file.is_none() { + squashed = MutableIndex::full(self.dir.clone(), self.hash_length); + break; + } + num_new_commits += parent_file.segment_num_commits(); + parent_file = parent_file.parent_file.as_ref().unwrap().clone(); + } + + // TODO: This can be made more efficient by walking the parent files in order + // and not looking up via `self`. + for pos in squashed.num_parent_commits..self.num_commits() { + let entry = self.entry_by_pos(pos); + let parent_ids: Vec<_> = entry + .parents_positions() + .iter() + .map(|pos| self.entry_by_pos(*pos).commit_id()) + .collect(); + squashed.add_commit_data(entry.commit_id(), parent_ids); + } + squashed + } + pub fn save(self) -> io::Result> { if self.segment_num_commits() == 0 && self.parent_file.is_some() { return Ok(self.parent_file.unwrap()); @@ -473,8 +513,8 @@ impl MutableIndex { let hash_length = self.hash_length; let dir = self.dir.clone(); - let buf = self.serialize(); + let buf = self.maybe_squash_with_ancestors().serialize(); let mut hasher = Blake2b::new(); hasher.update(&buf); let index_file_id_hex = hex::encode(&hasher.finalize()); @@ -509,6 +549,10 @@ impl MutableIndex { CompositeIndex(self).entry_by_id(commit_id) } + pub fn entry_by_pos(&self, pos: u32) -> IndexEntry { + CompositeIndex(self).entry_by_pos(pos) + } + pub fn has_id(&self, commit_id: &CommitId) -> bool { CompositeIndex(self).has_id(commit_id) } @@ -1235,6 +1279,10 @@ impl ReadonlyIndex { CompositeIndex(self).entry_by_id(commit_id) } + pub fn entry_by_pos(&self, pos: u32) -> IndexEntry { + CompositeIndex(self).entry_by_pos(pos) + } + pub fn has_id(&self, commit_id: &CommitId) -> bool { CompositeIndex(self).has_id(commit_id) } diff --git a/lib/tests/test_index.rs b/lib/tests/test_index.rs index 7cb3b86b9..91ed88e1c 100644 --- a/lib/tests/test_index.rs +++ b/lib/tests/test_index.rs @@ -19,6 +19,7 @@ use jujube_lib::repo::ReadonlyRepo; use jujube_lib::settings::UserSettings; use jujube_lib::store::CommitId; use jujube_lib::testutils; +use jujube_lib::testutils::create_random_commit; use std::sync::Arc; use test_case::test_case; @@ -337,13 +338,8 @@ fn test_index_commits_incremental(use_git: bool) { assert_eq!(stats.num_commits, 2 + 3); assert_eq!(stats.num_merges, 0); assert_eq!(stats.max_generation_number, 3); - assert_eq!(stats.levels.len(), 3); - assert_eq!(stats.levels[0].num_commits, 2); - assert_eq!(stats.levels[1].num_commits, 1); - assert_ne!(stats.levels[1].name, stats.levels[0].name); - assert_eq!(stats.levels[2].num_commits, 2); - assert_ne!(stats.levels[2].name, stats.levels[0].name); - assert_ne!(stats.levels[2].name, stats.levels[1].name); + assert_eq!(stats.levels.len(), 1); + assert_eq!(stats.levels[0].num_commits, 5); assert_eq!(generation_number(index.clone(), root_commit.id()), 0); assert_eq!(generation_number(index.clone(), commit_a.id()), 1); @@ -389,6 +385,7 @@ fn test_index_commits_incremental_empty_transaction(use_git: bool) { assert_eq!(stats.levels.len(), 2); assert_eq!(stats.levels[0].num_commits, 2); assert_eq!(stats.levels[1].num_commits, 1); + assert_ne!(stats.levels[1].name, stats.levels[0].name); assert_eq!(generation_number(index.clone(), root_commit.id()), 0); assert_eq!(generation_number(index.clone(), commit_a.id()), 1); @@ -419,3 +416,69 @@ fn test_index_commits_incremental_already_indexed(use_git: bool) { assert_eq!(tx.index().num_commits(), 2 + 1); tx.discard(); } + +fn create_n_commits(settings: &UserSettings, repo: &mut Arc, num_commits: i32) { + let mut tx = repo.start_transaction("test"); + for _ in 0..num_commits { + create_random_commit(settings, repo).write_to_transaction(&mut tx); + } + tx.commit(); + Arc::get_mut(repo).unwrap().reload(); +} + +fn commits_by_level(repo: &ReadonlyRepo) -> Vec { + repo.index() + .stats() + .levels + .iter() + .map(|level| level.num_commits) + .collect() +} + +#[test_case(false ; "local store")] +#[test_case(true ; "git store")] +fn test_index_commits_incremental_squashed(use_git: bool) { + let settings = testutils::user_settings(); + + let (_temp_dir, mut repo) = testutils::init_repo(&settings, use_git); + create_n_commits(&settings, &mut repo, 1); + assert_eq!(commits_by_level(&repo), vec![2, 1]); + create_n_commits(&settings, &mut repo, 1); + assert_eq!(commits_by_level(&repo), vec![4]); + + let (_temp_dir, mut repo) = testutils::init_repo(&settings, use_git); + create_n_commits(&settings, &mut repo, 2); + assert_eq!(commits_by_level(&repo), vec![4]); + + let (_temp_dir, mut repo) = testutils::init_repo(&settings, use_git); + create_n_commits(&settings, &mut repo, 100); + assert_eq!(commits_by_level(&repo), vec![102]); + + let (_temp_dir, mut repo) = testutils::init_repo(&settings, use_git); + create_n_commits(&settings, &mut repo, 2); + create_n_commits(&settings, &mut repo, 4); + create_n_commits(&settings, &mut repo, 8); + create_n_commits(&settings, &mut repo, 16); + create_n_commits(&settings, &mut repo, 32); + assert_eq!(commits_by_level(&repo), vec![64]); + + let (_temp_dir, mut repo) = testutils::init_repo(&settings, use_git); + create_n_commits(&settings, &mut repo, 32); + create_n_commits(&settings, &mut repo, 16); + create_n_commits(&settings, &mut repo, 8); + create_n_commits(&settings, &mut repo, 4); + create_n_commits(&settings, &mut repo, 2); + assert_eq!(commits_by_level(&repo), vec![34, 16, 8, 4, 2]); + + let (_temp_dir, mut repo) = testutils::init_repo(&settings, use_git); + create_n_commits(&settings, &mut repo, 10); + create_n_commits(&settings, &mut repo, 10); + create_n_commits(&settings, &mut repo, 10); + create_n_commits(&settings, &mut repo, 10); + create_n_commits(&settings, &mut repo, 10); + create_n_commits(&settings, &mut repo, 10); + create_n_commits(&settings, &mut repo, 10); + create_n_commits(&settings, &mut repo, 10); + create_n_commits(&settings, &mut repo, 10); + assert_eq!(commits_by_level(&repo), vec![72, 20]); +}