From 38e7eff09fdbab6749ce69c7cdd5fc42af74304a Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Tue, 25 Apr 2023 21:01:23 +0900 Subject: [PATCH] index: merge overlapped generation ranges to be enqueued Before, the number of the generations to track would increase at each merge point. This was really bad for queries like ':@--' in merge-heavy history, but I didn't notice the problem because ancestors query is lazy and the default log template is slow. Since I'm going to reuse RevWalk for 'roots++:' queries, which can't be lazy, I need to fix this problem first. As we don't have a revset expression to specify exact generation range, gen.end is initialized to either 1 or close to u32::MAX. So, this change means long-lived generation ranges will eventually be merged into one. --- lib/src/default_index_store.rs | 107 +++++++++++++++++++++++++++------ testing/bench-revsets-git.txt | 13 +++- 2 files changed, 99 insertions(+), 21 deletions(-) diff --git a/lib/src/default_index_store.rs b/lib/src/default_index_store.rs index 667898749..3aaf1e21b 100644 --- a/lib/src/default_index_store.rs +++ b/lib/src/default_index_store.rs @@ -13,7 +13,7 @@ // limitations under the License. use std::any::Any; -use std::cmp::{max, min, Ordering}; +use std::cmp::{max, min, Ordering, Reverse}; use std::collections::{BTreeMap, BTreeSet, BinaryHeap, Bound, HashMap, HashSet}; use std::fmt::{Debug, Formatter}; use std::fs::File; @@ -1332,7 +1332,8 @@ impl<'a> Iterator for RevWalk<'a> { #[derive(Clone)] pub struct RevWalkGenerationRange<'a> { - queue: RevWalkQueue<'a, RevWalkItemGenerationRange>, + // Sort item generations in ascending order + queue: RevWalkQueue<'a, Reverse>, generation_end: u32, } @@ -1351,7 +1352,7 @@ impl<'a> RevWalkGenerationRange<'a> { end: u32::saturating_sub(generation_range.end, generation_range.start), }; RevWalkGenerationRange { - queue: queue.map_wanted(|()| item_range), + queue: queue.map_wanted(|()| Reverse(item_range)), generation_end: generation_range.end, } } @@ -1365,7 +1366,7 @@ impl<'a> RevWalkGenerationRange<'a> { start: gen.start + 1, end: gen.end.saturating_add(1), }; - self.queue.push_wanted_parents(entry, succ_gen); + self.queue.push_wanted_parents(entry, Reverse(succ_gen)); } } @@ -1374,25 +1375,26 @@ impl<'a> Iterator for RevWalkGenerationRange<'a> { fn next(&mut self) -> Option { while let Some(item) = self.queue.pop() { - if let RevWalkWorkItemState::Wanted(mut known_gen) = item.state { - let mut some_in_range = known_gen.contains_end(self.generation_end); - self.enqueue_wanted_parents(&item.entry.0, known_gen); + if let RevWalkWorkItemState::Wanted(Reverse(mut pending_gen)) = item.state { + let mut some_in_range = pending_gen.contains_end(self.generation_end); while let Some(x) = self.queue.pop_eq(&item.entry.0) { - // For wanted item, simply track all generation chains. This can - // be optimized if the wanted range is just upper/lower bounded. - // If the range is fully bounded and if the range is wide, we - // can instead extend 'gen' to a range of the same width, and - // merge overlapping generation ranges. - match x.state { - RevWalkWorkItemState::Wanted(gen) if known_gen != gen => { - some_in_range |= gen.contains_end(self.generation_end); - self.enqueue_wanted_parents(&item.entry.0, gen); - known_gen = gen; - } - RevWalkWorkItemState::Wanted(_) => {} - RevWalkWorkItemState::Unwanted => unreachable!(), + // Merge overlapped ranges to reduce number of the queued items. + // For queries like `:(heads-)`, `gen.end` is close to `u32::MAX`, so + // ranges can be merged into one. If this is still slow, maybe we can add + // special case for upper/lower bounded ranges. + if let RevWalkWorkItemState::Wanted(Reverse(gen)) = x.state { + some_in_range |= gen.contains_end(self.generation_end); + pending_gen = if let Some(merged) = pending_gen.try_merge_end(gen) { + merged + } else { + self.enqueue_wanted_parents(&item.entry.0, pending_gen); + gen + }; + } else { + unreachable!("no more unwanted items of the same entry"); } } + self.enqueue_wanted_parents(&item.entry.0, pending_gen); if some_in_range { return Some(item.entry.0); } @@ -1421,6 +1423,15 @@ struct RevWalkItemGenerationRange { } impl RevWalkItemGenerationRange { + /// Suppose sorted ranges `self, other`, merges them if overlapped. + #[must_use] + fn try_merge_end(self, other: Self) -> Option { + (other.start <= self.end).then(|| RevWalkItemGenerationRange { + start: self.start, + end: max(self.end, other.end), + }) + } + #[must_use] fn contains_end(self, end: u32) -> bool { self.start < end && end <= self.end @@ -2701,6 +2712,62 @@ mod tests { ); } + #[test] + #[allow(clippy::redundant_clone)] // allow id_n.clone() + fn test_walk_revs_filter_by_generation_range_merging() { + let mut new_change_id = change_id_generator(); + let mut index = MutableIndexImpl::full(3, 16); + // Long linear history with some short branches + let ids = (0..11) + .map(|n| CommitId::from_hex(&format!("{n:06x}"))) + .collect_vec(); + index.add_commit_data(ids[0].clone(), new_change_id(), &[]); + for i in 1..ids.len() { + index.add_commit_data(ids[i].clone(), new_change_id(), &[ids[i - 1].clone()]); + } + let id_branch5_0 = CommitId::from_hex("050000"); + let id_branch5_1 = CommitId::from_hex("050001"); + index.add_commit_data(id_branch5_0.clone(), new_change_id(), &[ids[5].clone()]); + index.add_commit_data( + id_branch5_1.clone(), + new_change_id(), + &[id_branch5_0.clone()], + ); + + let walk_commit_ids = |wanted: &[CommitId], range: Range| { + index + .walk_revs(wanted, &[]) + .filter_by_generation(range) + .map(|entry| entry.commit_id()) + .collect_vec() + }; + + // Multiple non-overlapping generation ranges to track: + // 9->6: 3..5, 6: 0..2 + assert_eq!( + walk_commit_ids(&[&ids[9], &ids[6]].map(Clone::clone), 4..6), + [&ids[5], &ids[4], &ids[2], &ids[1]].map(Clone::clone) + ); + + // Multiple non-overlapping generation ranges to track, and merged later: + // 10->7: 3..5, 7: 0..2 + // 10->6: 4..6, 7->6, 1..3, 6: 0..2 + assert_eq!( + walk_commit_ids(&[&ids[10], &ids[7], &ids[6]].map(Clone::clone), 5..7), + [&ids[5], &ids[4], &ids[2], &ids[1], &ids[0]].map(Clone::clone) + ); + + // Merge range with sub-range (1..4 + 2..3 should be 1..4, not 1..3): + // 8,7,6->5:1..4, B5_1->5:2..3 + assert_eq!( + walk_commit_ids( + &[&ids[8], &ids[7], &ids[6], &id_branch5_1].map(Clone::clone), + 5..6 + ), + [&ids[3], &ids[2], &ids[1]].map(Clone::clone) + ); + } + #[test] fn test_heads() { let mut new_change_id = change_id_generator(); diff --git a/testing/bench-revsets-git.txt b/testing/bench-revsets-git.txt index 4f9817b90..72013f2e3 100644 --- a/testing/bench-revsets-git.txt +++ b/testing/bench-revsets-git.txt @@ -38,8 +38,19 @@ heads(author(peff)) # Roots and heads of range roots(:v2.40.0) heads(:v2.40.0) -# Parents and children of small subset +# Parents and ancestors of old commit +v1.0.0- +v1.0.0--- +:v1.0.0--- +# Parents and ancestors of recent commit +v2.40.0- +v2.40.0--- +:v2.40.0--- +# Parents and ancestors of small subset tags()- +tags()--- +:tags()--- +# Children of small subset tags()+ # Filter that doesn't read commit object merges()