From fc1731a18002bae02770fcc801fefc1d401d90f5 Mon Sep 17 00:00:00 2001 From: Martin von Zweigbergk Date: Mon, 7 Mar 2022 22:17:16 -0800 Subject: [PATCH] diff: switch from BTreeMap to sorted Vec for unchanged ranges We don't really need a BTreeMap for keeping the unchanged ranges. The only place it helps a bit is when refining a diff because we may then insert some more unchanged ranges in the list. I think there has to be very many unchanged ranges for that to matter, however. This patch therefore replace the BTreeMap by a sorted Vec. `cargo bench` says that a few tests got ~20% faster. I'm looking into this code now because I'm thinking of copying some of it for the "partial conflict resolution" tool I'm working on for Mercurial. --- lib/src/diff.rs | 234 +++++++++++++++++++++++++++++++----------------- 1 file changed, 153 insertions(+), 81 deletions(-) diff --git a/lib/src/diff.rs b/lib/src/diff.rs index 704a3ab7b..1d72ad099 100644 --- a/lib/src/diff.rs +++ b/lib/src/diff.rs @@ -13,9 +13,10 @@ // limitations under the License. use std::cmp::{max, min, Ordering}; -use std::collections::{btree_map, BTreeMap, HashMap}; +use std::collections::{BTreeMap, HashMap}; use std::fmt::{Debug, Formatter}; use std::ops::Range; +use std::slice; use itertools::Itertools; @@ -290,20 +291,26 @@ pub(crate) fn unchanged_ranges( result } -/// Wrapper around Range to provide Ord. We only order by the range's -/// start because we make sure to never have overlapping ranges. #[derive(Clone, PartialEq, Eq, Debug)] -struct BaseRange(Range); +struct UnchangedRange { + base_range: Range, + offsets: Vec, +} -impl PartialOrd for BaseRange { +/// We only order by the base range's start because we make sure to never have +/// overlapping ranges. +impl PartialOrd for UnchangedRange { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for BaseRange { +impl Ord for UnchangedRange { fn cmp(&self, other: &Self) -> Ordering { - self.0.start.cmp(&other.0.start).then_with(|| self.0.end.cmp(&other.0.end)) + self.base_range + .start + .cmp(&other.base_range.start) + .then_with(|| self.base_range.end.cmp(&other.base_range.end)) } } @@ -316,21 +323,25 @@ pub struct Diff<'input> { // The key is a range in the base input. The value is the start of each non-base region // relative to the base region's start. By making them relative, they don't need to change // when the base range changes. - unchanged_regions: BTreeMap>, + unchanged_regions: Vec, } /// Takes the current regions and intersects it with the new unchanged ranges /// from a 2-way diff. The result is a map of unchanged regions with one more /// offset in the map's values. fn intersect_regions( - current_ranges: BTreeMap>, + current_ranges: Vec, new_unchanged_ranges: &[(Range, Range)], -) -> BTreeMap> { - let mut result = BTreeMap::new(); +) -> Vec { + let mut result = vec![]; let mut current_ranges_iter = current_ranges.into_iter().peekable(); for (new_base_range, other_range) in new_unchanged_ranges.iter() { assert_eq!(new_base_range.len(), other_range.len()); - while let Some((BaseRange(base_range), offsets)) = current_ranges_iter.peek() { + while let Some(UnchangedRange { + base_range, + offsets, + }) = current_ranges_iter.peek() + { // No need to look further if we're past the new range. if base_range.start >= new_base_range.end { break; @@ -345,7 +356,10 @@ fn intersect_regions( let new_end = min(base_range.end, new_base_range.end); let mut new_offsets = offsets.clone(); new_offsets.push(other_range.start.wrapping_sub(new_base_range.start) as isize); - result.insert(BaseRange(new_start..new_end), new_offsets); + result.push(UnchangedRange { + base_range: new_start..new_end, + offsets: new_offsets, + }); if base_range.end >= new_base_range.end { // Break without consuming the item; there may be other new ranges that overlap // with it. @@ -376,8 +390,10 @@ impl<'input> Diff<'input> { // input as unchanged (compared to itself). Then diff each other input // against the base. Intersect the previously found ranges with the // unchanged ranges in the diff. - let mut unchanged_regions = BTreeMap::new(); - unchanged_regions.insert(BaseRange(0..base_input.len()), vec![]); + let mut unchanged_regions = vec![UnchangedRange { + base_range: 0..base_input.len(), + offsets: vec![], + }]; for (i, other_token_ranges) in other_token_ranges.iter().enumerate() { let unchanged_diff_ranges = unchanged_ranges( base_input, @@ -392,7 +408,10 @@ impl<'input> Diff<'input> { .iter() .map(|input| input.len().wrapping_sub(base_input.len()) as isize) .collect_vec(); - unchanged_regions.insert(BaseRange(base_input.len()..base_input.len()), offsets); + unchanged_regions.push(UnchangedRange { + base_range: base_input.len()..base_input.len(), + offsets, + }); let mut diff = Self { base_input, @@ -424,8 +443,10 @@ impl<'input> Diff<'input> { let previous_offsets = vec![0; self.other_inputs.len()]; DiffHunkIterator { diff: self, - previous_base_range: 0..0, - previous_offsets, + previous: UnchangedRange { + base_range: 0..0, + offsets: previous_offsets, + }, unchanged_emitted: true, unchanged_iter: self.unchanged_regions.iter(), } @@ -434,55 +455,77 @@ impl<'input> Diff<'input> { /// Uses the given tokenizer to split the changed regions into smaller /// regions. Then tries to finds unchanged regions among them. pub fn refine_changed_regions(&mut self, tokenizer: &impl Fn(&[u8]) -> Vec>) { - let mut previous_base_end = 0; - let mut previous_offsets = vec![0; self.other_inputs.len()]; - let mut new_unchanged_ranges = BTreeMap::new(); - for (BaseRange(base_range), offsets) in self.unchanged_regions.iter() { + let mut previous = UnchangedRange { + base_range: 0..0, + offsets: vec![0; self.other_inputs.len()], + }; + let mut new_unchanged_ranges = vec![]; + for current in self.unchanged_regions.iter() { // For the changed region between the previous region and the current one, // create a new Diff instance. Then adjust the start positions and // offsets to be valid in the context of the larger Diff instance // (`self`). - let mut slices = vec![&self.base_input[previous_base_end..base_range.start]]; - for (i, offset) in offsets.iter().enumerate() { - let changed_range = previous_base_end.wrapping_add(previous_offsets[i] as usize) - ..base_range.start.wrapping_add(*offset as usize); + let mut slices = + vec![&self.base_input[previous.base_range.end..current.base_range.start]]; + for (i, offset) in current.offsets.iter().enumerate() { + let changed_range = previous + .base_range + .end + .wrapping_add(previous.offsets[i] as usize) + ..current.base_range.start.wrapping_add(*offset as usize); slices.push(&self.other_inputs[i][changed_range]); } let refined_diff = Diff::for_tokenizer(&slices, tokenizer); - for (BaseRange(base_range), offsets) in refined_diff.unchanged_regions { - let new_base_start = base_range.start + previous_base_end; - let new_base_end = base_range.end + previous_base_end; + for UnchangedRange { + base_range, + offsets, + } in refined_diff.unchanged_regions + { + let new_base_start = base_range.start + previous.base_range.end; + let new_base_end = base_range.end + previous.base_range.end; let offsets = offsets .into_iter() .enumerate() - .map(|(i, offset)| offset + previous_offsets[i]) + .map(|(i, offset)| offset + previous.offsets[i]) .collect_vec(); - new_unchanged_ranges.insert(BaseRange(new_base_start..new_base_end), offsets); + new_unchanged_ranges.push(UnchangedRange { + base_range: new_base_start..new_base_end, + offsets, + }); } - previous_base_end = base_range.end; - previous_offsets = offsets.clone(); + previous = current.clone(); } - self.unchanged_regions.extend(new_unchanged_ranges); + self.unchanged_regions = self + .unchanged_regions + .iter() + .cloned() + .merge(new_unchanged_ranges) + .collect_vec(); self.compact_unchanged_regions(); } fn compact_unchanged_regions(&mut self) { - let mut compacted = BTreeMap::new(); - let mut previous: Option<(Range, Vec)> = None; - for (BaseRange(base_range), value) in self.unchanged_regions.iter() { - if let Some((prevous_base_range, previous_value)) = previous { - if prevous_base_range.end == base_range.start && previous_value == *value { - previous = Some((prevous_base_range.start..base_range.end, value.clone())); + let mut compacted = vec![]; + let mut maybe_previous: Option = None; + for current in self.unchanged_regions.iter() { + if let Some(previous) = maybe_previous { + if previous.base_range.end == current.base_range.start + && previous.offsets == *current.offsets + { + maybe_previous = Some(UnchangedRange { + base_range: previous.base_range.start..current.base_range.end, + offsets: current.offsets.clone(), + }); continue; } - compacted.insert(BaseRange(prevous_base_range), previous_value); + compacted.push(previous); } - previous = Some((base_range.clone(), value.clone())); + maybe_previous = Some(current.clone()); } - if let Some((prevous_base_range, previous_value)) = previous { - compacted.insert(BaseRange(prevous_base_range), previous_value); + if let Some(previous) = maybe_previous { + compacted.push(previous); } self.unchanged_regions = compacted; } @@ -516,10 +559,9 @@ impl Debug for DiffHunk<'_> { pub struct DiffHunkIterator<'diff, 'input> { diff: &'diff Diff<'input>, - previous_base_range: Range, - previous_offsets: Vec, + previous: UnchangedRange, unchanged_emitted: bool, - unchanged_iter: btree_map::Iter<'diff, BaseRange, Vec>, + unchanged_iter: slice::Iter<'diff, UnchangedRange>, } impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> { @@ -529,25 +571,29 @@ impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> { loop { if !self.unchanged_emitted { self.unchanged_emitted = true; - if !self.previous_base_range.is_empty() { + if !self.previous.base_range.is_empty() { return Some(DiffHunk::Matching( - &self.diff.base_input[self.previous_base_range.clone()], + &self.diff.base_input[self.previous.base_range.clone()], )); } } - if let Some((BaseRange(base_range), offsets)) = self.unchanged_iter.next() { - let mut slices = - vec![&self.diff.base_input[self.previous_base_range.end..base_range.start]]; + if let Some(current) = self.unchanged_iter.next() { + let mut slices = vec![ + &self.diff.base_input[self.previous.base_range.end..current.base_range.start], + ]; for (i, input) in self.diff.other_inputs.iter().enumerate() { let start = self - .previous_base_range + .previous + .base_range .end - .wrapping_add(self.previous_offsets[i] as usize); - let end = base_range.start.wrapping_add(offsets[i] as usize); + .wrapping_add(self.previous.offsets[i] as usize); + let end = current + .base_range + .start + .wrapping_add(current.offsets[i] as usize); slices.push(&input[start..end]); } - self.previous_base_range = base_range.clone(); - self.previous_offsets = offsets.clone(); + self.previous = current.clone(); self.unchanged_emitted = false; if slices.iter().any(|slice| !slice.is_empty()) { return Some(DiffHunk::Different(slices)); @@ -732,54 +778,80 @@ mod tests { #[test] fn test_intersect_regions_existing_empty() { - let actual = intersect_regions(btreemap! {}, &[(20..25, 55..60)]); - let expected = btreemap! {}; + let actual = intersect_regions(vec![], &[(20..25, 55..60)]); + let expected = vec![]; assert_eq!(actual, expected); } #[test] fn test_intersect_regions_new_ranges_within_existing() { let actual = intersect_regions( - btreemap! { - BaseRange(20..70) => vec![3], - }, + vec![UnchangedRange { + base_range: 20..70, + offsets: vec![3], + }], &[(25..30, 35..40), (40..50, 40..50)], ); - let expected = btreemap! { - BaseRange(25..30) => vec![3, 10], - BaseRange(40..50) => vec![3, 0], - }; + let expected = vec![ + UnchangedRange { + base_range: 25..30, + offsets: vec![3, 10], + }, + UnchangedRange { + base_range: 40..50, + offsets: vec![3, 0], + }, + ]; assert_eq!(actual, expected); } #[test] fn test_intersect_regions_partial_overlap() { let actual = intersect_regions( - btreemap! { - BaseRange(20..50) => vec![-3], - }, + vec![UnchangedRange { + base_range: 20..50, + offsets: vec![-3], + }], &[(15..25, 5..15), (45..60, 55..70)], ); - let expected = btreemap! { - BaseRange(20..25) => vec![-3, -10], - BaseRange(45..50) => vec![-3, 10], - }; + let expected = vec![ + UnchangedRange { + base_range: 20..25, + offsets: vec![-3, -10], + }, + UnchangedRange { + base_range: 45..50, + offsets: vec![-3, 10], + }, + ]; assert_eq!(actual, expected); } #[test] fn test_intersect_regions_new_range_overlaps_multiple_existing() { let actual = intersect_regions( - btreemap! { - BaseRange(20..50) => vec![3, -8], - BaseRange(70..80) => vec![7, 1], - }, + vec![ + UnchangedRange { + base_range: 20..50, + offsets: vec![3, -8], + }, + UnchangedRange { + base_range: 70..80, + offsets: vec![7, 1], + }, + ], &[(10..100, 5..95)], ); - let expected = btreemap! { - BaseRange(20..50) => vec![3, -8, -5], - BaseRange(70..80) => vec![7, 1, -5], - }; + let expected = vec![ + UnchangedRange { + base_range: 20..50, + offsets: vec![3, -8, -5], + }, + UnchangedRange { + base_range: 70..80, + offsets: vec![7, 1, -5], + }, + ]; assert_eq!(actual, expected); }