mirror of
https://github.com/martinvonz/jj.git
synced 2025-01-18 18:27:38 +00:00
diff: switch from BTreeMap to sorted Vec for unchanged ranges
We don't really need a BTreeMap for keeping the unchanged ranges. The only place it helps a bit is when refining a diff because we may then insert some more unchanged ranges in the list. I think there has to be very many unchanged ranges for that to matter, however. This patch therefore replace the BTreeMap by a sorted Vec. `cargo bench` says that a few tests got ~20% faster. I'm looking into this code now because I'm thinking of copying some of it for the "partial conflict resolution" tool I'm working on for Mercurial.
This commit is contained in:
parent
934564bf8d
commit
fc1731a180
1 changed files with 153 additions and 81 deletions
234
lib/src/diff.rs
234
lib/src/diff.rs
|
@ -13,9 +13,10 @@
|
|||
// limitations under the License.
|
||||
|
||||
use std::cmp::{max, min, Ordering};
|
||||
use std::collections::{btree_map, BTreeMap, HashMap};
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::ops::Range;
|
||||
use std::slice;
|
||||
|
||||
use itertools::Itertools;
|
||||
|
||||
|
@ -290,20 +291,26 @@ pub(crate) fn unchanged_ranges(
|
|||
result
|
||||
}
|
||||
|
||||
/// Wrapper around Range<usize> to provide Ord. We only order by the range's
|
||||
/// start because we make sure to never have overlapping ranges.
|
||||
#[derive(Clone, PartialEq, Eq, Debug)]
|
||||
struct BaseRange(Range<usize>);
|
||||
struct UnchangedRange {
|
||||
base_range: Range<usize>,
|
||||
offsets: Vec<isize>,
|
||||
}
|
||||
|
||||
impl PartialOrd for BaseRange {
|
||||
/// We only order by the base range's start because we make sure to never have
|
||||
/// overlapping ranges.
|
||||
impl PartialOrd for UnchangedRange {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for BaseRange {
|
||||
impl Ord for UnchangedRange {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.0.start.cmp(&other.0.start).then_with(|| self.0.end.cmp(&other.0.end))
|
||||
self.base_range
|
||||
.start
|
||||
.cmp(&other.base_range.start)
|
||||
.then_with(|| self.base_range.end.cmp(&other.base_range.end))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -316,21 +323,25 @@ pub struct Diff<'input> {
|
|||
// The key is a range in the base input. The value is the start of each non-base region
|
||||
// relative to the base region's start. By making them relative, they don't need to change
|
||||
// when the base range changes.
|
||||
unchanged_regions: BTreeMap<BaseRange, Vec<isize>>,
|
||||
unchanged_regions: Vec<UnchangedRange>,
|
||||
}
|
||||
|
||||
/// Takes the current regions and intersects it with the new unchanged ranges
|
||||
/// from a 2-way diff. The result is a map of unchanged regions with one more
|
||||
/// offset in the map's values.
|
||||
fn intersect_regions(
|
||||
current_ranges: BTreeMap<BaseRange, Vec<isize>>,
|
||||
current_ranges: Vec<UnchangedRange>,
|
||||
new_unchanged_ranges: &[(Range<usize>, Range<usize>)],
|
||||
) -> BTreeMap<BaseRange, Vec<isize>> {
|
||||
let mut result = BTreeMap::new();
|
||||
) -> Vec<UnchangedRange> {
|
||||
let mut result = vec![];
|
||||
let mut current_ranges_iter = current_ranges.into_iter().peekable();
|
||||
for (new_base_range, other_range) in new_unchanged_ranges.iter() {
|
||||
assert_eq!(new_base_range.len(), other_range.len());
|
||||
while let Some((BaseRange(base_range), offsets)) = current_ranges_iter.peek() {
|
||||
while let Some(UnchangedRange {
|
||||
base_range,
|
||||
offsets,
|
||||
}) = current_ranges_iter.peek()
|
||||
{
|
||||
// No need to look further if we're past the new range.
|
||||
if base_range.start >= new_base_range.end {
|
||||
break;
|
||||
|
@ -345,7 +356,10 @@ fn intersect_regions(
|
|||
let new_end = min(base_range.end, new_base_range.end);
|
||||
let mut new_offsets = offsets.clone();
|
||||
new_offsets.push(other_range.start.wrapping_sub(new_base_range.start) as isize);
|
||||
result.insert(BaseRange(new_start..new_end), new_offsets);
|
||||
result.push(UnchangedRange {
|
||||
base_range: new_start..new_end,
|
||||
offsets: new_offsets,
|
||||
});
|
||||
if base_range.end >= new_base_range.end {
|
||||
// Break without consuming the item; there may be other new ranges that overlap
|
||||
// with it.
|
||||
|
@ -376,8 +390,10 @@ impl<'input> Diff<'input> {
|
|||
// input as unchanged (compared to itself). Then diff each other input
|
||||
// against the base. Intersect the previously found ranges with the
|
||||
// unchanged ranges in the diff.
|
||||
let mut unchanged_regions = BTreeMap::new();
|
||||
unchanged_regions.insert(BaseRange(0..base_input.len()), vec![]);
|
||||
let mut unchanged_regions = vec![UnchangedRange {
|
||||
base_range: 0..base_input.len(),
|
||||
offsets: vec![],
|
||||
}];
|
||||
for (i, other_token_ranges) in other_token_ranges.iter().enumerate() {
|
||||
let unchanged_diff_ranges = unchanged_ranges(
|
||||
base_input,
|
||||
|
@ -392,7 +408,10 @@ impl<'input> Diff<'input> {
|
|||
.iter()
|
||||
.map(|input| input.len().wrapping_sub(base_input.len()) as isize)
|
||||
.collect_vec();
|
||||
unchanged_regions.insert(BaseRange(base_input.len()..base_input.len()), offsets);
|
||||
unchanged_regions.push(UnchangedRange {
|
||||
base_range: base_input.len()..base_input.len(),
|
||||
offsets,
|
||||
});
|
||||
|
||||
let mut diff = Self {
|
||||
base_input,
|
||||
|
@ -424,8 +443,10 @@ impl<'input> Diff<'input> {
|
|||
let previous_offsets = vec![0; self.other_inputs.len()];
|
||||
DiffHunkIterator {
|
||||
diff: self,
|
||||
previous_base_range: 0..0,
|
||||
previous_offsets,
|
||||
previous: UnchangedRange {
|
||||
base_range: 0..0,
|
||||
offsets: previous_offsets,
|
||||
},
|
||||
unchanged_emitted: true,
|
||||
unchanged_iter: self.unchanged_regions.iter(),
|
||||
}
|
||||
|
@ -434,55 +455,77 @@ impl<'input> Diff<'input> {
|
|||
/// Uses the given tokenizer to split the changed regions into smaller
|
||||
/// regions. Then tries to finds unchanged regions among them.
|
||||
pub fn refine_changed_regions(&mut self, tokenizer: &impl Fn(&[u8]) -> Vec<Range<usize>>) {
|
||||
let mut previous_base_end = 0;
|
||||
let mut previous_offsets = vec![0; self.other_inputs.len()];
|
||||
let mut new_unchanged_ranges = BTreeMap::new();
|
||||
for (BaseRange(base_range), offsets) in self.unchanged_regions.iter() {
|
||||
let mut previous = UnchangedRange {
|
||||
base_range: 0..0,
|
||||
offsets: vec![0; self.other_inputs.len()],
|
||||
};
|
||||
let mut new_unchanged_ranges = vec![];
|
||||
for current in self.unchanged_regions.iter() {
|
||||
// For the changed region between the previous region and the current one,
|
||||
// create a new Diff instance. Then adjust the start positions and
|
||||
// offsets to be valid in the context of the larger Diff instance
|
||||
// (`self`).
|
||||
let mut slices = vec![&self.base_input[previous_base_end..base_range.start]];
|
||||
for (i, offset) in offsets.iter().enumerate() {
|
||||
let changed_range = previous_base_end.wrapping_add(previous_offsets[i] as usize)
|
||||
..base_range.start.wrapping_add(*offset as usize);
|
||||
let mut slices =
|
||||
vec![&self.base_input[previous.base_range.end..current.base_range.start]];
|
||||
for (i, offset) in current.offsets.iter().enumerate() {
|
||||
let changed_range = previous
|
||||
.base_range
|
||||
.end
|
||||
.wrapping_add(previous.offsets[i] as usize)
|
||||
..current.base_range.start.wrapping_add(*offset as usize);
|
||||
slices.push(&self.other_inputs[i][changed_range]);
|
||||
}
|
||||
|
||||
let refined_diff = Diff::for_tokenizer(&slices, tokenizer);
|
||||
|
||||
for (BaseRange(base_range), offsets) in refined_diff.unchanged_regions {
|
||||
let new_base_start = base_range.start + previous_base_end;
|
||||
let new_base_end = base_range.end + previous_base_end;
|
||||
for UnchangedRange {
|
||||
base_range,
|
||||
offsets,
|
||||
} in refined_diff.unchanged_regions
|
||||
{
|
||||
let new_base_start = base_range.start + previous.base_range.end;
|
||||
let new_base_end = base_range.end + previous.base_range.end;
|
||||
let offsets = offsets
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(i, offset)| offset + previous_offsets[i])
|
||||
.map(|(i, offset)| offset + previous.offsets[i])
|
||||
.collect_vec();
|
||||
new_unchanged_ranges.insert(BaseRange(new_base_start..new_base_end), offsets);
|
||||
new_unchanged_ranges.push(UnchangedRange {
|
||||
base_range: new_base_start..new_base_end,
|
||||
offsets,
|
||||
});
|
||||
}
|
||||
previous_base_end = base_range.end;
|
||||
previous_offsets = offsets.clone();
|
||||
previous = current.clone();
|
||||
}
|
||||
self.unchanged_regions.extend(new_unchanged_ranges);
|
||||
self.unchanged_regions = self
|
||||
.unchanged_regions
|
||||
.iter()
|
||||
.cloned()
|
||||
.merge(new_unchanged_ranges)
|
||||
.collect_vec();
|
||||
self.compact_unchanged_regions();
|
||||
}
|
||||
|
||||
fn compact_unchanged_regions(&mut self) {
|
||||
let mut compacted = BTreeMap::new();
|
||||
let mut previous: Option<(Range<usize>, Vec<isize>)> = None;
|
||||
for (BaseRange(base_range), value) in self.unchanged_regions.iter() {
|
||||
if let Some((prevous_base_range, previous_value)) = previous {
|
||||
if prevous_base_range.end == base_range.start && previous_value == *value {
|
||||
previous = Some((prevous_base_range.start..base_range.end, value.clone()));
|
||||
let mut compacted = vec![];
|
||||
let mut maybe_previous: Option<UnchangedRange> = None;
|
||||
for current in self.unchanged_regions.iter() {
|
||||
if let Some(previous) = maybe_previous {
|
||||
if previous.base_range.end == current.base_range.start
|
||||
&& previous.offsets == *current.offsets
|
||||
{
|
||||
maybe_previous = Some(UnchangedRange {
|
||||
base_range: previous.base_range.start..current.base_range.end,
|
||||
offsets: current.offsets.clone(),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
compacted.insert(BaseRange(prevous_base_range), previous_value);
|
||||
compacted.push(previous);
|
||||
}
|
||||
previous = Some((base_range.clone(), value.clone()));
|
||||
maybe_previous = Some(current.clone());
|
||||
}
|
||||
if let Some((prevous_base_range, previous_value)) = previous {
|
||||
compacted.insert(BaseRange(prevous_base_range), previous_value);
|
||||
if let Some(previous) = maybe_previous {
|
||||
compacted.push(previous);
|
||||
}
|
||||
self.unchanged_regions = compacted;
|
||||
}
|
||||
|
@ -516,10 +559,9 @@ impl Debug for DiffHunk<'_> {
|
|||
|
||||
pub struct DiffHunkIterator<'diff, 'input> {
|
||||
diff: &'diff Diff<'input>,
|
||||
previous_base_range: Range<usize>,
|
||||
previous_offsets: Vec<isize>,
|
||||
previous: UnchangedRange,
|
||||
unchanged_emitted: bool,
|
||||
unchanged_iter: btree_map::Iter<'diff, BaseRange, Vec<isize>>,
|
||||
unchanged_iter: slice::Iter<'diff, UnchangedRange>,
|
||||
}
|
||||
|
||||
impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> {
|
||||
|
@ -529,25 +571,29 @@ impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> {
|
|||
loop {
|
||||
if !self.unchanged_emitted {
|
||||
self.unchanged_emitted = true;
|
||||
if !self.previous_base_range.is_empty() {
|
||||
if !self.previous.base_range.is_empty() {
|
||||
return Some(DiffHunk::Matching(
|
||||
&self.diff.base_input[self.previous_base_range.clone()],
|
||||
&self.diff.base_input[self.previous.base_range.clone()],
|
||||
));
|
||||
}
|
||||
}
|
||||
if let Some((BaseRange(base_range), offsets)) = self.unchanged_iter.next() {
|
||||
let mut slices =
|
||||
vec![&self.diff.base_input[self.previous_base_range.end..base_range.start]];
|
||||
if let Some(current) = self.unchanged_iter.next() {
|
||||
let mut slices = vec![
|
||||
&self.diff.base_input[self.previous.base_range.end..current.base_range.start],
|
||||
];
|
||||
for (i, input) in self.diff.other_inputs.iter().enumerate() {
|
||||
let start = self
|
||||
.previous_base_range
|
||||
.previous
|
||||
.base_range
|
||||
.end
|
||||
.wrapping_add(self.previous_offsets[i] as usize);
|
||||
let end = base_range.start.wrapping_add(offsets[i] as usize);
|
||||
.wrapping_add(self.previous.offsets[i] as usize);
|
||||
let end = current
|
||||
.base_range
|
||||
.start
|
||||
.wrapping_add(current.offsets[i] as usize);
|
||||
slices.push(&input[start..end]);
|
||||
}
|
||||
self.previous_base_range = base_range.clone();
|
||||
self.previous_offsets = offsets.clone();
|
||||
self.previous = current.clone();
|
||||
self.unchanged_emitted = false;
|
||||
if slices.iter().any(|slice| !slice.is_empty()) {
|
||||
return Some(DiffHunk::Different(slices));
|
||||
|
@ -732,54 +778,80 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_intersect_regions_existing_empty() {
|
||||
let actual = intersect_regions(btreemap! {}, &[(20..25, 55..60)]);
|
||||
let expected = btreemap! {};
|
||||
let actual = intersect_regions(vec![], &[(20..25, 55..60)]);
|
||||
let expected = vec![];
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intersect_regions_new_ranges_within_existing() {
|
||||
let actual = intersect_regions(
|
||||
btreemap! {
|
||||
BaseRange(20..70) => vec![3],
|
||||
},
|
||||
vec![UnchangedRange {
|
||||
base_range: 20..70,
|
||||
offsets: vec![3],
|
||||
}],
|
||||
&[(25..30, 35..40), (40..50, 40..50)],
|
||||
);
|
||||
let expected = btreemap! {
|
||||
BaseRange(25..30) => vec![3, 10],
|
||||
BaseRange(40..50) => vec![3, 0],
|
||||
};
|
||||
let expected = vec![
|
||||
UnchangedRange {
|
||||
base_range: 25..30,
|
||||
offsets: vec![3, 10],
|
||||
},
|
||||
UnchangedRange {
|
||||
base_range: 40..50,
|
||||
offsets: vec![3, 0],
|
||||
},
|
||||
];
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intersect_regions_partial_overlap() {
|
||||
let actual = intersect_regions(
|
||||
btreemap! {
|
||||
BaseRange(20..50) => vec![-3],
|
||||
},
|
||||
vec![UnchangedRange {
|
||||
base_range: 20..50,
|
||||
offsets: vec![-3],
|
||||
}],
|
||||
&[(15..25, 5..15), (45..60, 55..70)],
|
||||
);
|
||||
let expected = btreemap! {
|
||||
BaseRange(20..25) => vec![-3, -10],
|
||||
BaseRange(45..50) => vec![-3, 10],
|
||||
};
|
||||
let expected = vec![
|
||||
UnchangedRange {
|
||||
base_range: 20..25,
|
||||
offsets: vec![-3, -10],
|
||||
},
|
||||
UnchangedRange {
|
||||
base_range: 45..50,
|
||||
offsets: vec![-3, 10],
|
||||
},
|
||||
];
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intersect_regions_new_range_overlaps_multiple_existing() {
|
||||
let actual = intersect_regions(
|
||||
btreemap! {
|
||||
BaseRange(20..50) => vec![3, -8],
|
||||
BaseRange(70..80) => vec![7, 1],
|
||||
},
|
||||
vec![
|
||||
UnchangedRange {
|
||||
base_range: 20..50,
|
||||
offsets: vec![3, -8],
|
||||
},
|
||||
UnchangedRange {
|
||||
base_range: 70..80,
|
||||
offsets: vec![7, 1],
|
||||
},
|
||||
],
|
||||
&[(10..100, 5..95)],
|
||||
);
|
||||
let expected = btreemap! {
|
||||
BaseRange(20..50) => vec![3, -8, -5],
|
||||
BaseRange(70..80) => vec![7, 1, -5],
|
||||
};
|
||||
let expected = vec![
|
||||
UnchangedRange {
|
||||
base_range: 20..50,
|
||||
offsets: vec![3, -8, -5],
|
||||
},
|
||||
UnchangedRange {
|
||||
base_range: 70..80,
|
||||
offsets: vec![7, 1, -5],
|
||||
},
|
||||
];
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue