From 739a5d8617f5760e42dfc523d465b3fd3319bb1a Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Mon, 23 Sep 2024 22:04:15 +0900 Subject: [PATCH] diff: pack (text, ranges) pair in a struct I'll add a few more helper methods there. It might also make sense to cache precomputed hash values. unchanged_ranges() is made private since there are no external callers, and I'm going to add more private types. --- lib/src/diff.rs | 157 ++++++++++++++++++++++-------------------------- 1 file changed, 72 insertions(+), 85 deletions(-) diff --git a/lib/src/diff.rs b/lib/src/diff.rs index 214d9e5e5..23d7bfc11 100644 --- a/lib/src/diff.rs +++ b/lib/src/diff.rs @@ -73,27 +73,46 @@ pub fn find_nonword_ranges(text: &[u8]) -> Vec> { .collect() } +#[derive(Clone, Debug)] +struct DiffSource<'input, 'aux> { + text: &'input BStr, + ranges: &'aux [Range], +} + +impl<'input, 'aux> DiffSource<'input, 'aux> { + fn new + ?Sized>(text: &'input T, ranges: &'aux [Range]) -> Self { + DiffSource { + text: BStr::new(text), + ranges, + } + } + + fn narrowed(&self, positions: Range) -> Self { + DiffSource { + text: self.text, + ranges: &self.ranges[positions], + } + } +} + struct Histogram<'a> { - word_to_positions: HashMap<&'a [u8], Vec>, - count_to_words: BTreeMap>, + word_to_positions: HashMap<&'a BStr, Vec>, + count_to_words: BTreeMap>, } impl Histogram<'_> { - fn calculate<'a>( - text: &'a [u8], - ranges: &[Range], - max_occurrences: usize, - ) -> Histogram<'a> { - let mut word_to_positions: HashMap<&[u8], Vec> = HashMap::new(); - for (i, range) in ranges.iter().enumerate() { - let positions = word_to_positions.entry(&text[range.clone()]).or_default(); + fn calculate<'a>(source: &DiffSource<'a, '_>, max_occurrences: usize) -> Histogram<'a> { + let mut word_to_positions: HashMap<&BStr, Vec> = HashMap::new(); + for (i, range) in source.ranges.iter().enumerate() { + let word = &source.text[range.clone()]; + let positions = word_to_positions.entry(word).or_default(); // Allow one more than max_occurrences, so we can later skip those with more // than max_occurrences if positions.len() <= max_occurrences { positions.push(i); } } - let mut count_to_words: BTreeMap> = BTreeMap::new(); + let mut count_to_words: BTreeMap> = BTreeMap::new(); for (word, ranges) in &word_to_positions { count_to_words.entry(ranges.len()).or_default().push(word); } @@ -162,32 +181,27 @@ fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> { /// Finds unchanged ranges among the ones given as arguments. The data between /// those ranges is ignored. -pub(crate) fn unchanged_ranges( - left: &[u8], - right: &[u8], - left_ranges: &[Range], - right_ranges: &[Range], -) -> Vec<(Range, Range)> { - if left_ranges.is_empty() || right_ranges.is_empty() { +fn unchanged_ranges(left: &DiffSource, right: &DiffSource) -> Vec<(Range, Range)> { + if left.ranges.is_empty() || right.ranges.is_empty() { return vec![]; } // Prioritize LCS-based algorithm than leading/trailing matches - let result = unchanged_ranges_lcs(left, right, left_ranges, right_ranges); + let result = unchanged_ranges_lcs(left, right); if !result.is_empty() { return result; } // Trim leading common ranges (i.e. grow previous unchanged region) - let common_leading_len = iter::zip(left_ranges, right_ranges) - .take_while(|&(l, r)| left[l.clone()] == right[r.clone()]) + let common_leading_len = iter::zip(left.ranges, right.ranges) + .take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()]) .count(); - let (left_leading_ranges, left_ranges) = left_ranges.split_at(common_leading_len); - let (right_leading_ranges, right_ranges) = right_ranges.split_at(common_leading_len); + let (left_leading_ranges, left_ranges) = left.ranges.split_at(common_leading_len); + let (right_leading_ranges, right_ranges) = right.ranges.split_at(common_leading_len); // Trim trailing common ranges (i.e. grow next unchanged region) let common_trailing_len = iter::zip(left_ranges.iter().rev(), right_ranges.iter().rev()) - .take_while(|&(l, r)| left[l.clone()] == right[r.clone()]) + .take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()]) .count(); let left_trailing_ranges = &left_ranges[(left_ranges.len() - common_trailing_len)..]; let right_trailing_ranges = &right_ranges[(right_ranges.len() - common_trailing_len)..]; @@ -206,25 +220,23 @@ pub(crate) fn unchanged_ranges( } fn unchanged_ranges_lcs( - left: &[u8], - right: &[u8], - left_ranges: &[Range], - right_ranges: &[Range], + left: &DiffSource, + right: &DiffSource, ) -> Vec<(Range, Range)> { let max_occurrences = 100; - let left_histogram = Histogram::calculate(left, left_ranges, max_occurrences); + let left_histogram = Histogram::calculate(left, max_occurrences); if *left_histogram.count_to_words.keys().next().unwrap() > max_occurrences { // If there are very many occurrences of all words, then we just give up. return vec![]; } - let right_histogram = Histogram::calculate(right, right_ranges, max_occurrences); + let right_histogram = Histogram::calculate(right, max_occurrences); // Look for words with few occurrences in `left` (could equally well have picked // `right`?). If any of them also occur in `right`, then we add the words to // the LCS. let Some(uncommon_shared_words) = left_histogram .count_to_words .iter() - .map(|(left_count, left_words)| -> Vec<&[u8]> { + .map(|(left_count, left_words)| -> Vec<&BStr> { left_words .iter() .copied() @@ -281,30 +293,26 @@ fn unchanged_ranges_lcs( let skipped_right_positions = previous_right_position..right_position; if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() { for unchanged_nested_range in unchanged_ranges( - left, - right, - &left_ranges[skipped_left_positions.clone()], - &right_ranges[skipped_right_positions.clone()], + &left.narrowed(skipped_left_positions.clone()), + &right.narrowed(skipped_right_positions.clone()), ) { result.push(unchanged_nested_range); } } result.push(( - left_ranges[left_position].clone(), - right_ranges[right_position].clone(), + left.ranges[left_position].clone(), + right.ranges[right_position].clone(), )); previous_left_position = left_position + 1; previous_right_position = right_position + 1; } // Also recurse into range at end (after common ranges). - let skipped_left_positions = previous_left_position..left_ranges.len(); - let skipped_right_positions = previous_right_position..right_ranges.len(); + let skipped_left_positions = previous_left_position..left.ranges.len(); + let skipped_right_positions = previous_right_position..right.ranges.len(); if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() { for unchanged_nested_range in unchanged_ranges( - left, - right, - &left_ranges[skipped_left_positions], - &right_ranges[skipped_right_positions], + &left.narrowed(skipped_left_positions), + &right.narrowed(skipped_right_positions), ) { result.push(unchanged_nested_range); } @@ -447,17 +455,14 @@ impl<'input> Diff<'input> { // input as unchanged (compared to itself). Then diff each other input // against the base. Intersect the previously found ranges with the // unchanged ranges in the diff. + let base_source = DiffSource::new(base_input, base_token_ranges); let mut unchanged_regions = vec![UnchangedRange { base_range: 0..base_input.len(), offsets: vec![], }]; for (other_input, other_token_ranges) in iter::zip(&other_inputs, other_token_ranges) { - let unchanged_diff_ranges = unchanged_ranges( - base_input, - other_input, - base_token_ranges, - other_token_ranges, - ); + let other_source = DiffSource::new(other_input, other_token_ranges); + let unchanged_diff_ranges = unchanged_ranges(&base_source, &other_source); unchanged_regions = intersect_regions(unchanged_regions, &unchanged_diff_ranges); } // Add an empty range at the end to make life easier for hunks(). @@ -787,10 +792,8 @@ mod tests { fn test_unchanged_ranges_insert_in_middle() { assert_eq!( unchanged_ranges( - b"a b b c", - b"a b X b c", - &[0..1, 2..3, 4..5, 6..7], - &[0..1, 2..3, 4..5, 6..7, 8..9], + &DiffSource::new(b"a b b c", &[0..1, 2..3, 4..5, 6..7]), + &DiffSource::new(b"a b X b c", &[0..1, 2..3, 4..5, 6..7, 8..9]), ), vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)] ); @@ -802,37 +805,29 @@ mod tests { // "a"s in the second input. We no longer do. assert_eq!( unchanged_ranges( - b"a a a a", - b"a b a c", - &[0..1, 2..3, 4..5, 6..7], - &[0..1, 2..3, 4..5, 6..7], + &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + &DiffSource::new(b"a b a c", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 0..1)] ); assert_eq!( unchanged_ranges( - b"a a a a", - b"b a c a", - &[0..1, 2..3, 4..5, 6..7], - &[0..1, 2..3, 4..5, 6..7], + &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + &DiffSource::new(b"b a c a", &[0..1, 2..3, 4..5, 6..7]), ), vec![(6..7, 6..7)] ); assert_eq!( unchanged_ranges( - b"a a a a", - b"b a a c", - &[0..1, 2..3, 4..5, 6..7], - &[0..1, 2..3, 4..5, 6..7], + &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + &DiffSource::new(b"b a a c", &[0..1, 2..3, 4..5, 6..7]), ), vec![] ); assert_eq!( unchanged_ranges( - b"a a a a", - b"a b c a", - &[0..1, 2..3, 4..5, 6..7], - &[0..1, 2..3, 4..5, 6..7], + &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + &DiffSource::new(b"a b c a", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 0..1), (6..7, 6..7)] ); @@ -844,37 +839,29 @@ mod tests { // "a"s in the second input. We no longer do. assert_eq!( unchanged_ranges( - b"a b a c", - b"a a a a", - &[0..1, 2..3, 4..5, 6..7], - &[0..1, 2..3, 4..5, 6..7], + &DiffSource::new(b"a b a c", &[0..1, 2..3, 4..5, 6..7]), + &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 0..1)] ); assert_eq!( unchanged_ranges( - b"b a c a", - b"a a a a", - &[0..1, 2..3, 4..5, 6..7], - &[0..1, 2..3, 4..5, 6..7], + &DiffSource::new(b"b a c a", &[0..1, 2..3, 4..5, 6..7]), + &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), ), vec![(6..7, 6..7)] ); assert_eq!( unchanged_ranges( - b"b a a c", - b"a a a a", - &[0..1, 2..3, 4..5, 6..7], - &[0..1, 2..3, 4..5, 6..7], + &DiffSource::new(b"b a a c", &[0..1, 2..3, 4..5, 6..7]), + &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), ), vec![] ); assert_eq!( unchanged_ranges( - b"a b c a", - b"a a a a", - &[0..1, 2..3, 4..5, 6..7], - &[0..1, 2..3, 4..5, 6..7], + &DiffSource::new(b"a b c a", &[0..1, 2..3, 4..5, 6..7]), + &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 0..1), (6..7, 6..7)] );