diff: pack (text, ranges) pair in a struct

I'll add a few more helper methods there. It might also make sense to cache precomputed hash values. unchanged_ranges() is made private since there are no external callers, and I'm going to add more private types.
2025-01-20 11:25:34 +00:00 · 2024-09-23 22:04:15 +09:00 · 2024-09-23 22:04:15 +09:00 · 739a5d8617
commit 739a5d8617
parent 1b469321e2
1 changed files with 72 additions and 85 deletions
--- a/lib/src/diff.rs
+++ b/lib/src/diff.rs
@ -73,27 +73,46 @@ pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
        .collect()
 }

+#[derive(Clone, Debug)]
+struct DiffSource<'input, 'aux> {
+    text: &'input BStr,
+    ranges: &'aux [Range<usize>],
+}
+
+impl<'input, 'aux> DiffSource<'input, 'aux> {
+    fn new<T: AsRef<[u8]> + ?Sized>(text: &'input T, ranges: &'aux [Range<usize>]) -> Self {
+        DiffSource {
+            text: BStr::new(text),
+            ranges,
+        }
+    }
+
+    fn narrowed(&self, positions: Range<usize>) -> Self {
+        DiffSource {
+            text: self.text,
+            ranges: &self.ranges[positions],
+        }
+    }
+}
+
 struct Histogram<'a> {
-    word_to_positions: HashMap<&'a [u8], Vec<usize>>,
-    count_to_words: BTreeMap<usize, Vec<&'a [u8]>>,
+    word_to_positions: HashMap<&'a BStr, Vec<usize>>,
+    count_to_words: BTreeMap<usize, Vec<&'a BStr>>,
 }

 impl Histogram<'_> {
-    fn calculate<'a>(
-        text: &'a [u8],
-        ranges: &[Range<usize>],
-        max_occurrences: usize,
-    ) -> Histogram<'a> {
-        let mut word_to_positions: HashMap<&[u8], Vec<usize>> = HashMap::new();
-        for (i, range) in ranges.iter().enumerate() {
-            let positions = word_to_positions.entry(&text[range.clone()]).or_default();
+    fn calculate<'a>(source: &DiffSource<'a, '_>, max_occurrences: usize) -> Histogram<'a> {
+        let mut word_to_positions: HashMap<&BStr, Vec<usize>> = HashMap::new();
+        for (i, range) in source.ranges.iter().enumerate() {
+            let word = &source.text[range.clone()];
+            let positions = word_to_positions.entry(word).or_default();
            // Allow one more than max_occurrences, so we can later skip those with more
            // than max_occurrences
            if positions.len() <= max_occurrences {
                positions.push(i);
            }
        }
-        let mut count_to_words: BTreeMap<usize, Vec<&[u8]>> = BTreeMap::new();
+        let mut count_to_words: BTreeMap<usize, Vec<&BStr>> = BTreeMap::new();
        for (word, ranges) in &word_to_positions {
            count_to_words.entry(ranges.len()).or_default().push(word);
        }
@ -162,32 +181,27 @@ fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {

 /// Finds unchanged ranges among the ones given as arguments. The data between
 /// those ranges is ignored.
-pub(crate) fn unchanged_ranges(
-    left: &[u8],
-    right: &[u8],
-    left_ranges: &[Range<usize>],
-    right_ranges: &[Range<usize>],
-) -> Vec<(Range<usize>, Range<usize>)> {
-    if left_ranges.is_empty() || right_ranges.is_empty() {
+fn unchanged_ranges(left: &DiffSource, right: &DiffSource) -> Vec<(Range<usize>, Range<usize>)> {
+    if left.ranges.is_empty() || right.ranges.is_empty() {
        return vec![];
    }

    // Prioritize LCS-based algorithm than leading/trailing matches
-    let result = unchanged_ranges_lcs(left, right, left_ranges, right_ranges);
+    let result = unchanged_ranges_lcs(left, right);
    if !result.is_empty() {
        return result;
    }

    // Trim leading common ranges (i.e. grow previous unchanged region)
-    let common_leading_len = iter::zip(left_ranges, right_ranges)
-        .take_while(|&(l, r)| left[l.clone()] == right[r.clone()])
+    let common_leading_len = iter::zip(left.ranges, right.ranges)
+        .take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()])
        .count();
-    let (left_leading_ranges, left_ranges) = left_ranges.split_at(common_leading_len);
-    let (right_leading_ranges, right_ranges) = right_ranges.split_at(common_leading_len);
+    let (left_leading_ranges, left_ranges) = left.ranges.split_at(common_leading_len);
+    let (right_leading_ranges, right_ranges) = right.ranges.split_at(common_leading_len);

    // Trim trailing common ranges (i.e. grow next unchanged region)
    let common_trailing_len = iter::zip(left_ranges.iter().rev(), right_ranges.iter().rev())
-        .take_while(|&(l, r)| left[l.clone()] == right[r.clone()])
+        .take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()])
        .count();
    let left_trailing_ranges = &left_ranges[(left_ranges.len() - common_trailing_len)..];
    let right_trailing_ranges = &right_ranges[(right_ranges.len() - common_trailing_len)..];
@ -206,25 +220,23 @@ pub(crate) fn unchanged_ranges(
 }

 fn unchanged_ranges_lcs(
-    left: &[u8],
-    right: &[u8],
-    left_ranges: &[Range<usize>],
-    right_ranges: &[Range<usize>],
+    left: &DiffSource,
+    right: &DiffSource,
 ) -> Vec<(Range<usize>, Range<usize>)> {
    let max_occurrences = 100;
-    let left_histogram = Histogram::calculate(left, left_ranges, max_occurrences);
+    let left_histogram = Histogram::calculate(left, max_occurrences);
    if *left_histogram.count_to_words.keys().next().unwrap() > max_occurrences {
        // If there are very many occurrences of all words, then we just give up.
        return vec![];
    }
-    let right_histogram = Histogram::calculate(right, right_ranges, max_occurrences);
+    let right_histogram = Histogram::calculate(right, max_occurrences);
    // Look for words with few occurrences in `left` (could equally well have picked
    // `right`?). If any of them also occur in `right`, then we add the words to
    // the LCS.
    let Some(uncommon_shared_words) = left_histogram
        .count_to_words
        .iter()
-        .map(|(left_count, left_words)| -> Vec<&[u8]> {
+        .map(|(left_count, left_words)| -> Vec<&BStr> {
            left_words
                .iter()
                .copied()
@ -281,30 +293,26 @@ fn unchanged_ranges_lcs(
        let skipped_right_positions = previous_right_position..right_position;
        if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {
            for unchanged_nested_range in unchanged_ranges(
-                left,
-                right,
-                &left_ranges[skipped_left_positions.clone()],
-                &right_ranges[skipped_right_positions.clone()],
+                &left.narrowed(skipped_left_positions.clone()),
+                &right.narrowed(skipped_right_positions.clone()),
            ) {
                result.push(unchanged_nested_range);
            }
        }
        result.push((
-            left_ranges[left_position].clone(),
-            right_ranges[right_position].clone(),
+            left.ranges[left_position].clone(),
+            right.ranges[right_position].clone(),
        ));
        previous_left_position = left_position + 1;
        previous_right_position = right_position + 1;
    }
    // Also recurse into range at end (after common ranges).
-    let skipped_left_positions = previous_left_position..left_ranges.len();
-    let skipped_right_positions = previous_right_position..right_ranges.len();
+    let skipped_left_positions = previous_left_position..left.ranges.len();
+    let skipped_right_positions = previous_right_position..right.ranges.len();
    if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {
        for unchanged_nested_range in unchanged_ranges(
-            left,
-            right,
-            &left_ranges[skipped_left_positions],
-            &right_ranges[skipped_right_positions],
+            &left.narrowed(skipped_left_positions),
+            &right.narrowed(skipped_right_positions),
        ) {
            result.push(unchanged_nested_range);
        }
@ -447,17 +455,14 @@ impl<'input> Diff<'input> {
        // input as unchanged (compared to itself). Then diff each other input
        // against the base. Intersect the previously found ranges with the
        // unchanged ranges in the diff.
+        let base_source = DiffSource::new(base_input, base_token_ranges);
        let mut unchanged_regions = vec![UnchangedRange {
            base_range: 0..base_input.len(),
            offsets: vec![],
        }];
        for (other_input, other_token_ranges) in iter::zip(&other_inputs, other_token_ranges) {
-            let unchanged_diff_ranges = unchanged_ranges(
-                base_input,
-                other_input,
-                base_token_ranges,
-                other_token_ranges,
-            );
+            let other_source = DiffSource::new(other_input, other_token_ranges);
+            let unchanged_diff_ranges = unchanged_ranges(&base_source, &other_source);
            unchanged_regions = intersect_regions(unchanged_regions, &unchanged_diff_ranges);
        }
        // Add an empty range at the end to make life easier for hunks().
@ -787,10 +792,8 @@ mod tests {
    fn test_unchanged_ranges_insert_in_middle() {
        assert_eq!(
            unchanged_ranges(
-                b"a b b c",
-                b"a b X b c",
-                &[0..1, 2..3, 4..5, 6..7],
-                &[0..1, 2..3, 4..5, 6..7, 8..9],
+                &DiffSource::new(b"a b b c", &[0..1, 2..3, 4..5, 6..7]),
+                &DiffSource::new(b"a b X b c", &[0..1, 2..3, 4..5, 6..7, 8..9]),
            ),
            vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)]
        );
@ -802,37 +805,29 @@ mod tests {
        // "a"s in the second input. We no longer do.
        assert_eq!(
            unchanged_ranges(
-                b"a a a a",
-                b"a b a c",
-                &[0..1, 2..3, 4..5, 6..7],
-                &[0..1, 2..3, 4..5, 6..7],
+                &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
+                &DiffSource::new(b"a b a c", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 0..1)]
        );
        assert_eq!(
            unchanged_ranges(
-                b"a a a a",
-                b"b a c a",
-                &[0..1, 2..3, 4..5, 6..7],
-                &[0..1, 2..3, 4..5, 6..7],
+                &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
+                &DiffSource::new(b"b a c a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(6..7, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
-                b"a a a a",
-                b"b a a c",
-                &[0..1, 2..3, 4..5, 6..7],
-                &[0..1, 2..3, 4..5, 6..7],
+                &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
+                &DiffSource::new(b"b a a c", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![]
        );
        assert_eq!(
            unchanged_ranges(
-                b"a a a a",
-                b"a b c a",
-                &[0..1, 2..3, 4..5, 6..7],
-                &[0..1, 2..3, 4..5, 6..7],
+                &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
+                &DiffSource::new(b"a b c a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 0..1), (6..7, 6..7)]
        );
@ -844,37 +839,29 @@ mod tests {
        // "a"s in the second input. We no longer do.
        assert_eq!(
            unchanged_ranges(
-                b"a b a c",
-                b"a a a a",
-                &[0..1, 2..3, 4..5, 6..7],
-                &[0..1, 2..3, 4..5, 6..7],
+                &DiffSource::new(b"a b a c", &[0..1, 2..3, 4..5, 6..7]),
+                &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 0..1)]
        );
        assert_eq!(
            unchanged_ranges(
-                b"b a c a",
-                b"a a a a",
-                &[0..1, 2..3, 4..5, 6..7],
-                &[0..1, 2..3, 4..5, 6..7],
+                &DiffSource::new(b"b a c a", &[0..1, 2..3, 4..5, 6..7]),
+                &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(6..7, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
-                b"b a a c",
-                b"a a a a",
-                &[0..1, 2..3, 4..5, 6..7],
-                &[0..1, 2..3, 4..5, 6..7],
+                &DiffSource::new(b"b a a c", &[0..1, 2..3, 4..5, 6..7]),
+                &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![]
        );
        assert_eq!(
            unchanged_ranges(
-                b"a b c a",
-                b"a a a a",
-                &[0..1, 2..3, 4..5, 6..7],
-                &[0..1, 2..3, 4..5, 6..7],
+                &DiffSource::new(b"a b c a", &[0..1, 2..3, 4..5, 6..7]),
+                &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 0..1), (6..7, 6..7)]
        );