diff: reuse result vec when recurse into unchanged_ranges()

It's silly that we build new Vec for each recursion stack and merge elements back. I don't see a measurable performance difference in the diff bench, but this change will help simplify the next patch. If a result vec were created for each unchanged_ranges() invocation, it would probably make more sense to return a list of "local" word positions. Then, callers would have to translate the returned positions to the caller's local positions.
2025-02-08 05:30:25 +00:00 · 2024-09-24 18:01:11 +09:00 · 2024-09-24 18:01:11 +09:00 · 483db9d7d2
commit 483db9d7d2
parent f6277bbdb8
1 changed files with 36 additions and 26 deletions
--- a/lib/src/diff.rs
+++ b/lib/src/diff.rs
@ -190,15 +190,20 @@ fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
 /// Finds unchanged ranges among the ones given as arguments. The data between
 /// those ranges is ignored.
-fn unchanged_ranges(left: &DiffSource, right: &DiffSource) -> Vec<(Range<usize>, Range<usize>)> {
+fn collect_unchanged_ranges(
    found_ranges: &mut Vec<(Range<usize>, Range<usize>)>,
    left: &DiffSource,
    right: &DiffSource,
 ) {
    if left.ranges.is_empty() || right.ranges.is_empty() {
-        return vec![];
+        return;
    }
    // Prioritize LCS-based algorithm than leading/trailing matches
-    let result = unchanged_ranges_lcs(left, right);
+    let old_len = found_ranges.len();
-    if !result.is_empty() {
+    collect_unchanged_ranges_lcs(found_ranges, left, right);
-        return result;
+    if found_ranges.len() != old_len {
        return;
    }
    // Trim leading common ranges (i.e. grow previous unchanged region)
@ -215,7 +220,7 @@ fn unchanged_ranges(left: &DiffSource, right: &DiffSource) -> Vec<(Range<usize>,
    let left_trailing_ranges = &left_ranges[(left_ranges.len() - common_trailing_len)..];
    let right_trailing_ranges = &right_ranges[(right_ranges.len() - common_trailing_len)..];
-    itertools::chain(
+    found_ranges.extend(itertools::chain(
        iter::zip(
            left_leading_ranges.iter().cloned(),
            right_leading_ranges.iter().cloned(),
@ -224,20 +229,20 @@ fn unchanged_ranges(left: &DiffSource, right: &DiffSource) -> Vec<(Range<usize>,
            left_trailing_ranges.iter().cloned(),
            right_trailing_ranges.iter().cloned(),
        ),
-    )
+    ));
    .collect()
 }
-fn unchanged_ranges_lcs(
+fn collect_unchanged_ranges_lcs(
    found_ranges: &mut Vec<(Range<usize>, Range<usize>)>,
    left: &DiffSource,
    right: &DiffSource,
-) -> Vec<(Range<usize>, Range<usize>)> {
+) {
    let max_occurrences = 100;
    let left_histogram = Histogram::calculate(left, max_occurrences);
    let left_count_to_entries = left_histogram.build_count_to_entries();
    if *left_count_to_entries.keys().next().unwrap() > max_occurrences {
        // If there are very many occurrences of all words, then we just give up.
-        return vec![];
+        return;
    }
    let right_histogram = Histogram::calculate(right, max_occurrences);
    // Look for words with few occurrences in `left` (could equally well have picked
@ -256,7 +261,7 @@ fn unchanged_ranges_lcs(
            both_positions.peek().is_some().then_some(both_positions)
        })
    else {
-        return vec![];
+        return;
    };
    // [(index into ranges, serial to identify {word, occurrence #})]
@ -283,31 +288,26 @@ fn unchanged_ranges_lcs(
    // Produce output ranges, recursing into the modified areas between the elements
    // in the LCS.
    let mut result = vec![];
    let mut previous_left_position = WordPosition(0);
    let mut previous_right_position = WordPosition(0);
    for (left_index, right_index) in lcs {
        let (left_position, _) = left_positions[left_index];
        let (right_position, _) = right_positions[right_index];
-        for unchanged_nested_range in unchanged_ranges(
+        collect_unchanged_ranges(
            found_ranges,
            &left.narrowed(previous_left_position..left_position),
            &right.narrowed(previous_right_position..right_position),
-        ) {
+        );
-            result.push(unchanged_nested_range);
+        found_ranges.push((left.range_at(left_position), right.range_at(right_position)));
        }
        result.push((left.range_at(left_position), right.range_at(right_position)));
        previous_left_position = WordPosition(left_position.0 + 1);
        previous_right_position = WordPosition(right_position.0 + 1);
    }
    // Also recurse into range at end (after common ranges).
-    for unchanged_nested_range in unchanged_ranges(
+    collect_unchanged_ranges(
        found_ranges,
        &left.narrowed(previous_left_position..WordPosition(left.ranges.len())),
        &right.narrowed(previous_right_position..WordPosition(right.ranges.len())),
-    ) {
+    );
        result.push(unchanged_nested_range);
    }
    result
 }
 #[derive(Clone, PartialEq, Eq, Debug)]
@ -451,8 +451,9 @@ impl<'input> Diff<'input> {
        }];
        for (other_input, other_token_ranges) in iter::zip(&other_inputs, other_token_ranges) {
            let other_source = DiffSource::new(other_input, other_token_ranges);
-            let unchanged_diff_ranges = unchanged_ranges(&base_source, &other_source);
+            let mut new_ranges = Vec::new();
-            unchanged_regions = intersect_regions(unchanged_regions, &unchanged_diff_ranges);
+            collect_unchanged_ranges(&mut new_ranges, &base_source, &other_source);
            unchanged_regions = intersect_regions(unchanged_regions, &new_ranges);
        }
        // Add an empty range at the end to make life easier for hunks().
        let offsets = other_inputs
@ -777,6 +778,15 @@ mod tests {
        );
    }
    fn unchanged_ranges(
        left: &DiffSource,
        right: &DiffSource,
    ) -> Vec<(Range<usize>, Range<usize>)> {
        let mut found_ranges = Vec::new();
        collect_unchanged_ranges(&mut found_ranges, left, right);
        found_ranges
    }
    #[test]
    fn test_unchanged_ranges_insert_in_middle() {
        assert_eq!(