From 831bbc0b1191cd2f94ad8599a219ba528631c50c Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Mon, 1 Jul 2024 22:30:11 +0900 Subject: [PATCH] diff: match up leading/trailing ranges if no match found by uncommon lcs This is adapted from Breezy/Python patiencediff. AFAICT, Git's implementation is slightly different (and maybe more efficient?), but it's not super easy to integrate with our diff logic. I'm not sure which one is better overall, but I think the result is good so long as "uncommon LCS" matching is attempted first. https://github.com/breezy-team/patiencediff/blob/a9a3e4edc34c5820e9703727ece596c7759666da/patiencediff/_patiencediff_py.py#L108 This patch prevents some weird test changes that would otherwise be introduced by the next patch. --- lib/src/diff.rs | 106 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) diff --git a/lib/src/diff.rs b/lib/src/diff.rs index 7ca31124f..ccf42577c 100644 --- a/lib/src/diff.rs +++ b/lib/src/diff.rs @@ -18,7 +18,7 @@ use std::cmp::{max, min, Ordering}; use std::collections::{BTreeMap, HashMap}; use std::fmt::{Debug, Formatter}; use std::ops::Range; -use std::slice; +use std::{iter, slice}; use itertools::Itertools; @@ -168,6 +168,56 @@ pub(crate) fn unchanged_ranges( return vec![]; } + // Prioritize LCS-based algorithm over leading/trailing matches + let result = unchanged_ranges_lcs(left, right, left_ranges, right_ranges); + if !result.is_empty() { + return result; + } + + // Trim leading common ranges (i.e.
grow previous unchanged region) + let common_leading_len = iter::zip(left_ranges, right_ranges) + .take_while(|&(l, r)| left[l.clone()] == right[r.clone()]) + .count(); + if common_leading_len > 0 { + let (left_leading_ranges, left_ranges) = left_ranges.split_at(common_leading_len); + let (right_leading_ranges, right_ranges) = right_ranges.split_at(common_leading_len); + let mut result = unchanged_ranges(left, right, left_ranges, right_ranges); + result.splice( + 0..0, + iter::zip( + left_leading_ranges.iter().cloned(), + right_leading_ranges.iter().cloned(), + ), + ); + return result; + } + + // Trim trailing common ranges (i.e. grow next unchanged region) + let common_trailing_len = iter::zip(left_ranges.iter().rev(), right_ranges.iter().rev()) + .take_while(|&(l, r)| left[l.clone()] == right[r.clone()]) + .count(); + if common_trailing_len > 0 { + let (left_ranges, left_trailing_ranges) = + left_ranges.split_at(left_ranges.len() - common_trailing_len); + let (right_ranges, right_trailing_ranges) = + right_ranges.split_at(right_ranges.len() - common_trailing_len); + let mut result = unchanged_ranges(left, right, left_ranges, right_ranges); + result.extend(iter::zip( + left_trailing_ranges.iter().cloned(), + right_trailing_ranges.iter().cloned(), + )); + return result; + } + + vec![] +} + +fn unchanged_ranges_lcs( + left: &[u8], + right: &[u8], + left_ranges: &[Range<usize>], + right_ranges: &[Range<usize>], +) -> Vec<(Range<usize>, Range<usize>)> { let max_occurrences = 100; let left_histogram = Histogram::calculate(left, left_ranges, max_occurrences); if *left_histogram.count_to_words.keys().next().unwrap() > max_occurrences { @@ -759,6 +809,33 @@ mod tests { ), vec![(0..1, 0..1), (2..3, 4..5)] ); + assert_eq!( + unchanged_ranges( + b"a a a a", + b"b a c a", + &[0..1, 2..3, 4..5, 6..7], + &[0..1, 2..3, 4..5, 6..7], + ), + vec![(0..1, 2..3), (2..3, 6..7)] + ); + assert_eq!( + unchanged_ranges( + b"a a a a", + b"b a a c", + &[0..1, 2..3, 4..5, 6..7], + &[0..1, 2..3, 4..5, 6..7], + ), + 
vec![(0..1, 2..3), (2..3, 4..5)] + ); + assert_eq!( + unchanged_ranges( + b"a a a a", + b"a b c a", + &[0..1, 2..3, 4..5, 6..7], + &[0..1, 2..3, 4..5, 6..7], + ), + vec![(0..1, 0..1), (2..3, 6..7)] + ); } #[test] @@ -772,6 +849,33 @@ mod tests { ), vec![(0..1, 0..1), (4..5, 2..3)] ); + assert_eq!( + unchanged_ranges( + b"b a c a", + b"a a a a", + &[0..1, 2..3, 4..5, 6..7], + &[0..1, 2..3, 4..5, 6..7], + ), + vec![(2..3, 0..1), (6..7, 2..3)] + ); + assert_eq!( + unchanged_ranges( + b"b a a c", + b"a a a a", + &[0..1, 2..3, 4..5, 6..7], + &[0..1, 2..3, 4..5, 6..7], + ), + vec![(2..3, 0..1), (4..5, 2..3)] + ); + assert_eq!( + unchanged_ranges( + b"a b c a", + b"a a a a", + &[0..1, 2..3, 4..5, 6..7], + &[0..1, 2..3, 4..5, 6..7], + ), + vec![(0..1, 0..1), (6..7, 2..3)] + ); } #[test]