// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::cmp::min;
use std::collections::{BTreeMap, HashMap};
use std::fmt::{Debug, Formatter};
use std::ops::Range;

/// Words occurring more often than this in the left input are never used as
/// anchors when looking for unchanged ranges.
const MAX_OCCURRENCES: usize = 100;

/// Splits `text` into line ranges. Each range includes its terminating `\n`;
/// a final line without a newline is returned as well. Empty input yields no
/// ranges.
pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
    let mut start = 0;
    text.split_inclusive(|b| *b == b'\n')
        .map(|line| {
            let range = start..start + line.len();
            start = range.end;
            range
        })
        .collect()
}

/// Returns whether `b` is considered part of a word (ASCII alphanumerics and
/// underscore).
fn is_word_byte(b: u8) -> bool {
    // TODO: Make this configurable (probably higher up in the call stack)
    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_')
}

/// Returns the ranges of the maximal runs of word bytes in `text`, in order.
pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
    let mut word_ranges = vec![];
    // Start offset of the word currently being scanned, if any.
    let mut word_start = None;
    for (i, &b) in text.iter().enumerate() {
        match (word_start, is_word_byte(b)) {
            (None, true) => word_start = Some(i),
            (Some(start), false) => {
                word_ranges.push(start..i);
                word_start = None;
            }
            _ => {}
        }
    }
    if let Some(start) = word_start {
        word_ranges.push(start..text.len());
    }
    word_ranges
}

/// Returns a one-byte range for every non-word byte in `text`, in order.
pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
    text.iter()
        .enumerate()
        .filter(|&(_, &b)| !is_word_byte(b))
        .map(|(i, _)| i..i + 1)
        .collect()
}

/// Occurrence statistics for the tokens ("words") of one side of a diff.
struct Histogram<'a> {
    /// For each word, the indexes (into the token range list) where it
    /// occurs. At most `max_occurrences + 1` indexes are recorded per word.
    word_to_positions: HashMap<&'a [u8], Vec<usize>>,
    /// The words grouped by their (capped) number of occurrences.
    count_to_words: BTreeMap<usize, Vec<&'a [u8]>>,
}

impl<'a> Histogram<'a> {
    /// Builds the histogram of the tokens `ranges` of `text`.
    fn calculate(text: &'a [u8], ranges: &[Range<usize>], max_occurrences: usize) -> Self {
        let mut word_to_positions: HashMap<&'a [u8], Vec<usize>> = HashMap::new();
        for (i, range) in ranges.iter().enumerate() {
            let positions = word_to_positions.entry(&text[range.clone()]).or_default();
            // Allow one more than max_occurrences, so we can later skip those with more
            // than max_occurrences
            if positions.len() <= max_occurrences {
                positions.push(i);
            }
        }
        let mut count_to_words: BTreeMap<usize, Vec<&'a [u8]>> = BTreeMap::new();
        for (word, positions) in &word_to_positions {
            count_to_words
                .entry(positions.len())
                .or_default()
                .push(*word);
        }
        Histogram {
            word_to_positions,
            count_to_words,
        }
    }
}

/// A diff hunk expressed as byte ranges into the left and right inputs.
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
enum RangeDiff {
    Unchanged(Range<usize>, Range<usize>),
    Replaced(Range<usize>, Range<usize>),
}

impl RangeDiff {
    /// Returns true if both the left and the right range are empty.
    fn is_empty(&self) -> bool {
        match self {
            RangeDiff::Unchanged(left_range, right_range)
            | RangeDiff::Replaced(left_range, right_range) => {
                left_range.is_empty() && right_range.is_empty()
            }
        }
    }
}

/// A diff hunk expressed as slices of the left and right inputs.
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum SliceDiff<'a> {
    Unchanged(&'a [u8]),
    Replaced(&'a [u8], &'a [u8]),
}

impl Debug for SliceDiff<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
        match self {
            SliceDiff::Unchanged(data) => f
                .debug_tuple("SliceDiff::Unchanged")
                .field(&String::from_utf8_lossy(data))
                .finish(),
            SliceDiff::Replaced(left, right) => f
                .debug_tuple("SliceDiff::Replaced")
                .field(&String::from_utf8_lossy(left))
                .field(&String::from_utf8_lossy(right))
                .finish(),
        }
    }
}

/// Finds the LCS given a array where the value of `input[i]` indicates that
/// the position of element `i` in the right array is at position `input[i]` in
/// the left array.
///
/// The result is a list of `(left position, right position)` pairs that is
/// strictly increasing in both components.
///
/// For example (some have multiple valid outputs; the one shown is what this
/// implementation returns):
///
/// [0,1,2] => [(0,0),(1,1),(2,2)]
/// [2,1,0] => [(2,0)]
/// [0,1,4,2,3,5,6] => [(0,0),(1,1),(2,3),(3,4),(5,5),(6,6)]
/// [0,1,4,3,2,5,6] => [(0,0),(1,1),(2,4),(5,5),(6,6)]
fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
    if input.is_empty() {
        return vec![];
    }

    // chain[right_pos] = (length of the longest chain ending at right_pos,
    // left position of that element, right position of the previous element
    // in that chain)
    let mut chain: Vec<(usize, usize, Option<usize>)> = Vec::with_capacity(input.len());
    let mut global_longest = 0;
    let mut global_longest_right_pos = 0;
    for (right_pos, &left_pos) in input.iter().enumerate() {
        let mut longest_from_here = 1;
        let mut previous_right_pos = None;
        for i in (0..right_pos).rev() {
            let (previous_len, previous_left_pos, _) = chain[i];
            if previous_left_pos >= left_pos {
                continue;
            }
            let len = previous_len + 1;
            if len > longest_from_here {
                longest_from_here = len;
                previous_right_pos = Some(i);
                if len > global_longest {
                    global_longest = len;
                    global_longest_right_pos = right_pos;
                    // If this is the longest chain globally so far, we cannot find a
                    // longer one by using a previous value, so break early.
                    break;
                }
            }
        }
        chain.push((longest_from_here, left_pos, previous_right_pos));
    }

    // Walk the back-pointers from the end of the longest chain.
    let mut result = vec![];
    let mut next_right_pos = Some(global_longest_right_pos);
    while let Some(right_pos) = next_right_pos {
        let (_, left_pos, previous_right_pos) = chain[right_pos];
        result.push((left_pos, right_pos));
        next_right_pos = previous_right_pos;
    }
    result.reverse();
    result
}

/// Finds unchanged ranges among the ones given as arguments. The data between
/// those ranges is ignored.
///
/// `left_ranges` and `right_ranges` are token ranges into `left` and `right`.
/// The returned pairs are taken from those lists and are in increasing order
/// on both sides.
pub(crate) fn unchanged_ranges(
    left: &[u8],
    right: &[u8],
    left_ranges: &[Range<usize>],
    right_ranges: &[Range<usize>],
) -> Vec<(Range<usize>, Range<usize>)> {
    if left_ranges.is_empty() || right_ranges.is_empty() {
        return vec![];
    }

    let left_histogram = Histogram::calculate(left, left_ranges, MAX_OCCURRENCES);
    let lowest_left_count = left_histogram.count_to_words.keys().next().copied();
    if lowest_left_count.map_or(true, |count| count > MAX_OCCURRENCES) {
        // If there are very many occurrences of all words, then we just give up.
        return vec![];
    }
    let right_histogram = Histogram::calculate(right, right_ranges, MAX_OCCURRENCES);

    // Look for words with few occurrences in `left` (could equally well have picked
    // `right`?). If any of them also occur in `right`, then we add the words to
    // the LCS. The map is ordered by count, so the first count that has any
    // shared word wins.
    let mut uncommon_shared_words: Vec<&[u8]> = vec![];
    for left_words in left_histogram.count_to_words.values() {
        uncommon_shared_words.extend(
            left_words
                .iter()
                .copied()
                .filter(|word| right_histogram.word_to_positions.contains_key(*word)),
        );
        if !uncommon_shared_words.is_empty() {
            break;
        }
    }
    if uncommon_shared_words.is_empty() {
        return vec![];
    }

    // Let's say our inputs are "a b a b" and "a b c c b a b". We will have found
    // the least common words to be "a" and "b". We now assume that each
    // occurrence of each word lines up in the left and right input. We do that
    // by numbering the shared occurrences, effectively instead comparing "a1 b1
    // a2 b2" and "a1 b1 c c b2 a2 b". We then walk the common words in the
    // right input in order (["a1", "b1", "b2", "a2"]), and record the index of
    // that word in the left input ([0,1,3,2]). We then find the LCS and split
    // points based on that ([0,1,3] or [0,1,2] are both valid).

    // [(index into left_ranges/right_ranges, word, occurrence #)]
    let mut left_positions = vec![];
    let mut right_positions = vec![];
    for word in uncommon_shared_words {
        let left_occurrences = &left_histogram.word_to_positions[word];
        let right_occurrences = &right_histogram.word_to_positions[word];
        let shared_count = min(left_occurrences.len(), right_occurrences.len());
        for occurrence in 0..shared_count {
            left_positions.push((left_occurrences[occurrence], word, occurrence));
            right_positions.push((right_occurrences[occurrence], word, occurrence));
        }
    }
    left_positions.sort();
    right_positions.sort();
    let left_position_map: HashMap<(&[u8], usize), usize> = left_positions
        .iter()
        .enumerate()
        .map(|(i, &(_pos, word, occurrence))| ((word, occurrence), i))
        .collect();
    let left_index_by_right_index: Vec<usize> = right_positions
        .iter()
        .map(|&(_pos, word, occurrence)| left_position_map[&(word, occurrence)])
        .collect();

    let lcs = find_lcs(&left_index_by_right_index);

    // Produce output ranges, recursing into the modified areas between the elements
    // in the LCS.
    let mut result = vec![];
    let mut previous_left_position = 0;
    let mut previous_right_position = 0;
    for (left_index, right_index) in lcs {
        let left_position = left_positions[left_index].0;
        let right_position = right_positions[right_index].0;
        let skipped_left_positions = previous_left_position..left_position;
        let skipped_right_positions = previous_right_position..right_position;
        if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {
            result.extend(unchanged_ranges(
                left,
                right,
                &left_ranges[skipped_left_positions],
                &right_ranges[skipped_right_positions],
            ));
        }
        result.push((
            left_ranges[left_position].clone(),
            right_ranges[right_position].clone(),
        ));
        previous_left_position = left_position + 1;
        previous_right_position = right_position + 1;
    }
    // Also recurse into range at end (after common ranges).
    let skipped_left_positions = previous_left_position..left_ranges.len();
    let skipped_right_positions = previous_right_position..right_ranges.len();
    if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {
        result.extend(unchanged_ranges(
            left,
            right,
            &left_ranges[skipped_left_positions],
            &right_ranges[skipped_right_positions],
        ));
    }

    result
}

/// Adds ranges between and around the `input` ranges so that the full ranges
/// of `left` and `right` are covered. A gap whose bytes are equal on both
/// sides becomes `Unchanged`; any other gap becomes `Replaced`.
fn fill_in_range_gaps(
    left: &[u8],
    right: &[u8],
    input: &[(Range<usize>, Range<usize>)],
) -> Vec<RangeDiff> {
    // Add an empty range at the end in order to fill in any gap just before the
    // end (without needing to duplicate code for that after the loop).
    let end_of_input = [(left.len()..left.len(), right.len()..right.len())];

    let mut output = vec![];
    let mut previous_left_end_pos = 0;
    let mut previous_right_end_pos = 0;
    for (left_range, right_range) in input.iter().chain(&end_of_input) {
        let left_gap_range = previous_left_end_pos..left_range.start;
        let right_gap_range = previous_right_end_pos..right_range.start;
        if !(left_gap_range.is_empty() && right_gap_range.is_empty()) {
            let gap = if left[left_gap_range.clone()] == right[right_gap_range.clone()] {
                RangeDiff::Unchanged(left_gap_range, right_gap_range)
            } else {
                RangeDiff::Replaced(left_gap_range, right_gap_range)
            };
            output.push(gap);
        }
        previous_left_end_pos = left_range.end;
        previous_right_end_pos = right_range.end;
        if !(left_range.is_empty() && right_range.is_empty()) {
            output.push(RangeDiff::Unchanged(
                left_range.clone(),
                right_range.clone(),
            ));
        }
    }

    output
}

/// Combines adjacent ranges of the same type into larger ranges. Removes empty
/// ranges.
fn compact_ranges(input: &[RangeDiff]) -> Vec<RangeDiff> {
    let (first, rest) = match input.split_first() {
        Some(split) => split,
        None => return vec![],
    };

    let mut output = vec![];
    let mut current_range = first.clone();
    for range in rest {
        match (&mut current_range, range) {
            (RangeDiff::Unchanged(left1, right1), RangeDiff::Unchanged(left2, right2))
            | (RangeDiff::Replaced(left1, right1), RangeDiff::Replaced(left2, right2)) => {
                left1.end = left2.end;
                right1.end = right2.end;
            }
            // The previous range was unchanged and this one was replaced, or vice versa.
            // If the new range is empty, just ignore it, so we can possibly compact
            // with the previous one.
            _ if range.is_empty() => {}
            _ => {
                let finished_range = std::mem::replace(&mut current_range, range.clone());
                if !finished_range.is_empty() {
                    output.push(finished_range);
                }
            }
        }
    }
    if !current_range.is_empty() {
        output.push(current_range);
    }
    output
}

/// Moves `range` forward by `offset` bytes.
fn shift_range(range: Range<usize>, offset: usize) -> Range<usize> {
    range.start + offset..range.end + offset
}

/// Splits every `Replaced` range in `input` into smaller `Unchanged` and
/// `Replaced` ranges by tokenizing both sides with `tokenizer` and matching
/// up the tokens. `Unchanged` input ranges are passed through as they are.
fn refine_changed_ranges(
    left: &[u8],
    right: &[u8],
    input: &[RangeDiff],
    tokenizer: &impl Fn(&[u8]) -> Vec<Range<usize>>,
) -> Vec<RangeDiff> {
    let mut output = vec![];
    for range_diff in input {
        let (left_range, right_range) = match range_diff {
            RangeDiff::Replaced(left_range, right_range) => (left_range, right_range),
            RangeDiff::Unchanged(..) => {
                output.push(range_diff.clone());
                continue;
            }
        };
        let left_slice = &left[left_range.clone()];
        let right_slice = &right[right_range.clone()];
        let unchanged_refined_ranges = unchanged_ranges(
            left_slice,
            right_slice,
            &tokenizer(left_slice),
            &tokenizer(right_slice),
        );
        let all_refined_ranges =
            fill_in_range_gaps(left_slice, right_slice, &unchanged_refined_ranges);
        // The refined ranges are relative to the slices, so translate them back
        // to positions in the full inputs.
        output.extend(
            compact_ranges(&all_refined_ranges)
                .into_iter()
                .map(|refined| match refined {
                    RangeDiff::Unchanged(refined_left, refined_right) => RangeDiff::Unchanged(
                        shift_range(refined_left, left_range.start),
                        shift_range(refined_right, right_range.start),
                    ),
                    RangeDiff::Replaced(refined_left, refined_right) => RangeDiff::Replaced(
                        shift_range(refined_left, left_range.start),
                        shift_range(refined_right, right_range.start),
                    ),
                }),
        );
    }
    output
}

/// Converts range-based hunks into hunks that borrow from `left` and `right`.
/// For `Unchanged` hunks the bytes are taken from `left`.
fn range_diffs_to_slice_diffs<'a>(
    left: &'a [u8],
    right: &'a [u8],
    range_diffs: &[RangeDiff],
) -> Vec<SliceDiff<'a>> {
    range_diffs
        .iter()
        .map(|range_diff| match range_diff {
            RangeDiff::Unchanged(left_range, _right_range) => {
                SliceDiff::Unchanged(&left[left_range.clone()])
            }
            RangeDiff::Replaced(left_range, right_range) => {
                SliceDiff::Replaced(&left[left_range.clone()], &right[right_range.clone()])
            }
        })
        .collect()
}

/// Diffs two slices of bytes. The returned diff hunks may be any length (may
/// span many lines or may be only part of a line). This currently uses
/// Histogram diff (or maybe something similar; I'm not sure I understood the
/// algorithm correctly). It first diffs lines in the input and then refines
/// the changed ranges at the word level and finally at the level of
/// individual non-word bytes.
///
/// Adjacent hunks of the same kind are merged, so `Unchanged` and `Replaced`
/// hunks alternate in the result. Concatenating the left (right) sides of all
/// hunks gives back `left` (`right`).
///
/// TODO: Diff at even lower level in the non-word ranges?
pub fn diff<'a>(left: &'a [u8], right: &'a [u8]) -> Vec<SliceDiff<'a>> {
    if left == right {
        return vec![SliceDiff::Unchanged(left)];
    }
    if left.is_empty() {
        return vec![SliceDiff::Replaced(b"", right)];
    }
    if right.is_empty() {
        return vec![SliceDiff::Replaced(left, b"")];
    }

    let range_diffs = vec![RangeDiff::Replaced(0..left.len(), 0..right.len())];
    let range_diffs = refine_changed_ranges(left, right, &range_diffs, &find_line_ranges);
    let range_diffs = refine_changed_ranges(left, right, &range_diffs, &find_word_ranges);
    let range_diffs = refine_changed_ranges(left, right, &range_diffs, &find_nonword_ranges);
    // Each refinement pass only compacts within the range it refined, so hunks
    // coming from different passes can still be adjacent and of the same kind.
    let range_diffs = compact_ranges(&range_diffs);
    range_diffs_to_slice_diffs(left, right, &range_diffs)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Joins `lines` into one buffer, terminating every line with `\n`.
    fn join_lines(lines: &[&str]) -> Vec<u8> {
        lines
            .iter()
            .flat_map(|line| line.bytes().chain(std::iter::once(b'\n')))
            .collect()
    }

    #[test]
    fn test_find_line_ranges_empty() {
        assert_eq!(find_line_ranges(b""), vec![]);
    }

    #[test]
    fn test_find_line_ranges_blank_line() {
        assert_eq!(find_line_ranges(b"\n"), vec![0..1]);
    }

    #[test]
    fn test_find_line_ranges_missing_newline_at_eof() {
        assert_eq!(find_line_ranges(b"foo"), vec![0..3]);
    }

    #[test]
    fn test_find_line_ranges_multiple_lines() {
        assert_eq!(find_line_ranges(b"a\nbb\nccc\n"), vec![0..2, 2..5, 5..9]);
    }

    #[test]
    fn test_find_word_ranges_empty() {
        assert_eq!(find_word_ranges(b""), vec![]);
    }

    #[test]
    fn test_find_word_ranges_single_word() {
        assert_eq!(find_word_ranges(b"Abc"), vec![0..3]);
    }

    #[test]
    fn test_find_word_ranges_no_word() {
        assert_eq!(find_word_ranges(b"+-*/"), vec![]);
    }

    #[test]
    fn test_find_word_ranges_word_then_non_word() {
        assert_eq!(find_word_ranges(b"Abc "), vec![0..3]);
    }

    #[test]
    fn test_find_word_ranges_non_word_then_word() {
        assert_eq!(find_word_ranges(b"   Abc"), vec![3..6]);
    }

    #[test]
    fn test_find_lcs_empty() {
        let empty: Vec<(usize, usize)> = vec![];
        assert_eq!(find_lcs(&[]), empty);
    }

    #[test]
    fn test_find_lcs_single_element() {
        assert_eq!(find_lcs(&[0]), vec![(0, 0)]);
    }

    #[test]
    fn test_find_lcs_in_order() {
        assert_eq!(find_lcs(&[0, 1, 2]), vec![(0, 0), (1, 1), (2, 2)]);
    }

    #[test]
    fn test_find_lcs_reverse_order() {
        assert_eq!(find_lcs(&[2, 1, 0]), vec![(2, 0)]);
    }

    #[test]
    fn test_find_lcs_two_swapped() {
        assert_eq!(
            find_lcs(&[0, 1, 4, 3, 2, 5, 6]),
            vec![(0, 0), (1, 1), (2, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_earlier() {
        assert_eq!(
            find_lcs(&[0, 1, 4, 2, 3, 5, 6]),
            vec![(0, 0), (1, 1), (2, 3), (3, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_later() {
        assert_eq!(
            find_lcs(&[0, 1, 3, 4, 2, 5, 6]),
            vec![(0, 0), (1, 1), (3, 2), (4, 3), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_interleaved_longest_chains() {
        assert_eq!(
            find_lcs(&[0, 4, 2, 9, 6, 5, 1, 3, 7, 8]),
            vec![(0, 0), (1, 6), (3, 7), (7, 8), (8, 9)]
        );
    }

    #[test]
    fn test_find_word_ranges_many_words() {
        assert_eq!(
            find_word_ranges(b"fn find_words(text: &[u8])"),
            vec![0..2, 3..13, 14..18, 22..24]
        );
    }

    #[test]
    fn test_fill_in_gaps_empty() {
        assert_eq!(
            fill_in_range_gaps(b"abc", b"abcde", &[]),
            vec![RangeDiff::Replaced(0..3, 0..5),]
        );
    }

    #[test]
    fn test_fill_in_gaps_only_middle() {
        assert_eq!(
            fill_in_range_gaps(
                b"a b c",
                b"a x b y c",
                &[(0..2, 0..2), (2..4, 4..6), (4..5, 8..9),]
            ),
            vec![
                RangeDiff::Unchanged(0..2, 0..2),
                RangeDiff::Replaced(2..2, 2..4),
                RangeDiff::Unchanged(2..4, 4..6),
                RangeDiff::Replaced(4..4, 6..8),
                RangeDiff::Unchanged(4..5, 8..9),
            ]
        );
    }

    #[test]
    fn test_fill_in_gaps_empty_gap() {
        assert_eq!(
            fill_in_range_gaps(b"a b", b"a b", &[(0..1, 0..1), (1..2, 1..2), (2..3, 2..3),]),
            vec![
                RangeDiff::Unchanged(0..1, 0..1),
                RangeDiff::Unchanged(1..2, 1..2),
                RangeDiff::Unchanged(2..3, 2..3),
            ]
        );
    }

    #[test]
    fn test_fill_in_gaps_before_and_after() {
        assert_eq!(
            fill_in_range_gaps(b" a ", b" a ", &[(1..2, 1..2),]),
            vec![
                RangeDiff::Unchanged(0..1, 0..1),
                RangeDiff::Unchanged(1..2, 1..2),
                RangeDiff::Unchanged(2..3, 2..3),
            ]
        );
    }

    #[test]
    fn test_compact_ranges_all_unchanged() {
        assert_eq!(
            compact_ranges(&[
                RangeDiff::Unchanged(0..1, 0..2),
                RangeDiff::Unchanged(1..2, 2..4),
                RangeDiff::Unchanged(2..3, 4..6),
            ]),
            vec![RangeDiff::Unchanged(0..3, 0..6),]
        );
    }

    #[test]
    fn test_compact_ranges_all_replaced() {
        assert_eq!(
            compact_ranges(&[
                RangeDiff::Replaced(0..1, 0..2),
                RangeDiff::Replaced(1..2, 2..4),
                RangeDiff::Replaced(2..3, 4..6),
            ]),
            vec![RangeDiff::Replaced(0..3, 0..6),]
        );
    }

    #[test]
    fn test_compact_ranges_mixed() {
        assert_eq!(
            compact_ranges(&[
                RangeDiff::Replaced(0..1, 0..2),
                RangeDiff::Replaced(1..2, 2..4),
                RangeDiff::Unchanged(2..3, 4..6),
                RangeDiff::Unchanged(3..4, 6..8),
                RangeDiff::Replaced(4..5, 8..10),
                RangeDiff::Replaced(5..6, 10..12),
            ]),
            vec![
                RangeDiff::Replaced(0..2, 0..4),
                RangeDiff::Unchanged(2..4, 4..8),
                RangeDiff::Replaced(4..6, 8..12),
            ]
        );
    }

    #[test]
    fn test_compact_ranges_mixed_empty_range() {
        assert_eq!(
            compact_ranges(&[
                RangeDiff::Replaced(0..1, 0..2),
                RangeDiff::Replaced(1..2, 2..4),
                RangeDiff::Unchanged(2..2, 4..4),
                RangeDiff::Replaced(3..4, 6..8),
                RangeDiff::Replaced(4..5, 8..10),
            ]),
            vec![RangeDiff::Replaced(0..5, 0..10)]
        );
    }

    #[test]
    fn test_unchanged_ranges_insert_in_middle() {
        assert_eq!(
            unchanged_ranges(
                b"a b b c",
                b"a b X b c",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7, 8..9],
            ),
            vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)]
        );
    }

    #[test]
    fn test_unchanged_ranges_non_unique_removed() {
        assert_eq!(
            unchanged_ranges(
                b"a a a a",
                b"a b a c",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![(0..1, 0..1), (2..3, 4..5)]
        );
    }

    #[test]
    fn test_unchanged_ranges_non_unique_added() {
        assert_eq!(
            unchanged_ranges(
                b"a b a c",
                b"a a a a",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![(0..1, 0..1), (4..5, 2..3)]
        );
    }

    #[test]
    fn test_diff_nothing_in_common() {
        assert_eq!(
            diff(b"aaa", b"bb"),
            vec![SliceDiff::Replaced(b"aaa", b"bb")]
        );
    }

    #[test]
    fn test_diff_insert_in_middle() {
        assert_eq!(
            diff(b"a z", b"a S z"),
            vec![
                SliceDiff::Unchanged(b"a "),
                SliceDiff::Replaced(b"", b"S "),
                SliceDiff::Unchanged(b"z"),
            ]
        );
    }

    #[test]
    fn test_diff_no_unique_middle_flips() {
        assert_eq!(
            diff(b"a R R S S z", b"a S S R R z"),
            vec![
                SliceDiff::Unchanged(b"a "),
                SliceDiff::Replaced(b"R R ", b""),
                SliceDiff::Unchanged(b"S S "),
                SliceDiff::Replaced(b"", b"R R "),
                SliceDiff::Unchanged(b"z")
            ],
        );
    }

    #[test]
    fn test_diff_recursion_needed() {
        assert_eq!(
            diff(
                b"a q x q y q z q b q y q x q c",
                b"a r r x q y z q b y q x r r c",
            ),
            vec![
                SliceDiff::Unchanged(b"a "),
                SliceDiff::Replaced(b"q", b"r"),
                SliceDiff::Unchanged(b" "),
                SliceDiff::Replaced(b"", b"r "),
                SliceDiff::Unchanged(b"x q y "),
                SliceDiff::Replaced(b"q ", b""),
                SliceDiff::Unchanged(b"z q b "),
                SliceDiff::Replaced(b"q ", b""),
                SliceDiff::Unchanged(b"y q x "),
                SliceDiff::Replaced(b"q", b"r"),
                SliceDiff::Unchanged(b" "),
                SliceDiff::Replaced(b"", b"r "),
                SliceDiff::Unchanged(b"c"),
            ]
        );
    }

    #[test]
    fn test_diff_real_case_write_fmt() {
        // This is from src/ui.rs in commit f44d246e3f88 in this repo. It highlights the
        // need for recursion into the range at the end: after splitting at "Arguments"
        // and "styler", the region at the end has the unique words "write_fmt"
        // and "fmt", but we forgot to recurse into that region, so we ended up
        // saying that "write_fmt(fmt).unwrap()" was replaced by b"write_fmt(fmt)".
        assert_eq!(
            diff(
                b" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) {\n self.styler().write_fmt(fmt).unwrap()\n",
                b" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) -> io::Result<()> {\n self.styler().write_fmt(fmt)\n"
            ),
            vec![
                SliceDiff::Unchanged(b" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) "),
                SliceDiff::Replaced(b"", b"-> io::Result<()> "),
                SliceDiff::Unchanged(b"{\n self.styler().write_fmt(fmt)"),
                SliceDiff::Replaced(b".unwrap()", b""),
                SliceDiff::Unchanged(b"\n")
            ]
        );
    }

    #[test]
    fn test_diff_real_case_gitgit_read_tree_c() {
        // This is the diff from commit e497ea2a9b in the git.git repo
        let old_source = join_lines(&[
            "/*",
            " * GIT - The information manager from hell",
            " *",
            " * Copyright (C) Linus Torvalds, 2005",
            " */",
            "#include \"#cache.h\"",
            "",
            "static int unpack(unsigned char *sha1)",
            "{",
            "\tvoid *buffer;",
            "\tunsigned long size;",
            "\tchar type[20];",
            "",
            "\tbuffer = read_sha1_file(sha1, type, &size);",
            "\tif (!buffer)",
            "\t\tusage(\"unable to read sha1 file\");",
            "\tif (strcmp(type, \"tree\"))",
            "\t\tusage(\"expected a 'tree' node\");",
            "\twhile (size) {",
            "\t\tint len = strlen(buffer)+1;",
            "\t\tunsigned char *sha1 = buffer + len;",
            "\t\tchar *path = strchr(buffer, ' ')+1;",
            "\t\tunsigned int mode;",
            "\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)",
            "\t\t\tusage(\"corrupt 'tree' file\");",
            "\t\tbuffer = sha1 + 20;",
            "\t\tsize -= len + 20;",
            "\t\tprintf(\"%o %s (%s)\\n\", mode, path, sha1_to_hex(sha1));",
            "\t}",
            "\treturn 0;",
            "}",
            "",
            "int main(int argc, char **argv)",
            "{",
            "\tint fd;",
            "\tunsigned char sha1[20];",
            "",
            "\tif (argc != 2)",
            "\t\tusage(\"read-tree \");",
            "\tif (get_sha1_hex(argv[1], sha1) < 0)",
            "\t\tusage(\"read-tree \");",
            "\tsha1_file_directory = getenv(DB_ENVIRONMENT);",
            "\tif (!sha1_file_directory)",
            "\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;",
            "\tif (unpack(sha1) < 0)",
            "\t\tusage(\"unpack failed\");",
            "\treturn 0;",
            "}",
        ]);
        let new_source = join_lines(&[
            "/*",
            " * GIT - The information manager from hell",
            " *",
            " * Copyright (C) Linus Torvalds, 2005",
            " */",
            "#include \"#cache.h\"",
            "",
            "static void create_directories(const char *path)",
            "{",
            "\tint len = strlen(path);",
            "\tchar *buf = malloc(len + 1);",
            "\tconst char *slash = path;",
            "",
            "\twhile ((slash = strchr(slash+1, '/')) != NULL) {",
            "\t\tlen = slash - path;",
            "\t\tmemcpy(buf, path, len);",
            "\t\tbuf[len] = 0;",
            "\t\tmkdir(buf, 0700);",
            "\t}",
            "}",
            "",
            "static int create_file(const char *path)",
            "{",
            "\tint fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);",
            "\tif (fd < 0) {",
            "\t\tif (errno == ENOENT) {",
            "\t\t\tcreate_directories(path);",
            "\t\t\tfd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);",
            "\t\t}",
            "\t}",
            "\treturn fd;",
            "}",
            "",
            "static int unpack(unsigned char *sha1)",
            "{",
            "\tvoid *buffer;",
            "\tunsigned long size;",
            "\tchar type[20];",
            "",
            "\tbuffer = read_sha1_file(sha1, type, &size);",
            "\tif (!buffer)",
            "\t\tusage(\"unable to read sha1 file\");",
            "\tif (strcmp(type, \"tree\"))",
            "\t\tusage(\"expected a 'tree' node\");",
            "\twhile (size) {",
            "\t\tint len = strlen(buffer)+1;",
            "\t\tunsigned char *sha1 = buffer + len;",
            "\t\tchar *path = strchr(buffer, ' ')+1;",
            "\t\tchar *data;",
            "\t\tunsigned long filesize;",
            "\t\tunsigned int mode;",
            "\t\tint fd;",
            "",
            "\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)",
            "\t\t\tusage(\"corrupt 'tree' file\");",
            "\t\tbuffer = sha1 + 20;",
            "\t\tsize -= len + 20;",
            "\t\tdata = read_sha1_file(sha1, type, &filesize);",
            "\t\tif (!data || strcmp(type, \"blob\"))",
            "\t\t\tusage(\"tree file refers to bad file data\");",
            "\t\tfd = create_file(path);",
            "\t\tif (fd < 0)",
            "\t\t\tusage(\"unable to create file\");",
            "\t\tif (write(fd, data, filesize) != filesize)",
            "\t\t\tusage(\"unable to write file\");",
            "\t\tfchmod(fd, mode);",
            "\t\tclose(fd);",
            "\t\tfree(data);",
            "\t}",
            "\treturn 0;",
            "}",
            "",
            "int main(int argc, char **argv)",
            "{",
            "\tint fd;",
            "\tunsigned char sha1[20];",
            "",
            "\tif (argc != 2)",
            "\t\tusage(\"read-tree \");",
            "\tif (get_sha1_hex(argv[1], sha1) < 0)",
            "\t\tusage(\"read-tree \");",
            "\tsha1_file_directory = getenv(DB_ENVIRONMENT);",
            "\tif (!sha1_file_directory)",
            "\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;",
            "\tif (unpack(sha1) < 0)",
            "\t\tusage(\"unpack failed\");",
            "\treturn 0;",
            "}",
        ]);
        assert_eq!(
            diff(&old_source, &new_source),
            vec![
                SliceDiff::Unchanged(b"/*\n * GIT - The information manager from hell\n *\n * Copyright (C) Linus Torvalds, 2005\n */\n#include \"#cache.h\"\n\n"),
                SliceDiff::Replaced(b"", b"static void create_directories(const char *path)\n{\n\tint len = strlen(path);\n\tchar *buf = malloc(len + 1);\n\tconst char *slash = path;\n\n\twhile ((slash = strchr(slash+1, \'/\')) != NULL) {\n\t\tlen = slash - path;\n\t\tmemcpy(buf, path, len);\n\t\tbuf[len] = 0;\n\t\tmkdir(buf, 0700);\n\t}\n}\n\nstatic int create_file(const char *path)\n{\n\tint fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\tif (fd < 0) {\n\t\tif (errno == ENOENT) {\n\t\t\tcreate_directories(path);\n\t\t\tfd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\t\t}\n\t}\n\treturn fd;\n}\n\n"),
                SliceDiff::Unchanged(b"static int unpack(unsigned char *sha1)\n{\n\tvoid *buffer;\n\tunsigned long size;\n\tchar type[20];\n\n\tbuffer = read_sha1_file(sha1, type, &size);\n\tif (!buffer)\n\t\tusage(\"unable to read sha1 file\");\n\tif (strcmp(type, \"tree\"))\n\t\tusage(\"expected a \'tree\' node\");\n\twhile (size) {\n\t\tint len = strlen(buffer)+1;\n\t\tunsigned char *sha1 = buffer + len;\n\t\tchar *path = strchr(buffer, \' \')+1;\n"),
                SliceDiff::Replaced(b"", b"\t\tchar *data;\n\t\tunsigned long filesize;\n"),
                SliceDiff::Unchanged(b"\t\tunsigned int mode;\n"),
                SliceDiff::Replaced(b"", b"\t\tint fd;\n\n"),
                SliceDiff::Unchanged(b"\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\t\t\tusage(\"corrupt \'tree\' file\");\n\t\tbuffer = sha1 + 20;\n\t\tsize -= len + 20;\n\t\t"),
                SliceDiff::Replaced(b"printf", b"data = read_sha1_file"),
                SliceDiff::Unchanged(b"("),
                SliceDiff::Replaced(b"\"%o %s (%s)\\n\", mode, path, sha1_to_hex(", b""),
                SliceDiff::Unchanged(b"sha1"),
                SliceDiff::Replaced(b"", b", type, &filesize"),
                SliceDiff::Unchanged(b")"),
                SliceDiff::Replaced(b")", b""),
                SliceDiff::Unchanged(b";\n"),
                SliceDiff::Replaced(b"", b"\t\tif (!data || strcmp(type, \"blob\"))\n\t\t\tusage(\"tree file refers to bad file data\");\n\t\tfd = create_file(path);\n\t\tif (fd < 0)\n\t\t\tusage(\"unable to create file\");\n\t\tif (write(fd, data, filesize) != filesize)\n\t\t\tusage(\"unable to write file\");\n\t\tfchmod(fd, mode);\n\t\tclose(fd);\n\t\tfree(data);\n"),
                SliceDiff::Unchanged(b"\t}\n\treturn 0;\n}\n\nint main(int argc, char **argv)\n{\n\tint fd;\n\tunsigned char sha1[20];\n\n\tif (argc != 2)\n\t\tusage(\"read-tree \");\n\tif (get_sha1_hex(argv[1], sha1) < 0)\n\t\tusage(\"read-tree \");\n\tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\tif (!sha1_file_directory)\n\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\tif (unpack(sha1) < 0)\n\t\tusage(\"unpack failed\");\n\treturn 0;\n}\n")
            ]
        );
    }
}