mirror of https://github.com/martinvonz/jj.git, synced 2025-02-01 09:08:51 +00:00
diff: use low-level HashTable in Histogram
This change made some diff benches slower, maybe because the generated code becomes slightly worse due to the added abstraction? I'll revisit the performance problem later; there are a couple of ways to mitigate it.

```
group                            new                     old
-----                            ---                     ---
bench_diff_git_git_read_tree_c   1.02    61.0±0.23µs     1.00    59.7±0.38µs
bench_diff_lines/modified/10k    1.00    41.6±0.24ms     1.02    42.3±0.22ms
bench_diff_lines/modified/1k     1.00     3.8±0.07ms     1.00     3.8±0.03ms
bench_diff_lines/reversed/10k    1.29    23.4±0.20ms     1.00    18.2±0.26ms
bench_diff_lines/reversed/1k     1.05   517.2±5.55µs     1.00  493.7±59.72µs
bench_diff_lines/unchanged/10k   1.00     3.9±0.10ms     1.08     4.2±0.10ms
bench_diff_lines/unchanged/1k    1.01   356.8±2.33µs     1.00   353.7±1.99µs
```

(I don't get stable results on my noisy machine, so the numbers may vary.)
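
For context, the core of the change is replacing `HashMap<&BStr, Vec<WordPosition>>` with `hashbrown::HashTable`, whose low-level API takes the hash value and the equality/hashing closures from the caller instead of deriving them from the key type. Below is a minimal standalone sketch of that `entry()`/`find()` pattern on a simple word-count table; it is illustrative, not part of the commit, and assumes a hashbrown version that ships `HashTable` (0.14+).

```rust
// Illustrative only: the same entry()/find() pattern the commit applies to
// Histogram, shown on a word-count table. Requires hashbrown 0.14+.
use std::collections::hash_map::RandomState;
use std::hash::BuildHasher;

use hashbrown::HashTable;

fn main() {
    let hasher = RandomState::new();
    // Entries are whole (key, value) tuples; HashTable never hashes or
    // compares them itself, so every call gets explicit closures.
    let mut counts: HashTable<(&str, u64)> = HashTable::new();
    for word in ["apple", "banana", "apple"] {
        let hash = hasher.hash_one(word);
        let (_, count) = counts
            .entry(hash, |(w, _)| *w == word, |(w, _)| hasher.hash_one(w))
            .or_insert_with(|| (word, 0))
            .into_mut();
        *count += 1;
    }
    // Lookup follows the same shape: the caller supplies hash and equality.
    let hash = hasher.hash_one("apple");
    let found = counts.find(hash, |(w, _)| *w == "apple");
    assert_eq!(found, Some(&("apple", 2)));
}
```

Because the table itself never hashes or compares entries, the diff code can route both operations through its `WordComparator`, which is what makes non-byte-exact comparators (e.g. ignore-whitespace) workable as histogram keys.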
This commit is contained in:
  parent de137c8f9a
  commit c5f926103a

1 changed file with 132 additions and 31 deletions
lib/src/diff.rs (163 lines changed)

--- a/lib/src/diff.rs
+++ b/lib/src/diff.rs
@@ -16,7 +16,6 @@
 
 use std::cmp::Ordering;
 use std::collections::BTreeMap;
-use std::collections::HashMap;
 use std::hash::BuildHasher;
 use std::hash::Hash;
 use std::hash::Hasher;
@@ -26,6 +25,7 @@ use std::ops::Range;
 use std::slice;
 
 use bstr::BStr;
+use hashbrown::HashTable;
 use itertools::Itertools;
 
 pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
@@ -189,7 +189,6 @@ struct WordComparator<C, S> {
     hash_builder: S,
 }
 
-#[allow(unused)] // TODO
 impl<C: CompareBytes> WordComparator<C, RandomState> {
     fn new(compare: C) -> Self {
         WordComparator {
@@ -200,7 +199,6 @@ impl<C: CompareBytes> WordComparator<C, RandomState> {
     }
 }
 
-#[allow(unused)] // TODO
 impl<C: CompareBytes, S: BuildHasher> WordComparator<C, S> {
     fn eq(&self, left: &[u8], right: &[u8]) -> bool {
         self.compare.eq(left, right)
@@ -252,15 +250,25 @@ impl<'input, 'aux> DiffSource<'input, 'aux> {
 }
 
 struct Histogram<'input> {
-    word_to_positions: HashMap<&'input BStr, Vec<WordPosition>>,
+    word_to_positions: HashTable<HistogramEntry<'input>>,
 }
 
+type HistogramEntry<'input> = (&'input BStr, Vec<WordPosition>);
+
 impl<'input> Histogram<'input> {
-    fn calculate(source: &DiffSource<'input, '_>, max_occurrences: usize) -> Self {
-        let mut word_to_positions: HashMap<&BStr, Vec<WordPosition>> = HashMap::new();
+    fn calculate<C: CompareBytes, S: BuildHasher>(
+        source: &DiffSource<'input, '_>,
+        comp: &WordComparator<C, S>,
+        max_occurrences: usize,
+    ) -> Self {
+        let mut word_to_positions: HashTable<HistogramEntry> = HashTable::new();
         for (i, range) in source.ranges.iter().enumerate() {
             let word = &source.text[range.clone()];
-            let positions = word_to_positions.entry(word).or_default();
+            let hash = comp.hash_one(word);
+            let (_, positions) = word_to_positions
+                .entry(hash, |(w, _)| comp.eq(w, word), |(w, _)| comp.hash_one(w))
+                .or_insert_with(|| (word, vec![]))
+                .into_mut();
             // Allow one more than max_occurrences, so we can later skip those with more
             // than max_occurrences
             if positions.len() <= max_occurrences {
@@ -270,14 +278,27 @@ impl<'input> Histogram<'input> {
         Histogram { word_to_positions }
     }
 
-    fn build_count_to_entries(&self) -> BTreeMap<usize, Vec<(&'input BStr, &Vec<WordPosition>)>> {
+    fn build_count_to_entries(&self) -> BTreeMap<usize, Vec<&HistogramEntry<'input>>> {
         let mut count_to_entries: BTreeMap<usize, Vec<_>> = BTreeMap::new();
-        for (word, positions) in &self.word_to_positions {
+        for entry in &self.word_to_positions {
+            let (_, positions) = entry;
             let entries = count_to_entries.entry(positions.len()).or_default();
-            entries.push((*word, positions));
+            entries.push(entry);
         }
         count_to_entries
     }
+
+    fn positions_by_word<C: CompareBytes, S: BuildHasher>(
+        &self,
+        word: &BStr,
+        comp: &WordComparator<C, S>,
+    ) -> Option<&[WordPosition]> {
+        let hash = comp.hash_one(word);
+        let (_, positions) = self
+            .word_to_positions
+            .find(hash, |(w, _)| comp.eq(w, word))?;
+        Some(positions)
+    }
 }
 
 /// Finds the LCS given a array where the value of `input[i]` indicates that
@@ -338,10 +359,11 @@ fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
 
 /// Finds unchanged word (or token) positions among the ones given as
 /// arguments. The data between those words is ignored.
-fn collect_unchanged_words(
+fn collect_unchanged_words<C: CompareBytes, S: BuildHasher>(
     found_positions: &mut Vec<(WordPosition, WordPosition)>,
     left: &DiffSource,
     right: &DiffSource,
+    comp: &WordComparator<C, S>,
 ) {
     if left.ranges.is_empty() || right.ranges.is_empty() {
         return;
@@ -349,21 +371,21 @@ fn collect_unchanged_words(
 
     // Prioritize LCS-based algorithm than leading/trailing matches
     let old_len = found_positions.len();
-    collect_unchanged_words_lcs(found_positions, left, right);
+    collect_unchanged_words_lcs(found_positions, left, right, comp);
    if found_positions.len() != old_len {
         return;
     }
 
     // Trim leading common ranges (i.e. grow previous unchanged region)
     let common_leading_len = iter::zip(left.ranges, right.ranges)
-        .take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()])
+        .take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()]))
         .count();
     let left_ranges = &left.ranges[common_leading_len..];
     let right_ranges = &right.ranges[common_leading_len..];
 
     // Trim trailing common ranges (i.e. grow next unchanged region)
     let common_trailing_len = iter::zip(left_ranges.iter().rev(), right_ranges.iter().rev())
-        .take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()])
+        .take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()]))
         .count();
 
     found_positions.extend(itertools::chain(
@@ -382,19 +404,20 @@ fn collect_unchanged_words(
     ));
 }
 
-fn collect_unchanged_words_lcs(
+fn collect_unchanged_words_lcs<C: CompareBytes, S: BuildHasher>(
     found_positions: &mut Vec<(WordPosition, WordPosition)>,
     left: &DiffSource,
     right: &DiffSource,
+    comp: &WordComparator<C, S>,
 ) {
     let max_occurrences = 100;
-    let left_histogram = Histogram::calculate(left, max_occurrences);
+    let left_histogram = Histogram::calculate(left, comp, max_occurrences);
     let left_count_to_entries = left_histogram.build_count_to_entries();
     if *left_count_to_entries.keys().next().unwrap() > max_occurrences {
         // If there are very many occurrences of all words, then we just give up.
         return;
     }
-    let right_histogram = Histogram::calculate(right, max_occurrences);
+    let right_histogram = Histogram::calculate(right, comp, max_occurrences);
     // Look for words with few occurrences in `left` (could equally well have picked
     // `right`?). If any of them also occur in `right`, then we add the words to
     // the LCS.
@@ -403,7 +426,7 @@ fn collect_unchanged_words_lcs(
         let mut both_positions = left_entries
             .iter()
             .filter_map(|&(word, left_positions)| {
-                let right_positions = right_histogram.word_to_positions.get(word)?;
+                let right_positions = right_histogram.positions_by_word(word, comp)?;
                 (left_positions.len() == right_positions.len())
                     .then_some((left_positions, right_positions))
             })
@@ -447,6 +470,7 @@ fn collect_unchanged_words_lcs(
             found_positions,
             &left.narrowed(previous_left_position..left_position),
             &right.narrowed(previous_right_position..right_position),
+            comp,
         );
         found_positions.push((
             left.map_to_global(left_position),
@@ -460,6 +484,7 @@ fn collect_unchanged_words_lcs(
         found_positions,
         &left.narrowed(previous_left_position..WordPosition(left.ranges.len())),
         &right.narrowed(previous_right_position..WordPosition(right.ranges.len())),
+        comp,
     );
 }
 
@@ -533,6 +558,7 @@ impl<'input> Diff<'input> {
     pub fn for_tokenizer<T: AsRef<[u8]> + ?Sized + 'input>(
         inputs: impl IntoIterator<Item = &'input T>,
         tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
+        compare: impl CompareBytes,
     ) -> Self {
         let mut inputs = inputs.into_iter().map(BStr::new);
         let base_input = inputs.next().expect("inputs must not be empty");
@@ -541,7 +567,9 @@ impl<'input> Diff<'input> {
         let base_token_ranges: Vec<Range<usize>>;
         let other_token_ranges: Vec<Vec<Range<usize>>>;
         // No need to tokenize if one of the inputs is empty. Non-empty inputs
-        // are all different.
+        // are all different as long as the tokenizer emits non-empty ranges.
+        // This means "" and " " are different even if the compare function is
+        // ignore-whitespace. They are tokenized as [] and [" "] respectively.
         if base_input.is_empty() || other_inputs.iter().any(|input| input.is_empty()) {
             base_token_ranges = vec![];
             other_token_ranges = iter::repeat(vec![]).take(other_inputs.len()).collect();
@@ -557,6 +585,7 @@ impl<'input> Diff<'input> {
             other_inputs,
             &base_token_ranges,
             &other_token_ranges,
+            compare,
         )
     }
 
@@ -565,8 +594,10 @@ impl<'input> Diff<'input> {
         other_inputs: Vec<&'input BStr>,
         base_token_ranges: &[Range<usize>],
         other_token_ranges: &[Vec<Range<usize>>],
+        compare: impl CompareBytes,
     ) -> Self {
         assert_eq!(other_inputs.len(), other_token_ranges.len());
+        let comp = WordComparator::new(compare);
         let base_source = DiffSource::new(base_input, base_token_ranges);
         let other_sources = iter::zip(&other_inputs, other_token_ranges)
             .map(|(input, token_ranges)| DiffSource::new(input, token_ranges))
@@ -585,7 +616,12 @@ impl<'input> Diff<'input> {
             // found ranges with the ranges in the diff.
             [first_other_source, tail_other_sources @ ..] => {
                 let mut first_positions = Vec::new();
-                collect_unchanged_words(&mut first_positions, &base_source, first_other_source);
+                collect_unchanged_words(
+                    &mut first_positions,
+                    &base_source,
+                    first_other_source,
+                    &comp,
+                );
                 if tail_other_sources.is_empty() {
                     first_positions
                         .iter()
@@ -607,7 +643,12 @@ impl<'input> Diff<'input> {
                     first_positions,
                     |current_positions, other_source| {
                         let mut new_positions = Vec::new();
-                        collect_unchanged_words(&mut new_positions, &base_source, other_source);
+                        collect_unchanged_words(
+                            &mut new_positions,
+                            &base_source,
+                            other_source,
+                            &comp,
+                        );
                         intersect_unchanged_words(current_positions, &new_positions)
                     },
                 );
@@ -646,14 +687,14 @@ impl<'input> Diff<'input> {
     pub fn unrefined<T: AsRef<[u8]> + ?Sized + 'input>(
         inputs: impl IntoIterator<Item = &'input T>,
     ) -> Self {
-        Diff::for_tokenizer(inputs, |_| vec![])
+        Diff::for_tokenizer(inputs, |_| vec![], CompareBytesExactly)
     }
 
     /// Compares `inputs` line by line.
     pub fn by_line<T: AsRef<[u8]> + ?Sized + 'input>(
         inputs: impl IntoIterator<Item = &'input T>,
     ) -> Self {
-        Diff::for_tokenizer(inputs, find_line_ranges)
+        Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly)
     }
 
     /// Compares `inputs` word by word.
@@ -663,8 +704,8 @@ impl<'input> Diff<'input> {
     pub fn by_word<T: AsRef<[u8]> + ?Sized + 'input>(
         inputs: impl IntoIterator<Item = &'input T>,
     ) -> Self {
-        let mut diff = Diff::for_tokenizer(inputs, find_word_ranges);
-        diff.refine_changed_regions(find_nonword_ranges);
+        let mut diff = Diff::for_tokenizer(inputs, find_word_ranges, CompareBytesExactly);
+        diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly);
         diff
     }
 
@@ -695,7 +736,11 @@ impl<'input> Diff<'input> {
 
     /// Uses the given tokenizer to split the changed regions into smaller
     /// regions. Then tries to finds unchanged regions among them.
-    pub fn refine_changed_regions(&mut self, tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>) {
+    pub fn refine_changed_regions(
+        &mut self,
+        tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
+        compare: impl CompareBytes,
+    ) {
         let mut previous = UnchangedRange {
             base: 0..0,
             others: vec![0..0; self.other_inputs.len()],
@@ -707,7 +752,7 @@ impl<'input> Diff<'input> {
             // offsets to be valid in the context of the larger Diff instance
             // (`self`).
             let refined_diff =
-                Diff::for_tokenizer(self.hunk_between(&previous, current), &tokenizer);
+                Diff::for_tokenizer(self.hunk_between(&previous, current), &tokenizer, &compare);
             for refined in &refined_diff.unchanged_regions {
                 let new_base_start = refined.base.start + previous.base.end;
                 let new_base_end = refined.base.end + previous.base.end;
@@ -824,9 +869,9 @@ impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> {
 pub fn diff<'a, T: AsRef<[u8]> + ?Sized + 'a>(
     inputs: impl IntoIterator<Item = &'a T>,
 ) -> Vec<DiffHunk<'a>> {
-    let mut diff = Diff::for_tokenizer(inputs, find_line_ranges);
-    diff.refine_changed_regions(find_word_ranges);
-    diff.refine_changed_regions(find_nonword_ranges);
+    let mut diff = Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly);
+    diff.refine_changed_regions(find_word_ranges, CompareBytesExactly);
+    diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly);
     diff.hunks().collect()
 }
 
@@ -1004,8 +1049,9 @@ mod tests {
         left: &DiffSource,
         right: &DiffSource,
     ) -> Vec<(Range<usize>, Range<usize>)> {
+        let comp = WordComparator::new(CompareBytesExactly);
         let mut positions = Vec::new();
-        collect_unchanged_words(&mut positions, left, right);
+        collect_unchanged_words(&mut positions, left, right, &comp);
         positions
             .into_iter()
             .map(|(left_pos, right_pos)| (left.range_at(left_pos), right.range_at(right_pos)))
@@ -1220,6 +1266,7 @@ mod tests {
         let diff = Diff::for_tokenizer(
             ["a\nb\nc\nd\ne\nf\ng", "a\nb\nc\nX\ne\nf\ng"],
             find_line_ranges,
+            CompareBytesExactly,
         );
         assert_eq!(
             diff.hunks().collect_vec(),
@@ -1290,6 +1337,60 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_diff_ignore_all_whitespace() {
+        fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> {
+            let diff =
+                Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreAllWhitespace);
+            diff.hunks().collect()
+        }
+
+        assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]);
+        assert_eq!(diff(["a\n", " a\r\n"]), vec![DiffHunk::matching("a\n")]);
+        assert_eq!(
+            diff(["a\n", " a\nb"]),
+            vec![DiffHunk::matching("a\n"), DiffHunk::different(["", "b"])]
+        );
+
+        // No LCS matches, so trim leading/trailing common lines
+        assert_eq!(
+            diff(["a\nc\n", " a\n a\n"]),
+            vec![
+                DiffHunk::matching("a\n"),
+                DiffHunk::different(["c\n", " a\n"]),
+            ]
+        );
+        assert_eq!(
+            diff(["c\na\n", " a\n a\n"]),
+            vec![
+                DiffHunk::different(["c\n", " a\n"]),
+                DiffHunk::matching("a\n"),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_diff_ignore_whitespace_amount() {
+        fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> {
+            let diff =
+                Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreWhitespaceAmount);
+            diff.hunks().collect()
+        }
+
+        assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]);
+        // whitespace at line end is ignored
+        assert_eq!(diff(["a\n", "a\r\n"]), vec![DiffHunk::matching("a\n")]);
+        // but whitespace at line start isn't
+        assert_eq!(
+            diff(["a\n", " a\n"]),
+            vec![DiffHunk::different(["a\n", " a\n"])]
+        );
+        assert_eq!(
+            diff(["a\n", "a \nb"]),
+            vec![DiffHunk::matching("a\n"), DiffHunk::different(["", "b"])]
+        );
+    }
+
     #[test]
     fn test_diff_real_case_write_fmt() {
         // This is from src/ui.rs in commit f44d246e3f88 in this repo. It highlights the
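
The two tests added above exercise comparators (`CompareBytesIgnoreAllWhitespace`, `CompareBytesIgnoreWhitespaceAmount`) whose notion of equality differs from byte equality, which is exactly why `Histogram` can no longer key a plain `HashMap` on the word bytes. Below is a simplified sketch of the contract such a comparator must satisfy; the real `CompareBytes` trait lives earlier in `lib/src/diff.rs`, and this standalone version with its `main` is illustrative only.

```rust
use std::collections::hash_map::RandomState;
use std::hash::{BuildHasher, Hasher};

// Simplified stand-in for the CompareBytes trait in lib/src/diff.rs.
trait CompareBytes {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool;
    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H);
}

// The bytes this comparator considers significant.
fn significant(text: &[u8]) -> impl Iterator<Item = u8> + '_ {
    text.iter().copied().filter(|b| !b.is_ascii_whitespace())
}

struct IgnoreAllWhitespace;

impl CompareBytes for IgnoreAllWhitespace {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        significant(left).eq(significant(right))
    }

    // hash() must be consistent with eq(): bytes that eq() ignores must not
    // feed the hasher either.
    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
        for b in significant(text) {
            state.write_u8(b);
        }
    }
}

fn main() {
    let comp = IgnoreAllWhitespace;
    let hasher = RandomState::new();
    let hash_of = |text: &[u8]| {
        let mut state = hasher.build_hasher();
        comp.hash(text, &mut state);
        state.finish()
    };
    // "a b" and "ab" are equal under this comparator and hash identically,
    // a pairing that HashMap keyed on the raw bytes could not express.
    assert!(comp.eq(b"a b", b"ab"));
    assert_eq!(hash_of(b"a b"), hash_of(b"ab"));
}
```

Keeping `hash` consistent with `eq` is the invariant `WordComparator` relies on when it hands both closures to the `HashTable` in `Histogram::calculate` and `positions_by_word`.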