From c5f926103afc1254f3ef175c247dd04223546d31 Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Thu, 26 Sep 2024 21:12:42 +0900 Subject: [PATCH] diff: use low-level HashTable in Histogram MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change made some diff benches slow, maybe because the generated code becomes slightly worse due to the added abstraction? I'll revisit the performance problem later. There are a couple of ways to mitigate it. ``` group new old ----- --- --- bench_diff_git_git_read_tree_c 1.02 61.0±0.23µs 1.00 59.7±0.38µs bench_diff_lines/modified/10k 1.00 41.6±0.24ms 1.02 42.3±0.22ms bench_diff_lines/modified/1k 1.00 3.8±0.07ms 1.00 3.8±0.03ms bench_diff_lines/reversed/10k 1.29 23.4±0.20ms 1.00 18.2±0.26ms bench_diff_lines/reversed/1k 1.05 517.2±5.55µs 1.00 493.7±59.72µs bench_diff_lines/unchanged/10k 1.00 3.9±0.10ms 1.08 4.2±0.10ms bench_diff_lines/unchanged/1k 1.01 356.8±2.33µs 1.00 353.7±1.99µs ``` (I don't get stable results on my noisy machine, so the results would vary.) --- lib/src/diff.rs | 163 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 132 insertions(+), 31 deletions(-) diff --git a/lib/src/diff.rs b/lib/src/diff.rs index 9bc9ee69a..95d2c4f11 100644 --- a/lib/src/diff.rs +++ b/lib/src/diff.rs @@ -16,7 +16,6 @@ use std::cmp::Ordering; use std::collections::BTreeMap; -use std::collections::HashMap; use std::hash::BuildHasher; use std::hash::Hash; use std::hash::Hasher; @@ -26,6 +25,7 @@ use std::ops::Range; use std::slice; use bstr::BStr; +use hashbrown::HashTable; use itertools::Itertools; pub fn find_line_ranges(text: &[u8]) -> Vec> { @@ -189,7 +189,6 @@ struct WordComparator { hash_builder: S, } -#[allow(unused)] // TODO impl WordComparator { fn new(compare: C) -> Self { WordComparator { @@ -200,7 +199,6 @@ impl WordComparator { } } -#[allow(unused)] // TODO impl WordComparator { fn eq(&self, left: &[u8], right: &[u8]) -> bool { self.compare.eq(left, right) @@ -252,15 +250,25 @@ impl<'input, 'aux> DiffSource<'input, 'aux> { } struct Histogram<'input> { - word_to_positions: HashMap<&'input BStr, Vec>, + word_to_positions: HashTable>, } +type HistogramEntry<'input> = (&'input BStr, Vec); + impl<'input> Histogram<'input> { - fn calculate(source: &DiffSource<'input, '_>, max_occurrences: usize) -> Self { - let mut word_to_positions: HashMap<&BStr, Vec> = HashMap::new(); + fn calculate( + source: &DiffSource<'input, '_>, + comp: &WordComparator, + max_occurrences: usize, + ) -> Self { + let mut word_to_positions: HashTable = HashTable::new(); for (i, range) in source.ranges.iter().enumerate() { let word = &source.text[range.clone()]; - let positions = word_to_positions.entry(word).or_default(); + let hash = comp.hash_one(word); + let (_, positions) = word_to_positions + .entry(hash, |(w, _)| comp.eq(w, word), |(w, _)| comp.hash_one(w)) + .or_insert_with(|| (word, vec![])) + .into_mut(); // Allow one more than max_occurrences, so we can later skip those with more // than max_occurrences if positions.len() <= max_occurrences { @@ -270,14 +278,27 @@ impl<'input> Histogram<'input> { Histogram { word_to_positions } } - fn build_count_to_entries(&self) -> BTreeMap)>> { + fn build_count_to_entries(&self) -> BTreeMap>> { let mut count_to_entries: BTreeMap> = BTreeMap::new(); - for (word, positions) in &self.word_to_positions { + for entry in &self.word_to_positions { + let (_, positions) = entry; let entries = count_to_entries.entry(positions.len()).or_default(); - entries.push((*word, positions)); + entries.push(entry); } count_to_entries } + + fn positions_by_word( + &self, + word: &BStr, + comp: &WordComparator, + ) -> Option<&[WordPosition]> { + let hash = comp.hash_one(word); + let (_, positions) = self + .word_to_positions + .find(hash, |(w, _)| comp.eq(w, word))?; + Some(positions) + } } /// Finds the LCS given a array where the value of `input[i]` indicates that @@ -338,10 +359,11 @@ fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> { /// Finds unchanged word (or token) positions among the ones given as /// arguments. The data between those words is ignored. -fn collect_unchanged_words( +fn collect_unchanged_words( found_positions: &mut Vec<(WordPosition, WordPosition)>, left: &DiffSource, right: &DiffSource, + comp: &WordComparator, ) { if left.ranges.is_empty() || right.ranges.is_empty() { return; @@ -349,21 +371,21 @@ fn collect_unchanged_words( // Prioritize LCS-based algorithm than leading/trailing matches let old_len = found_positions.len(); - collect_unchanged_words_lcs(found_positions, left, right); + collect_unchanged_words_lcs(found_positions, left, right, comp); if found_positions.len() != old_len { return; } // Trim leading common ranges (i.e. grow previous unchanged region) let common_leading_len = iter::zip(left.ranges, right.ranges) - .take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()]) + .take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()])) .count(); let left_ranges = &left.ranges[common_leading_len..]; let right_ranges = &right.ranges[common_leading_len..]; // Trim trailing common ranges (i.e. grow next unchanged region) let common_trailing_len = iter::zip(left_ranges.iter().rev(), right_ranges.iter().rev()) - .take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()]) + .take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()])) .count(); found_positions.extend(itertools::chain( @@ -382,19 +404,20 @@ fn collect_unchanged_words( )); } -fn collect_unchanged_words_lcs( +fn collect_unchanged_words_lcs( found_positions: &mut Vec<(WordPosition, WordPosition)>, left: &DiffSource, right: &DiffSource, + comp: &WordComparator, ) { let max_occurrences = 100; - let left_histogram = Histogram::calculate(left, max_occurrences); + let left_histogram = Histogram::calculate(left, comp, max_occurrences); let left_count_to_entries = left_histogram.build_count_to_entries(); if *left_count_to_entries.keys().next().unwrap() > max_occurrences { // If there are very many occurrences of all words, then we just give up. return; } - let right_histogram = Histogram::calculate(right, max_occurrences); + let right_histogram = Histogram::calculate(right, comp, max_occurrences); // Look for words with few occurrences in `left` (could equally well have picked // `right`?). If any of them also occur in `right`, then we add the words to // the LCS. @@ -403,7 +426,7 @@ fn collect_unchanged_words_lcs( let mut both_positions = left_entries .iter() .filter_map(|&(word, left_positions)| { - let right_positions = right_histogram.word_to_positions.get(word)?; + let right_positions = right_histogram.positions_by_word(word, comp)?; (left_positions.len() == right_positions.len()) .then_some((left_positions, right_positions)) }) @@ -447,6 +470,7 @@ fn collect_unchanged_words_lcs( found_positions, &left.narrowed(previous_left_position..left_position), &right.narrowed(previous_right_position..right_position), + comp, ); found_positions.push(( left.map_to_global(left_position), @@ -460,6 +484,7 @@ fn collect_unchanged_words_lcs( found_positions, &left.narrowed(previous_left_position..WordPosition(left.ranges.len())), &right.narrowed(previous_right_position..WordPosition(right.ranges.len())), + comp, ); } @@ -533,6 +558,7 @@ impl<'input> Diff<'input> { pub fn for_tokenizer + ?Sized + 'input>( inputs: impl IntoIterator, tokenizer: impl Fn(&[u8]) -> Vec>, + compare: impl CompareBytes, ) -> Self { let mut inputs = inputs.into_iter().map(BStr::new); let base_input = inputs.next().expect("inputs must not be empty"); @@ -541,7 +567,9 @@ impl<'input> Diff<'input> { let base_token_ranges: Vec>; let other_token_ranges: Vec>>; // No need to tokenize if one of the inputs is empty. Non-empty inputs - // are all different. + // are all different as long as the tokenizer emits non-empty ranges. + // This means "" and " " are different even if the compare function is + // ignore-whitespace. They are tokenized as [] and [" "] respectively. if base_input.is_empty() || other_inputs.iter().any(|input| input.is_empty()) { base_token_ranges = vec![]; other_token_ranges = iter::repeat(vec![]).take(other_inputs.len()).collect(); @@ -557,6 +585,7 @@ impl<'input> Diff<'input> { other_inputs, &base_token_ranges, &other_token_ranges, + compare, ) } @@ -565,8 +594,10 @@ impl<'input> Diff<'input> { other_inputs: Vec<&'input BStr>, base_token_ranges: &[Range], other_token_ranges: &[Vec>], + compare: impl CompareBytes, ) -> Self { assert_eq!(other_inputs.len(), other_token_ranges.len()); + let comp = WordComparator::new(compare); let base_source = DiffSource::new(base_input, base_token_ranges); let other_sources = iter::zip(&other_inputs, other_token_ranges) .map(|(input, token_ranges)| DiffSource::new(input, token_ranges)) @@ -585,7 +616,12 @@ impl<'input> Diff<'input> { // found ranges with the ranges in the diff. [first_other_source, tail_other_sources @ ..] => { let mut first_positions = Vec::new(); - collect_unchanged_words(&mut first_positions, &base_source, first_other_source); + collect_unchanged_words( + &mut first_positions, + &base_source, + first_other_source, + &comp, + ); if tail_other_sources.is_empty() { first_positions .iter() @@ -607,7 +643,12 @@ impl<'input> Diff<'input> { first_positions, |current_positions, other_source| { let mut new_positions = Vec::new(); - collect_unchanged_words(&mut new_positions, &base_source, other_source); + collect_unchanged_words( + &mut new_positions, + &base_source, + other_source, + &comp, + ); intersect_unchanged_words(current_positions, &new_positions) }, ); @@ -646,14 +687,14 @@ impl<'input> Diff<'input> { pub fn unrefined + ?Sized + 'input>( inputs: impl IntoIterator, ) -> Self { - Diff::for_tokenizer(inputs, |_| vec![]) + Diff::for_tokenizer(inputs, |_| vec![], CompareBytesExactly) } /// Compares `inputs` line by line. pub fn by_line + ?Sized + 'input>( inputs: impl IntoIterator, ) -> Self { - Diff::for_tokenizer(inputs, find_line_ranges) + Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly) } /// Compares `inputs` word by word. @@ -663,8 +704,8 @@ impl<'input> Diff<'input> { pub fn by_word + ?Sized + 'input>( inputs: impl IntoIterator, ) -> Self { - let mut diff = Diff::for_tokenizer(inputs, find_word_ranges); - diff.refine_changed_regions(find_nonword_ranges); + let mut diff = Diff::for_tokenizer(inputs, find_word_ranges, CompareBytesExactly); + diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly); diff } @@ -695,7 +736,11 @@ impl<'input> Diff<'input> { /// Uses the given tokenizer to split the changed regions into smaller /// regions. Then tries to finds unchanged regions among them. - pub fn refine_changed_regions(&mut self, tokenizer: impl Fn(&[u8]) -> Vec>) { + pub fn refine_changed_regions( + &mut self, + tokenizer: impl Fn(&[u8]) -> Vec>, + compare: impl CompareBytes, + ) { let mut previous = UnchangedRange { base: 0..0, others: vec![0..0; self.other_inputs.len()], @@ -707,7 +752,7 @@ impl<'input> Diff<'input> { // offsets to be valid in the context of the larger Diff instance // (`self`). let refined_diff = - Diff::for_tokenizer(self.hunk_between(&previous, current), &tokenizer); + Diff::for_tokenizer(self.hunk_between(&previous, current), &tokenizer, &compare); for refined in &refined_diff.unchanged_regions { let new_base_start = refined.base.start + previous.base.end; let new_base_end = refined.base.end + previous.base.end; @@ -824,9 +869,9 @@ impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> { pub fn diff<'a, T: AsRef<[u8]> + ?Sized + 'a>( inputs: impl IntoIterator, ) -> Vec> { - let mut diff = Diff::for_tokenizer(inputs, find_line_ranges); - diff.refine_changed_regions(find_word_ranges); - diff.refine_changed_regions(find_nonword_ranges); + let mut diff = Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly); + diff.refine_changed_regions(find_word_ranges, CompareBytesExactly); + diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly); diff.hunks().collect() } @@ -1004,8 +1049,9 @@ mod tests { left: &DiffSource, right: &DiffSource, ) -> Vec<(Range, Range)> { + let comp = WordComparator::new(CompareBytesExactly); let mut positions = Vec::new(); - collect_unchanged_words(&mut positions, left, right); + collect_unchanged_words(&mut positions, left, right, &comp); positions .into_iter() .map(|(left_pos, right_pos)| (left.range_at(left_pos), right.range_at(right_pos))) @@ -1220,6 +1266,7 @@ mod tests { let diff = Diff::for_tokenizer( ["a\nb\nc\nd\ne\nf\ng", "a\nb\nc\nX\ne\nf\ng"], find_line_ranges, + CompareBytesExactly, ); assert_eq!( diff.hunks().collect_vec(), @@ -1290,6 +1337,60 @@ mod tests { ); } + #[test] + fn test_diff_ignore_all_whitespace() { + fn diff(inputs: [&str; 2]) -> Vec> { + let diff = + Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreAllWhitespace); + diff.hunks().collect() + } + + assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]); + assert_eq!(diff(["a\n", " a\r\n"]), vec![DiffHunk::matching("a\n")]); + assert_eq!( + diff(["a\n", " a\nb"]), + vec![DiffHunk::matching("a\n"), DiffHunk::different(["", "b"])] + ); + + // No LCS matches, so trim leading/trailing common lines + assert_eq!( + diff(["a\nc\n", " a\n a\n"]), + vec![ + DiffHunk::matching("a\n"), + DiffHunk::different(["c\n", " a\n"]), + ] + ); + assert_eq!( + diff(["c\na\n", " a\n a\n"]), + vec![ + DiffHunk::different(["c\n", " a\n"]), + DiffHunk::matching("a\n"), + ] + ); + } + + #[test] + fn test_diff_ignore_whitespace_amount() { + fn diff(inputs: [&str; 2]) -> Vec> { + let diff = + Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreWhitespaceAmount); + diff.hunks().collect() + } + + assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]); + // whitespace at line end is ignored + assert_eq!(diff(["a\n", "a\r\n"]), vec![DiffHunk::matching("a\n")]); + // but whitespace at line start isn't + assert_eq!( + diff(["a\n", " a\n"]), + vec![DiffHunk::different(["a\n", " a\n"])] + ); + assert_eq!( + diff(["a\n", "a \nb"]), + vec![DiffHunk::matching("a\n"), DiffHunk::different(["", "b"])] + ); + } + #[test] fn test_diff_real_case_write_fmt() { // This is from src/ui.rs in commit f44d246e3f88 in this repo. It highlights the