From c5f926103afc1254f3ef175c247dd04223546d31 Mon Sep 17 00:00:00 2001
From: Yuya Nishihara <yuya@tcha.org>
Date: Thu, 26 Sep 2024 21:12:42 +0900
Subject: [PATCH] diff: use low-level HashTable in Histogram
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change made some diff benches slow, maybe because the generated code
becomes slightly worse due to the added abstraction? I'll revisit the
performance problem later. There are a couple of ways to mitigate it.

```
group                             new                     old
-----                             ---                     ---
bench_diff_git_git_read_tree_c    1.02     61.0±0.23µs    1.00     59.7±0.38µs
bench_diff_lines/modified/10k     1.00     41.6±0.24ms    1.02     42.3±0.22ms
bench_diff_lines/modified/1k      1.00      3.8±0.07ms    1.00      3.8±0.03ms
bench_diff_lines/reversed/10k     1.29     23.4±0.20ms    1.00     18.2±0.26ms
bench_diff_lines/reversed/1k      1.05    517.2±5.55µs    1.00   493.7±59.72µs
bench_diff_lines/unchanged/10k    1.00      3.9±0.10ms    1.08      4.2±0.10ms
bench_diff_lines/unchanged/1k     1.01    356.8±2.33µs    1.00    353.7±1.99µs
```
(I don't get stable results on my noisy machine, so the results would vary.)
---
 lib/src/diff.rs | 163 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 132 insertions(+), 31 deletions(-)
diff --git a/lib/src/diff.rs b/lib/src/diff.rs
index 9bc9ee69a..95d2c4f11 100644
--- a/lib/src/diff.rs
+++ b/lib/src/diff.rs
@@ -16,7 +16,6 @@
 
 use std::cmp::Ordering;
 use std::collections::BTreeMap;
-use std::collections::HashMap;
 use std::hash::BuildHasher;
 use std::hash::Hash;
 use std::hash::Hasher;
@@ -26,6 +25,7 @@ use std::ops::Range;
 use std::slice;
 
 use bstr::BStr;
+use hashbrown::HashTable;
 use itertools::Itertools;
 
 pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
@@ -189,7 +189,6 @@ struct WordComparator<C, S> {
     hash_builder: S,
 }
 
-#[allow(unused)] // TODO
 impl<C: CompareBytes> WordComparator<C, RandomState> {
     fn new(compare: C) -> Self {
         WordComparator {
@@ -200,7 +199,6 @@ impl<C: CompareBytes> WordComparator<C, RandomState> {
     }
 }
 
-#[allow(unused)] // TODO
 impl<C: CompareBytes, S: BuildHasher> WordComparator<C, S> {
     fn eq(&self, left: &[u8], right: &[u8]) -> bool {
         self.compare.eq(left, right)
@@ -252,15 +250,25 @@ impl<'input, 'aux> DiffSource<'input, 'aux> {
 }
 
 struct Histogram<'input> {
-    word_to_positions: HashMap<&'input BStr, Vec<WordPosition>>,
+    word_to_positions: HashTable<HistogramEntry<'input>>,
 }
 
+type HistogramEntry<'input> = (&'input BStr, Vec<WordPosition>);
+
 impl<'input> Histogram<'input> {
-    fn calculate(source: &DiffSource<'input, '_>, max_occurrences: usize) -> Self {
-        let mut word_to_positions: HashMap<&BStr, Vec<WordPosition>> = HashMap::new();
+    fn calculate<C: CompareBytes, S: BuildHasher>(
+        source: &DiffSource<'input, '_>,
+        comp: &WordComparator<C, S>,
+        max_occurrences: usize,
+    ) -> Self {
+        let mut word_to_positions: HashTable<HistogramEntry> = HashTable::new();
         for (i, range) in source.ranges.iter().enumerate() {
             let word = &source.text[range.clone()];
-            let positions = word_to_positions.entry(word).or_default();
+            let hash = comp.hash_one(word);
+            let (_, positions) = word_to_positions
+                .entry(hash, |(w, _)| comp.eq(w, word), |(w, _)| comp.hash_one(w))
+                .or_insert_with(|| (word, vec![]))
+                .into_mut();
             // Allow one more than max_occurrences, so we can later skip those with more
             // than max_occurrences
             if positions.len() <= max_occurrences {
@@ -270,14 +278,27 @@ impl<'input> Histogram<'input> {
         Histogram { word_to_positions }
     }
 
-    fn build_count_to_entries(&self) -> BTreeMap<usize, Vec<(&'input BStr, &Vec<WordPosition>)>> {
+    fn build_count_to_entries(&self) -> BTreeMap<usize, Vec<&HistogramEntry<'input>>> {
         let mut count_to_entries: BTreeMap<usize, Vec<_>> = BTreeMap::new();
-        for (word, positions) in &self.word_to_positions {
+        for entry in &self.word_to_positions {
+            let (_, positions) = entry;
             let entries = count_to_entries.entry(positions.len()).or_default();
-            entries.push((*word, positions));
+            entries.push(entry);
         }
         count_to_entries
     }
+
+    fn positions_by_word<C: CompareBytes, S: BuildHasher>(
+        &self,
+        word: &BStr,
+        comp: &WordComparator<C, S>,
+    ) -> Option<&[WordPosition]> {
+        let hash = comp.hash_one(word);
+        let (_, positions) = self
+            .word_to_positions
+            .find(hash, |(w, _)| comp.eq(w, word))?;
+        Some(positions)
+    }
 }
 
 /// Finds the LCS given a array where the value of `input[i]` indicates that
@@ -338,10 +359,11 @@ fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
 
 /// Finds unchanged word (or token) positions among the ones given as
 /// arguments. The data between those words is ignored.
-fn collect_unchanged_words(
+fn collect_unchanged_words<C: CompareBytes, S: BuildHasher>(
     found_positions: &mut Vec<(WordPosition, WordPosition)>,
     left: &DiffSource,
     right: &DiffSource,
+    comp: &WordComparator<C, S>,
 ) {
     if left.ranges.is_empty() || right.ranges.is_empty() {
         return;
@@ -349,21 +371,21 @@ fn collect_unchanged_words(
 
     // Prioritize LCS-based algorithm than leading/trailing matches
     let old_len = found_positions.len();
-    collect_unchanged_words_lcs(found_positions, left, right);
+    collect_unchanged_words_lcs(found_positions, left, right, comp);
     if found_positions.len() != old_len {
         return;
     }
 
     // Trim leading common ranges (i.e. grow previous unchanged region)
     let common_leading_len = iter::zip(left.ranges, right.ranges)
-        .take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()])
+        .take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()]))
         .count();
     let left_ranges = &left.ranges[common_leading_len..];
     let right_ranges = &right.ranges[common_leading_len..];
 
     // Trim trailing common ranges (i.e. grow next unchanged region)
     let common_trailing_len = iter::zip(left_ranges.iter().rev(), right_ranges.iter().rev())
-        .take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()])
+        .take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()]))
         .count();
 
     found_positions.extend(itertools::chain(
@@ -382,19 +404,20 @@ fn collect_unchanged_words(
     ));
 }
 
-fn collect_unchanged_words_lcs(
+fn collect_unchanged_words_lcs<C: CompareBytes, S: BuildHasher>(
     found_positions: &mut Vec<(WordPosition, WordPosition)>,
     left: &DiffSource,
     right: &DiffSource,
+    comp: &WordComparator<C, S>,
 ) {
     let max_occurrences = 100;
-    let left_histogram = Histogram::calculate(left, max_occurrences);
+    let left_histogram = Histogram::calculate(left, comp, max_occurrences);
     let left_count_to_entries = left_histogram.build_count_to_entries();
     if *left_count_to_entries.keys().next().unwrap() > max_occurrences {
         // If there are very many occurrences of all words, then we just give up.
         return;
     }
-    let right_histogram = Histogram::calculate(right, max_occurrences);
+    let right_histogram = Histogram::calculate(right, comp, max_occurrences);
     // Look for words with few occurrences in `left` (could equally well have picked
     // `right`?). If any of them also occur in `right`, then we add the words to
     // the LCS.
@@ -403,7 +426,7 @@ fn collect_unchanged_words_lcs(
             let mut both_positions = left_entries
                 .iter()
                 .filter_map(|&(word, left_positions)| {
-                    let right_positions = right_histogram.word_to_positions.get(word)?;
+                    let right_positions = right_histogram.positions_by_word(word, comp)?;
                     (left_positions.len() == right_positions.len())
                         .then_some((left_positions, right_positions))
                 })
@@ -447,6 +470,7 @@ fn collect_unchanged_words_lcs(
             found_positions,
             &left.narrowed(previous_left_position..left_position),
             &right.narrowed(previous_right_position..right_position),
+            comp,
         );
         found_positions.push((
             left.map_to_global(left_position),
@@ -460,6 +484,7 @@ fn collect_unchanged_words_lcs(
         found_positions,
         &left.narrowed(previous_left_position..WordPosition(left.ranges.len())),
         &right.narrowed(previous_right_position..WordPosition(right.ranges.len())),
+        comp,
     );
 }
 
@@ -533,6 +558,7 @@ impl<'input> Diff<'input> {
     pub fn for_tokenizer<T: AsRef<[u8]> + ?Sized + 'input>(
         inputs: impl IntoIterator<Item = &'input T>,
         tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
+        compare: impl CompareBytes,
     ) -> Self {
         let mut inputs = inputs.into_iter().map(BStr::new);
         let base_input = inputs.next().expect("inputs must not be empty");
@@ -541,7 +567,9 @@ impl<'input> Diff<'input> {
         let base_token_ranges: Vec<Range<usize>>;
         let other_token_ranges: Vec<Vec<Range<usize>>>;
         // No need to tokenize if one of the inputs is empty. Non-empty inputs
-        // are all different.
+        // are all different as long as the tokenizer emits non-empty ranges.
+        // This means "" and " " are different even if the compare function is
+        // ignore-whitespace. They are tokenized as [] and [" "] respectively.
         if base_input.is_empty() || other_inputs.iter().any(|input| input.is_empty()) {
             base_token_ranges = vec![];
             other_token_ranges = iter::repeat(vec![]).take(other_inputs.len()).collect();
@@ -557,6 +585,7 @@ impl<'input> Diff<'input> {
             other_inputs,
             &base_token_ranges,
             &other_token_ranges,
+            compare,
         )
     }
 
@@ -565,8 +594,10 @@ impl<'input> Diff<'input> {
         other_inputs: Vec<&'input BStr>,
         base_token_ranges: &[Range<usize>],
         other_token_ranges: &[Vec<Range<usize>>],
+        compare: impl CompareBytes,
     ) -> Self {
         assert_eq!(other_inputs.len(), other_token_ranges.len());
+        let comp = WordComparator::new(compare);
         let base_source = DiffSource::new(base_input, base_token_ranges);
         let other_sources = iter::zip(&other_inputs, other_token_ranges)
             .map(|(input, token_ranges)| DiffSource::new(input, token_ranges))
@@ -585,7 +616,12 @@ impl<'input> Diff<'input> {
             // found ranges with the ranges in the diff.
             [first_other_source, tail_other_sources @ ..] => {
                 let mut first_positions = Vec::new();
-                collect_unchanged_words(&mut first_positions, &base_source, first_other_source);
+                collect_unchanged_words(
+                    &mut first_positions,
+                    &base_source,
+                    first_other_source,
+                    &comp,
+                );
                 if tail_other_sources.is_empty() {
                     first_positions
                         .iter()
@@ -607,7 +643,12 @@ impl<'input> Diff<'input> {
                         first_positions,
                         |current_positions, other_source| {
                             let mut new_positions = Vec::new();
-                            collect_unchanged_words(&mut new_positions, &base_source, other_source);
+                            collect_unchanged_words(
+                                &mut new_positions,
+                                &base_source,
+                                other_source,
+                                &comp,
+                            );
                             intersect_unchanged_words(current_positions, &new_positions)
                         },
                     );
@@ -646,14 +687,14 @@ impl<'input> Diff<'input> {
     pub fn unrefined<T: AsRef<[u8]> + ?Sized + 'input>(
         inputs: impl IntoIterator<Item = &'input T>,
     ) -> Self {
-        Diff::for_tokenizer(inputs, |_| vec![])
+        Diff::for_tokenizer(inputs, |_| vec![], CompareBytesExactly)
     }
 
     /// Compares `inputs` line by line.
     pub fn by_line<T: AsRef<[u8]> + ?Sized + 'input>(
         inputs: impl IntoIterator<Item = &'input T>,
     ) -> Self {
-        Diff::for_tokenizer(inputs, find_line_ranges)
+        Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly)
     }
 
     /// Compares `inputs` word by word.
@@ -663,8 +704,8 @@ impl<'input> Diff<'input> {
     pub fn by_word<T: AsRef<[u8]> + ?Sized + 'input>(
         inputs: impl IntoIterator<Item = &'input T>,
     ) -> Self {
-        let mut diff = Diff::for_tokenizer(inputs, find_word_ranges);
-        diff.refine_changed_regions(find_nonword_ranges);
+        let mut diff = Diff::for_tokenizer(inputs, find_word_ranges, CompareBytesExactly);
+        diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly);
         diff
     }
 
@@ -695,7 +736,11 @@ impl<'input> Diff<'input> {
 
     /// Uses the given tokenizer to split the changed regions into smaller
     /// regions. Then tries to finds unchanged regions among them.
-    pub fn refine_changed_regions(&mut self, tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>) {
+    pub fn refine_changed_regions(
+        &mut self,
+        tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
+        compare: impl CompareBytes,
+    ) {
         let mut previous = UnchangedRange {
             base: 0..0,
             others: vec![0..0; self.other_inputs.len()],
@@ -707,7 +752,7 @@ impl<'input> Diff<'input> {
             // offsets to be valid in the context of the larger Diff instance
             // (`self`).
             let refined_diff =
-                Diff::for_tokenizer(self.hunk_between(&previous, current), &tokenizer);
+                Diff::for_tokenizer(self.hunk_between(&previous, current), &tokenizer, &compare);
             for refined in &refined_diff.unchanged_regions {
                 let new_base_start = refined.base.start + previous.base.end;
                 let new_base_end = refined.base.end + previous.base.end;
@@ -824,9 +869,9 @@ impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> {
 pub fn diff<'a, T: AsRef<[u8]> + ?Sized + 'a>(
     inputs: impl IntoIterator<Item = &'a T>,
 ) -> Vec<DiffHunk<'a>> {
-    let mut diff = Diff::for_tokenizer(inputs, find_line_ranges);
-    diff.refine_changed_regions(find_word_ranges);
-    diff.refine_changed_regions(find_nonword_ranges);
+    let mut diff = Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly);
+    diff.refine_changed_regions(find_word_ranges, CompareBytesExactly);
+    diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly);
     diff.hunks().collect()
 }
 
@@ -1004,8 +1049,9 @@ mod tests {
         left: &DiffSource,
         right: &DiffSource,
     ) -> Vec<(Range<usize>, Range<usize>)> {
+        let comp = WordComparator::new(CompareBytesExactly);
         let mut positions = Vec::new();
-        collect_unchanged_words(&mut positions, left, right);
+        collect_unchanged_words(&mut positions, left, right, &comp);
         positions
             .into_iter()
             .map(|(left_pos, right_pos)| (left.range_at(left_pos), right.range_at(right_pos)))
@@ -1220,6 +1266,7 @@ mod tests {
         let diff = Diff::for_tokenizer(
             ["a\nb\nc\nd\ne\nf\ng", "a\nb\nc\nX\ne\nf\ng"],
             find_line_ranges,
+            CompareBytesExactly,
         );
         assert_eq!(
             diff.hunks().collect_vec(),
@@ -1290,6 +1337,60 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_diff_ignore_all_whitespace() {
+        fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> {
+            let diff =
+                Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreAllWhitespace);
+            diff.hunks().collect()
+        }
+
+        assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]);
+        assert_eq!(diff(["a\n", " a\r\n"]), vec![DiffHunk::matching("a\n")]);
+        assert_eq!(
+            diff(["a\n", " a\nb"]),
+            vec![DiffHunk::matching("a\n"), DiffHunk::different(["", "b"])]
+        );
+
+        // No LCS matches, so trim leading/trailing common lines
+        assert_eq!(
+            diff(["a\nc\n", " a\n a\n"]),
+            vec![
+                DiffHunk::matching("a\n"),
+                DiffHunk::different(["c\n", " a\n"]),
+            ]
+        );
+        assert_eq!(
+            diff(["c\na\n", " a\n a\n"]),
+            vec![
+                DiffHunk::different(["c\n", " a\n"]),
+                DiffHunk::matching("a\n"),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_diff_ignore_whitespace_amount() {
+        fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> {
+            let diff =
+                Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreWhitespaceAmount);
+            diff.hunks().collect()
+        }
+
+        assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]);
+        // whitespace at line end is ignored
+        assert_eq!(diff(["a\n", "a\r\n"]), vec![DiffHunk::matching("a\n")]);
+        // but whitespace at line start isn't
+        assert_eq!(
+            diff(["a\n", " a\n"]),
+            vec![DiffHunk::different(["a\n", " a\n"])]
+        );
+        assert_eq!(
+            diff(["a\n", "a \nb"]),
+            vec![DiffHunk::matching("a\n"), DiffHunk::different(["", "b"])]
+        );
+    }
+
     #[test]
     fn test_diff_real_case_write_fmt() {
         // This is from src/ui.rs in commit f44d246e3f88 in this repo. It highlights the