diff: remove redundant hash map lookup of uncommon shared words

2025-02-01 09:08:51 +00:00 · 2024-09-27 11:54:02 +09:00 · 2024-09-27 11:54:02 +09:00 · 64e1ae277d
commit 64e1ae277d
parent 5c52b4ec13
1 changed files with 23 additions and 31 deletions
--- a/lib/src/diff.rs
+++ b/lib/src/diff.rs
@ -122,12 +122,13 @@ impl<'input> Histogram<'input> {
        Histogram { word_to_positions }
    }
-    fn build_count_to_words(&self) -> BTreeMap<usize, Vec<&'input BStr>> {
+    fn build_count_to_entries(&self) -> BTreeMap<usize, Vec<(&'input BStr, &Vec<WordPosition>)>> {
-        let mut count_to_words: BTreeMap<usize, Vec<&BStr>> = BTreeMap::new();
+        let mut count_to_entries: BTreeMap<usize, Vec<_>> = BTreeMap::new();
-        for (word, ranges) in &self.word_to_positions {
+        for (word, positions) in &self.word_to_positions {
-            count_to_words.entry(ranges.len()).or_default().push(word);
+            let entries = count_to_entries.entry(positions.len()).or_default();
            entries.push((*word, positions));
        }
-        count_to_words
+        count_to_entries
    }
 }
@ -233,8 +234,8 @@ fn unchanged_ranges_lcs(
 ) -> Vec<(Range<usize>, Range<usize>)> {
    let max_occurrences = 100;
    let left_histogram = Histogram::calculate(left, max_occurrences);
-    let left_count_to_words = left_histogram.build_count_to_words();
+    let left_count_to_entries = left_histogram.build_count_to_entries();
-    if *left_count_to_words.keys().next().unwrap() > max_occurrences {
+    if *left_count_to_entries.keys().next().unwrap() > max_occurrences {
        // If there are very many occurrences of all words, then we just give up.
        return vec![];
    }
@ -242,38 +243,29 @@ fn unchanged_ranges_lcs(
    // Look for words with few occurrences in `left` (could equally well have picked
    // `right`?). If any of them also occur in `right`, then we add the words to
    // the LCS.
-    let Some(uncommon_shared_words) = left_count_to_words
+    let Some(uncommon_shared_word_positions) =
-        .iter()
+        left_count_to_entries.values().find_map(|left_entries| {
-        .map(|(left_count, left_words)| -> Vec<&BStr> {
+            let mut both_positions = left_entries
            left_words
                .iter()
-                .copied()
+                .filter_map(|&(word, left_positions)| {
-                .filter(|left_word| {
+                    let right_positions = right_histogram.word_to_positions.get(word)?;
-                    let right_count = right_histogram
+                    (left_positions.len() == right_positions.len())
-                        .word_to_positions
+                        .then_some((left_positions, right_positions))
                        .get(left_word)
                        .map_or(0, |right_positions| right_positions.len());
                    *left_count == right_count
                })
-                .collect()
+                .peekable();
            both_positions.peek().is_some().then_some(both_positions)
        })
        .find(|words| !words.is_empty())
    else {
        return vec![];
    };
    // [(index into ranges, serial to identify {word, occurrence #})]
-    let (mut left_positions, mut right_positions): (Vec<_>, Vec<_>) = uncommon_shared_words
+    let (mut left_positions, mut right_positions): (Vec<_>, Vec<_>) =
-        .iter()
+        uncommon_shared_word_positions
-        .flat_map(|word| {
+            .flat_map(|(lefts, rights)| iter::zip(lefts, rights))
-            let left_occurrences = &left_histogram.word_to_positions[word];
+            .enumerate()
-            let right_occurrences = &right_histogram.word_to_positions[word];
+            .map(|(serial, (&left_pos, &right_pos))| ((left_pos, serial), (right_pos, serial)))
-            assert_eq!(left_occurrences.len(), right_occurrences.len());
+            .unzip();
            iter::zip(left_occurrences, right_occurrences)
        })
        .enumerate()
        .map(|(serial, (&left_pos, &right_pos))| ((left_pos, serial), (right_pos, serial)))
        .unzip();
    left_positions.sort_unstable_by_key(|&(pos, _serial)| pos);
    right_positions.sort_unstable_by_key(|&(pos, _serial)| pos);
    let left_index_by_right_index: Vec<usize> = {