diff: remove unneeded mutation of histogram

The iterator chain might look a bit involved, but it makes explicit that we never
combine words from different buckets: each bucket is filtered on its own, and the first non-empty result is used.
This commit is contained in:
Yuya Nishihara 2024-07-01 18:11:30 +09:00
parent 774769a00a
commit ba087f9350

View file

@ -169,31 +169,29 @@ pub(crate) fn unchanged_ranges(
} }
let max_occurrences = 100; let max_occurrences = 100;
let mut left_histogram = Histogram::calculate(left, left_ranges, max_occurrences); let left_histogram = Histogram::calculate(left, left_ranges, max_occurrences);
if *left_histogram.count_to_words.keys().next().unwrap() > max_occurrences { if *left_histogram.count_to_words.keys().next().unwrap() > max_occurrences {
// If there are very many occurrences of all words, then we just give up. // If there are very many occurrences of all words, then we just give up.
return vec![]; return vec![];
} }
let mut right_histogram = Histogram::calculate(right, right_ranges, max_occurrences); let right_histogram = Histogram::calculate(right, right_ranges, max_occurrences);
// Look for words with few occurrences in `left` (could equally well have picked // Look for words with few occurrences in `left` (could equally well have picked
// `right`?). If any of them also occur in `right`, then we add the words to // `right`?). If any of them also occur in `right`, then we add the words to
// the LCS. // the LCS.
let mut uncommon_shared_words = vec![]; let Some(uncommon_shared_words) = left_histogram
while !left_histogram.count_to_words.is_empty() && uncommon_shared_words.is_empty() { .count_to_words
let left_words = left_histogram .values()
.count_to_words .map(|left_words| -> Vec<&[u8]> {
.first_entry() left_words
.map(|x| x.remove()) .iter()
.unwrap(); .copied()
for left_word in left_words { .filter(|left_word| right_histogram.word_to_positions.contains_key(left_word))
if right_histogram.word_to_positions.contains_key(left_word) { .collect()
uncommon_shared_words.push(left_word); })
} .find(|words| !words.is_empty())
} else {
}
if uncommon_shared_words.is_empty() {
return vec![]; return vec![];
} };
// Let's say our inputs are "a b a b" and "a b c c b a b". We will have found // Let's say our inputs are "a b a b" and "a b c c b a b". We will have found
// the least common words to be "a" and "b". We now assume that each // the least common words to be "a" and "b". We now assume that each
@ -208,14 +206,8 @@ pub(crate) fn unchanged_ranges(
let mut left_positions = vec![]; let mut left_positions = vec![];
let mut right_positions = vec![]; let mut right_positions = vec![];
for uncommon_shared_word in uncommon_shared_words { for uncommon_shared_word in uncommon_shared_words {
let left_occurrences = left_histogram let left_occurrences = &left_histogram.word_to_positions[uncommon_shared_word];
.word_to_positions let right_occurrences = &right_histogram.word_to_positions[uncommon_shared_word];
.get_mut(uncommon_shared_word)
.unwrap();
let right_occurrences = right_histogram
.word_to_positions
.get_mut(uncommon_shared_word)
.unwrap();
let shared_count = min(left_occurrences.len(), right_occurrences.len()); let shared_count = min(left_occurrences.len(), right_occurrences.len());
for occurrence in 0..shared_count { for occurrence in 0..shared_count {
left_positions.push(( left_positions.push((