ok/jj
1
0
Fork 0
forked from mirrors/jj

diff: use low-level HashTable in Histogram

This change made some diff benches slow, maybe because the generated code
becomes slightly worse due to the added abstraction? I'll revisit the
performance problem later. There are a couple of ways to mitigate it.

```
group                             new                     old
-----                             ---                     ---
bench_diff_git_git_read_tree_c    1.02     61.0±0.23µs    1.00     59.7±0.38µs
bench_diff_lines/modified/10k     1.00     41.6±0.24ms    1.02     42.3±0.22ms
bench_diff_lines/modified/1k      1.00      3.8±0.07ms    1.00      3.8±0.03ms
bench_diff_lines/reversed/10k     1.29     23.4±0.20ms    1.00     18.2±0.26ms
bench_diff_lines/reversed/1k      1.05    517.2±5.55µs    1.00   493.7±59.72µs
bench_diff_lines/unchanged/10k    1.00      3.9±0.10ms    1.08      4.2±0.10ms
bench_diff_lines/unchanged/1k     1.01    356.8±2.33µs    1.00    353.7±1.99µs
```
(I don't get stable results on my noisy machine, so the results would vary.)
This commit is contained in:
Yuya Nishihara 2024-09-26 21:12:42 +09:00
parent de137c8f9a
commit c5f926103a

View file

@ -16,7 +16,6 @@
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::hash::BuildHasher;
use std::hash::Hash;
use std::hash::Hasher;
@ -26,6 +25,7 @@ use std::ops::Range;
use std::slice;
use bstr::BStr;
use hashbrown::HashTable;
use itertools::Itertools;
pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
@ -189,7 +189,6 @@ struct WordComparator<C, S> {
hash_builder: S,
}
#[allow(unused)] // TODO
impl<C: CompareBytes> WordComparator<C, RandomState> {
fn new(compare: C) -> Self {
WordComparator {
@ -200,7 +199,6 @@ impl<C: CompareBytes> WordComparator<C, RandomState> {
}
}
#[allow(unused)] // TODO
impl<C: CompareBytes, S: BuildHasher> WordComparator<C, S> {
fn eq(&self, left: &[u8], right: &[u8]) -> bool {
self.compare.eq(left, right)
@ -252,15 +250,25 @@ impl<'input, 'aux> DiffSource<'input, 'aux> {
}
struct Histogram<'input> {
word_to_positions: HashMap<&'input BStr, Vec<WordPosition>>,
word_to_positions: HashTable<HistogramEntry<'input>>,
}
type HistogramEntry<'input> = (&'input BStr, Vec<WordPosition>);
impl<'input> Histogram<'input> {
fn calculate(source: &DiffSource<'input, '_>, max_occurrences: usize) -> Self {
let mut word_to_positions: HashMap<&BStr, Vec<WordPosition>> = HashMap::new();
fn calculate<C: CompareBytes, S: BuildHasher>(
source: &DiffSource<'input, '_>,
comp: &WordComparator<C, S>,
max_occurrences: usize,
) -> Self {
let mut word_to_positions: HashTable<HistogramEntry> = HashTable::new();
for (i, range) in source.ranges.iter().enumerate() {
let word = &source.text[range.clone()];
let positions = word_to_positions.entry(word).or_default();
let hash = comp.hash_one(word);
let (_, positions) = word_to_positions
.entry(hash, |(w, _)| comp.eq(w, word), |(w, _)| comp.hash_one(w))
.or_insert_with(|| (word, vec![]))
.into_mut();
// Allow one more than max_occurrences, so we can later skip those with more
// than max_occurrences
if positions.len() <= max_occurrences {
@ -270,14 +278,27 @@ impl<'input> Histogram<'input> {
Histogram { word_to_positions }
}
fn build_count_to_entries(&self) -> BTreeMap<usize, Vec<(&'input BStr, &Vec<WordPosition>)>> {
fn build_count_to_entries(&self) -> BTreeMap<usize, Vec<&HistogramEntry<'input>>> {
let mut count_to_entries: BTreeMap<usize, Vec<_>> = BTreeMap::new();
for (word, positions) in &self.word_to_positions {
for entry in &self.word_to_positions {
let (_, positions) = entry;
let entries = count_to_entries.entry(positions.len()).or_default();
entries.push((*word, positions));
entries.push(entry);
}
count_to_entries
}
/// Looks up the positions recorded for `word` in this histogram.
///
/// The word is located in the raw `hashbrown::HashTable` by hashing it
/// with `comp` and probing candidate entries with `comp.eq`, mirroring
/// exactly how entries are inserted in `calculate`. Returns `None` if
/// the word never occurred in the source this histogram was built from.
fn positions_by_word<C: CompareBytes, S: BuildHasher>(
&self,
word: &BStr,
comp: &WordComparator<C, S>,
) -> Option<&[WordPosition]> {
// The low-level table API takes a precomputed hash plus an equality
// closure; the same `comp` must be used here as at insertion time or
// lookups would miss.
let hash = comp.hash_one(word);
let (_, positions) = self
.word_to_positions
.find(hash, |(w, _)| comp.eq(w, word))?;
Some(positions)
}
}
/// Finds the LCS given a array where the value of `input[i]` indicates that
@ -338,10 +359,11 @@ fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
/// Finds unchanged word (or token) positions among the ones given as
/// arguments. The data between those words is ignored.
fn collect_unchanged_words(
fn collect_unchanged_words<C: CompareBytes, S: BuildHasher>(
found_positions: &mut Vec<(WordPosition, WordPosition)>,
left: &DiffSource,
right: &DiffSource,
comp: &WordComparator<C, S>,
) {
if left.ranges.is_empty() || right.ranges.is_empty() {
return;
@ -349,21 +371,21 @@ fn collect_unchanged_words(
// Prioritize LCS-based algorithm than leading/trailing matches
let old_len = found_positions.len();
collect_unchanged_words_lcs(found_positions, left, right);
collect_unchanged_words_lcs(found_positions, left, right, comp);
if found_positions.len() != old_len {
return;
}
// Trim leading common ranges (i.e. grow previous unchanged region)
let common_leading_len = iter::zip(left.ranges, right.ranges)
.take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()])
.take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()]))
.count();
let left_ranges = &left.ranges[common_leading_len..];
let right_ranges = &right.ranges[common_leading_len..];
// Trim trailing common ranges (i.e. grow next unchanged region)
let common_trailing_len = iter::zip(left_ranges.iter().rev(), right_ranges.iter().rev())
.take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()])
.take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()]))
.count();
found_positions.extend(itertools::chain(
@ -382,19 +404,20 @@ fn collect_unchanged_words(
));
}
fn collect_unchanged_words_lcs(
fn collect_unchanged_words_lcs<C: CompareBytes, S: BuildHasher>(
found_positions: &mut Vec<(WordPosition, WordPosition)>,
left: &DiffSource,
right: &DiffSource,
comp: &WordComparator<C, S>,
) {
let max_occurrences = 100;
let left_histogram = Histogram::calculate(left, max_occurrences);
let left_histogram = Histogram::calculate(left, comp, max_occurrences);
let left_count_to_entries = left_histogram.build_count_to_entries();
if *left_count_to_entries.keys().next().unwrap() > max_occurrences {
// If there are very many occurrences of all words, then we just give up.
return;
}
let right_histogram = Histogram::calculate(right, max_occurrences);
let right_histogram = Histogram::calculate(right, comp, max_occurrences);
// Look for words with few occurrences in `left` (could equally well have picked
// `right`?). If any of them also occur in `right`, then we add the words to
// the LCS.
@ -403,7 +426,7 @@ fn collect_unchanged_words_lcs(
let mut both_positions = left_entries
.iter()
.filter_map(|&(word, left_positions)| {
let right_positions = right_histogram.word_to_positions.get(word)?;
let right_positions = right_histogram.positions_by_word(word, comp)?;
(left_positions.len() == right_positions.len())
.then_some((left_positions, right_positions))
})
@ -447,6 +470,7 @@ fn collect_unchanged_words_lcs(
found_positions,
&left.narrowed(previous_left_position..left_position),
&right.narrowed(previous_right_position..right_position),
comp,
);
found_positions.push((
left.map_to_global(left_position),
@ -460,6 +484,7 @@ fn collect_unchanged_words_lcs(
found_positions,
&left.narrowed(previous_left_position..WordPosition(left.ranges.len())),
&right.narrowed(previous_right_position..WordPosition(right.ranges.len())),
comp,
);
}
@ -533,6 +558,7 @@ impl<'input> Diff<'input> {
pub fn for_tokenizer<T: AsRef<[u8]> + ?Sized + 'input>(
inputs: impl IntoIterator<Item = &'input T>,
tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
compare: impl CompareBytes,
) -> Self {
let mut inputs = inputs.into_iter().map(BStr::new);
let base_input = inputs.next().expect("inputs must not be empty");
@ -541,7 +567,9 @@ impl<'input> Diff<'input> {
let base_token_ranges: Vec<Range<usize>>;
let other_token_ranges: Vec<Vec<Range<usize>>>;
// No need to tokenize if one of the inputs is empty. Non-empty inputs
// are all different.
// are all different as long as the tokenizer emits non-empty ranges.
// This means "" and " " are different even if the compare function is
// ignore-whitespace. They are tokenized as [] and [" "] respectively.
if base_input.is_empty() || other_inputs.iter().any(|input| input.is_empty()) {
base_token_ranges = vec![];
other_token_ranges = iter::repeat(vec![]).take(other_inputs.len()).collect();
@ -557,6 +585,7 @@ impl<'input> Diff<'input> {
other_inputs,
&base_token_ranges,
&other_token_ranges,
compare,
)
}
@ -565,8 +594,10 @@ impl<'input> Diff<'input> {
other_inputs: Vec<&'input BStr>,
base_token_ranges: &[Range<usize>],
other_token_ranges: &[Vec<Range<usize>>],
compare: impl CompareBytes,
) -> Self {
assert_eq!(other_inputs.len(), other_token_ranges.len());
let comp = WordComparator::new(compare);
let base_source = DiffSource::new(base_input, base_token_ranges);
let other_sources = iter::zip(&other_inputs, other_token_ranges)
.map(|(input, token_ranges)| DiffSource::new(input, token_ranges))
@ -585,7 +616,12 @@ impl<'input> Diff<'input> {
// found ranges with the ranges in the diff.
[first_other_source, tail_other_sources @ ..] => {
let mut first_positions = Vec::new();
collect_unchanged_words(&mut first_positions, &base_source, first_other_source);
collect_unchanged_words(
&mut first_positions,
&base_source,
first_other_source,
&comp,
);
if tail_other_sources.is_empty() {
first_positions
.iter()
@ -607,7 +643,12 @@ impl<'input> Diff<'input> {
first_positions,
|current_positions, other_source| {
let mut new_positions = Vec::new();
collect_unchanged_words(&mut new_positions, &base_source, other_source);
collect_unchanged_words(
&mut new_positions,
&base_source,
other_source,
&comp,
);
intersect_unchanged_words(current_positions, &new_positions)
},
);
@ -646,14 +687,14 @@ impl<'input> Diff<'input> {
pub fn unrefined<T: AsRef<[u8]> + ?Sized + 'input>(
inputs: impl IntoIterator<Item = &'input T>,
) -> Self {
Diff::for_tokenizer(inputs, |_| vec![])
Diff::for_tokenizer(inputs, |_| vec![], CompareBytesExactly)
}
/// Compares `inputs` line by line.
pub fn by_line<T: AsRef<[u8]> + ?Sized + 'input>(
inputs: impl IntoIterator<Item = &'input T>,
) -> Self {
Diff::for_tokenizer(inputs, find_line_ranges)
Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly)
}
/// Compares `inputs` word by word.
@ -663,8 +704,8 @@ impl<'input> Diff<'input> {
pub fn by_word<T: AsRef<[u8]> + ?Sized + 'input>(
inputs: impl IntoIterator<Item = &'input T>,
) -> Self {
let mut diff = Diff::for_tokenizer(inputs, find_word_ranges);
diff.refine_changed_regions(find_nonword_ranges);
let mut diff = Diff::for_tokenizer(inputs, find_word_ranges, CompareBytesExactly);
diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly);
diff
}
@ -695,7 +736,11 @@ impl<'input> Diff<'input> {
/// Uses the given tokenizer to split the changed regions into smaller
/// regions. Then tries to finds unchanged regions among them.
pub fn refine_changed_regions(&mut self, tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>) {
pub fn refine_changed_regions(
&mut self,
tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
compare: impl CompareBytes,
) {
let mut previous = UnchangedRange {
base: 0..0,
others: vec![0..0; self.other_inputs.len()],
@ -707,7 +752,7 @@ impl<'input> Diff<'input> {
// offsets to be valid in the context of the larger Diff instance
// (`self`).
let refined_diff =
Diff::for_tokenizer(self.hunk_between(&previous, current), &tokenizer);
Diff::for_tokenizer(self.hunk_between(&previous, current), &tokenizer, &compare);
for refined in &refined_diff.unchanged_regions {
let new_base_start = refined.base.start + previous.base.end;
let new_base_end = refined.base.end + previous.base.end;
@ -824,9 +869,9 @@ impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> {
pub fn diff<'a, T: AsRef<[u8]> + ?Sized + 'a>(
inputs: impl IntoIterator<Item = &'a T>,
) -> Vec<DiffHunk<'a>> {
let mut diff = Diff::for_tokenizer(inputs, find_line_ranges);
diff.refine_changed_regions(find_word_ranges);
diff.refine_changed_regions(find_nonword_ranges);
let mut diff = Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly);
diff.refine_changed_regions(find_word_ranges, CompareBytesExactly);
diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly);
diff.hunks().collect()
}
@ -1004,8 +1049,9 @@ mod tests {
left: &DiffSource,
right: &DiffSource,
) -> Vec<(Range<usize>, Range<usize>)> {
let comp = WordComparator::new(CompareBytesExactly);
let mut positions = Vec::new();
collect_unchanged_words(&mut positions, left, right);
collect_unchanged_words(&mut positions, left, right, &comp);
positions
.into_iter()
.map(|(left_pos, right_pos)| (left.range_at(left_pos), right.range_at(right_pos)))
@ -1220,6 +1266,7 @@ mod tests {
let diff = Diff::for_tokenizer(
["a\nb\nc\nd\ne\nf\ng", "a\nb\nc\nX\ne\nf\ng"],
find_line_ranges,
CompareBytesExactly,
);
assert_eq!(
diff.hunks().collect_vec(),
@ -1290,6 +1337,60 @@ mod tests {
);
}
#[test]
fn test_diff_ignore_all_whitespace() {
// Line diff where words compare equal if they differ only in
// whitespace (CompareBytesIgnoreAllWhitespace).
fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> {
let diff =
Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreAllWhitespace);
diff.hunks().collect()
}
// "" tokenizes to no ranges while "\n" yields one, so these differ
// even under an ignore-whitespace comparator (per the empty-input
// note in for_tokenizer).
assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]);
// Whitespace-only differences within a line are treated as matching.
assert_eq!(diff(["a\n", " a\r\n"]), vec![DiffHunk::matching("a\n")]);
assert_eq!(
diff(["a\n", " a\nb"]),
vec![DiffHunk::matching("a\n"), DiffHunk::different(["", "b"])]
);
// No LCS matches, so trim leading/trailing common lines
assert_eq!(
diff(["a\nc\n", " a\n a\n"]),
vec![
DiffHunk::matching("a\n"),
DiffHunk::different(["c\n", " a\n"]),
]
);
assert_eq!(
diff(["c\na\n", " a\n a\n"]),
vec![
DiffHunk::different(["c\n", " a\n"]),
DiffHunk::matching("a\n"),
]
);
}
#[test]
fn test_diff_ignore_whitespace_amount() {
// Line diff where only the *amount* of whitespace is ignored
// (CompareBytesIgnoreWhitespaceAmount), not its presence.
fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> {
let diff =
Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreWhitespaceAmount);
diff.hunks().collect()
}
// Empty input vs. a lone newline still differ (they tokenize to a
// different number of line ranges).
assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]);
// whitespace at line end is ignored
assert_eq!(diff(["a\n", "a\r\n"]), vec![DiffHunk::matching("a\n")]);
// but whitespace at line start isn't
assert_eq!(
diff(["a\n", " a\n"]),
vec![DiffHunk::different(["a\n", " a\n"])]
);
assert_eq!(
diff(["a\n", "a \nb"]),
vec![DiffHunk::matching("a\n"), DiffHunk::different(["", "b"])]
);
}
#[test]
fn test_diff_real_case_write_fmt() {
// This is from src/ui.rs in commit f44d246e3f88 in this repo. It highlights the