From a62c8776e8e4ee05ea54ed89832f31c93d177545 Mon Sep 17 00:00:00 2001
From: Yuya Nishihara
Date: Wed, 14 Aug 2024 15:01:56 +0900
Subject: [PATCH] diff: move empty content optimization from diff() to Diff::for_tokenizer()

unchanged_ranges() already has the fast path for empty content, but we can
also disable tokenization.
---
 lib/src/diff.rs | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/lib/src/diff.rs b/lib/src/diff.rs
index ddd1189f8..5daa918e4 100644
--- a/lib/src/diff.rs
+++ b/lib/src/diff.rs
@@ -416,11 +416,20 @@ impl<'input> Diff<'input> {
         let base_input = inputs.next().expect("inputs must not be empty");
         let other_inputs = inputs.collect_vec();
         // First tokenize each input
-        let base_token_ranges = tokenizer(base_input);
-        let other_token_ranges = other_inputs
-            .iter()
-            .map(|other_input| tokenizer(other_input))
-            .collect_vec();
+        let base_token_ranges: Vec<Range<usize>>;
+        let other_token_ranges: Vec<Vec<Range<usize>>>;
+        // No need to tokenize if one of the inputs is empty. Non-empty inputs
+        // are all different.
+        if base_input.is_empty() || other_inputs.iter().any(|input| input.is_empty()) {
+            base_token_ranges = vec![];
+            other_token_ranges = iter::repeat(vec![]).take(other_inputs.len()).collect();
+        } else {
+            base_token_ranges = tokenizer(base_input);
+            other_token_ranges = other_inputs
+                .iter()
+                .map(|other_input| tokenizer(other_input))
+                .collect();
+        }
         Self::with_inputs_and_token_ranges(
             base_input,
             other_inputs,
@@ -654,12 +663,6 @@ pub fn diff<'a>(left: &'a [u8], right: &'a [u8]) -> Vec<DiffHunk<'a>> {
     if left == right {
         return vec![DiffHunk::matching(left)];
     }
-    if left.is_empty() {
-        return vec![DiffHunk::different([b"", right])];
-    }
-    if right.is_empty() {
-        return vec![DiffHunk::different([left, b""])];
-    }
 
     Diff::default_refinement([left, right])
         .hunks()