diff: consider uncommon words to match only if they have the same count

Patience diff starts by lining up unique elements (e.g. lines) to find
matching segments of the inputs. After that, it refines the
non-matching segments by repeating the process. Histogram expands on
that by not just considering unique elements but by continuing with
elements of count 2, then 3, etc.
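
As a rough sketch of that selection step (made-up names and `&str` words; jj's
actual code works on byte slices via a `Histogram` with `count_to_words` and
`word_to_positions`, as seen in the diff below): group the left-hand words by
occurrence count and take the lowest count whose words also appear on the right.

```rust
use std::collections::{BTreeMap, HashSet};

// Hypothetical sketch of histogram diff's word selection, not jj's real code:
// words are grouped by how often they occur on the left side, and the lowest
// count whose words also appear on the right side is used for alignment.
fn least_common_shared_words<'a>(
    left_count_to_words: &BTreeMap<usize, Vec<&'a str>>, // keyed by count, ascending
    right_words: &HashSet<&'a str>,
) -> Option<Vec<&'a str>> {
    left_count_to_words
        .values() // count 1 first, then 2, then 3, ...
        .map(|words| {
            words
                .iter()
                .copied()
                .filter(|word| right_words.contains(word))
                .collect::<Vec<_>>()
        })
        .find(|shared| !shared.is_empty())
}
```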

Before this commit, when diffing "a b a b b" against "a b a b a b", we
would match the two "a"s in the first input against the first two "a"s
in the second input. After this patch, we ignore the "a"s because
their counts differ, so we try to align the "b"s instead.
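
A minimal sketch of the new rule (again with made-up names; the real change is
in `unchanged_ranges_lcs` below): a shared word only qualifies for alignment if
it occurs the same number of times on both sides.

```rust
use std::collections::HashMap;

// Hypothetical sketch: keep only the words whose counts match on both sides.
fn words_to_align<'a>(
    left_counts: &HashMap<&'a str, usize>,
    right_counts: &HashMap<&'a str, usize>,
) -> Vec<&'a str> {
    let mut words: Vec<&'a str> = left_counts
        .iter()
        .filter(|(word, count)| right_counts.get(*word) == Some(*count))
        .map(|(word, _)| *word)
        .collect();
    words.sort(); // HashMap iteration order is arbitrary
    words
}

fn main() {
    // "a b a b b" vs. "a b a b a b": "a" occurs 2 vs. 3 times and is ignored,
    // while "b" occurs 3 times on both sides, so the "b"s get aligned.
    let left = HashMap::from([("a", 2), ("b", 3)]);
    let right = HashMap::from([("a", 3), ("b", 3)]);
    assert_eq!(words_to_align(&left, &right), ["b"]);
}
```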

I have had this commit lying around since I wrote the histogram diff
implementation in 1e657c5331. I vaguely remember thinking that the
way I had implemented it (without this commit) was a bit weird, but I
wasn't sure if this commit would be an improvement or not. The bug
report from @chooglen today of a case where we behave differently from
Git is enough to make me think that we should make this change after all.

#761
Martin von Zweigbergk 2021-04-22 22:05:36 -07:00 committed by Yuya Nishihara
parent 831bbc0b11
commit fefe07b3c3
3 changed files with 49 additions and 38 deletions


@@ -59,13 +59,13 @@ fn test_obslog_with_or_without_diff() {
@ rlvkpnrz test.user@example.com 2001-02-03 08:05:10 66b42ad3
my description
Resolved conflict in file1:
1 1: <<<<<<< Conflict 1 of 1resolved
1 : <<<<<<< Conflict 1 of 1
2 : %%%%%%% Changes from base to side #1
3 : -foo
4 : +++++++ Contents of side #2
5 : foo
6 : bar
7 : >>>>>>> Conflict 1 of 1 ends
7 1: >>>>>>> Conflict 1 of 1 endsresolved
rlvkpnrz hidden test.user@example.com 2001-02-03 08:05:09 cf73917d conflict
my description
rlvkpnrz hidden test.user@example.com 2001-02-03 08:05:09 068224a7


@@ -230,12 +230,18 @@ fn unchanged_ranges_lcs(
// the LCS.
let Some(uncommon_shared_words) = left_histogram
.count_to_words
.values()
.map(|left_words| -> Vec<&[u8]> {
.iter()
.map(|(left_count, left_words)| -> Vec<&[u8]> {
left_words
.iter()
.copied()
.filter(|left_word| right_histogram.word_to_positions.contains_key(left_word))
.filter(|left_word| {
let right_count = right_histogram
.word_to_positions
.get(left_word)
.map_or(0, |right_positions| right_positions.len());
*left_count == right_count
})
.collect()
})
.find(|words| !words.is_empty())
@@ -243,23 +249,14 @@ fn unchanged_ranges_lcs(
return vec![];
};
// Let's say our inputs are "a b a b" and "a b c c b a b". We will have found
// the least common words to be "a" and "b". We now assume that each
// occurrence of each word lines up in the left and right input. We do that
// by numbering the shared occurrences, effectively instead comparing "a1 b1
// a2 b2" and "a1 b1 c c b2 a2 b". We then walk the common words in the
// right input in order (["a1", "b1", "b2", "a2"]), and record the index of
// that word in the left input ([0,1,3,2]). We then find the LCS and split
// points based on that ([0,1,3] or [0,1,2] are both valid).
// [(index into left_ranges, word, occurrence #)]
let mut left_positions = vec![];
let mut right_positions = vec![];
for uncommon_shared_word in uncommon_shared_words {
let left_occurrences = &left_histogram.word_to_positions[uncommon_shared_word];
let right_occurrences = &right_histogram.word_to_positions[uncommon_shared_word];
let shared_count = min(left_occurrences.len(), right_occurrences.len());
for occurrence in 0..shared_count {
assert_eq!(left_occurrences.len(), right_occurrences.len());
for occurrence in 0..left_occurrences.len() {
left_positions.push((
left_occurrences[occurrence],
uncommon_shared_word,
@@ -800,6 +797,8 @@ mod tests {
#[test]
fn test_unchanged_ranges_non_unique_removed() {
// We used to consider the first two "a"s in the first input to match the two
// "a"s in the second input. We no longer do.
assert_eq!(
unchanged_ranges(
b"a a a a",
@@ -807,7 +806,7 @@ mod tests {
&[0..1, 2..3, 4..5, 6..7],
&[0..1, 2..3, 4..5, 6..7],
),
vec![(0..1, 0..1), (2..3, 4..5)]
vec![(0..1, 0..1)]
);
assert_eq!(
unchanged_ranges(
@@ -816,7 +815,7 @@ mod tests {
&[0..1, 2..3, 4..5, 6..7],
&[0..1, 2..3, 4..5, 6..7],
),
vec![(0..1, 2..3), (2..3, 6..7)]
vec![(6..7, 6..7)]
);
assert_eq!(
unchanged_ranges(
@@ -825,7 +824,7 @@ mod tests {
&[0..1, 2..3, 4..5, 6..7],
&[0..1, 2..3, 4..5, 6..7],
),
vec![(0..1, 2..3), (2..3, 4..5)]
vec![]
);
assert_eq!(
unchanged_ranges(
@@ -834,12 +833,14 @@ mod tests {
&[0..1, 2..3, 4..5, 6..7],
&[0..1, 2..3, 4..5, 6..7],
),
vec![(0..1, 0..1), (2..3, 6..7)]
vec![(0..1, 0..1), (6..7, 6..7)]
);
}
#[test]
fn test_unchanged_ranges_non_unique_added() {
// We used to consider the first two "a"s in the first input to match the two
// "a"s in the second input. We no longer do.
assert_eq!(
unchanged_ranges(
b"a b a c",
@@ -847,7 +848,7 @@ mod tests {
&[0..1, 2..3, 4..5, 6..7],
&[0..1, 2..3, 4..5, 6..7],
),
vec![(0..1, 0..1), (4..5, 2..3)]
vec![(0..1, 0..1)]
);
assert_eq!(
unchanged_ranges(
@@ -856,7 +857,7 @@ mod tests {
&[0..1, 2..3, 4..5, 6..7],
&[0..1, 2..3, 4..5, 6..7],
),
vec![(2..3, 0..1), (6..7, 2..3)]
vec![(6..7, 6..7)]
);
assert_eq!(
unchanged_ranges(
@@ -865,7 +866,7 @@ mod tests {
&[0..1, 2..3, 4..5, 6..7],
&[0..1, 2..3, 4..5, 6..7],
),
vec![(2..3, 0..1), (4..5, 2..3)]
vec![]
);
assert_eq!(
unchanged_ranges(
@@ -874,7 +875,7 @@ mod tests {
&[0..1, 2..3, 4..5, 6..7],
&[0..1, 2..3, 4..5, 6..7],
),
vec![(0..1, 0..1), (6..7, 2..3)]
vec![(0..1, 0..1), (6..7, 6..7)]
);
}
@@ -1258,16 +1259,12 @@ int main(int argc, char **argv)
DiffHunk::Matching(b"\t\tunsigned int mode;\n"),
DiffHunk::Different(vec![b"", b"\t\tint fd;\n\n"]),
DiffHunk::Matching(b"\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\t\t\tusage(\"corrupt \'tree\' file\");\n\t\tbuffer = sha1 + 20;\n\t\tsize -= len + 20;\n\t\t"),
DiffHunk::Different(vec![b"printf", b"data = read_sha1_file"]),
DiffHunk::Matching(b"("),
DiffHunk::Different(vec![b"\"%o %s (%s)\\n\", mode, path, sha1_to_hex(", b""]),
DiffHunk::Matching(b"sha1"),
DiffHunk::Different(vec![b"", b", type, &filesize"]),
DiffHunk::Matching(b")"),
DiffHunk::Different(vec![b")", b""]),
DiffHunk::Matching(b";\n"),
DiffHunk::Different(vec![b"", b"\t\tif (!data || strcmp(type, \"blob\"))\n\t\t\tusage(\"tree file refers to bad file data\");\n\t\tfd = create_file(path);\n\t\tif (fd < 0)\n\t\t\tusage(\"unable to create file\");\n\t\tif (write(fd, data, filesize) != filesize)\n\t\t\tusage(\"unable to write file\");\n\t\tfchmod(fd, mode);\n\t\tclose(fd);\n\t\tfree(data);\n"]),
DiffHunk::Matching(b"\t}\n\treturn 0;\n}\n\nint main(int argc, char **argv)\n{\n\tint fd;\n\tunsigned char sha1[20];\n\n\tif (argc != 2)\n\t\tusage(\"read-tree <key>\");\n\tif (get_sha1_hex(argv[1], sha1) < 0)\n\t\tusage(\"read-tree <key>\");\n\tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\tif (!sha1_file_directory)\n\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\tif (unpack(sha1) < 0)\n\t\tusage(\"unpack failed\");\n\treturn 0;\n}\n")
DiffHunk::Different(vec![b"printf(\"%o %s (%s)\\n\", mode, path,", b"data ="]),
DiffHunk::Matching(b" "),
DiffHunk::Different(vec![b"sha1_to_hex", b"read_sha1_file"]),
DiffHunk::Matching(b"(sha1"),
DiffHunk::Different(vec![b")", b", type, &filesize);\n\t\tif (!data || strcmp(type, \"blob\"))\n\t\t\tusage(\"tree file refers to bad file data\");\n\t\tfd = create_file(path);\n\t\tif (fd < 0)\n\t\t\tusage(\"unable to create file\");\n\t\tif (write(fd, data, filesize) != filesize)\n\t\t\tusage(\"unable to write file\");\n\t\tfchmod(fd, mode);\n\t\tclose(fd);\n\t\tfree(data"]),
DiffHunk::Matching(b");\n\t}\n\treturn 0;\n}\n\nint main(int argc, char **argv)\n{\n\tint fd;\n\tunsigned char sha1[20];\n\n\tif (argc != 2)\n\t\tusage(\"read-tree <key>\");\n\tif (get_sha1_hex(argv[1], sha1) < 0)\n\t\tusage(\"read-tree <key>\");\n\tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\tif (!sha1_file_directory)\n\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\tif (unpack(sha1) < 0)\n\t\tusage(\"unpack failed\");\n\treturn 0;\n}\n"),
]
);
}


@@ -373,10 +373,20 @@ mod tests {
])
);
// One side changes a line and adds a block after. The other side just adds the
// same block. This currently behaves as one would reasonably hope, but
// it's likely that it will change if/when we fix
// https://github.com/martinvonz/jj/issues/761. Git and Mercurial both duplicate
// the block in the result.
// same block. You might expect the last block would be deduplicated. However,
// the changes in the first side can be parsed as follows:
// ```
// a {
// - p
// + q
// +}
// +
// +b {
// + x
// }
// ```
// Therefore, the first side modifies the block `a { .. }`, and the second side
// adds `b { .. }`. Git and Mercurial both duplicate the block in the result.
assert_eq!(
merge(
&[b"\
@@ -411,6 +421,10 @@ a {
q
}
b {
x
}
b {
x
}