diff: Treat multi-byte UTF-8 runes as word characters

Inline diffs on multi-byte UTF-8 characters would match individual
bytes, causing garbled diffs in some cases. For example, replacing
`⊢` with `⊣`, which differ in the final byte only, caused the
diff to be displayed byte by byte instead of on the whole character.

This commit uses a workaround present in Mercurial by treating all
bytes 0x80 and above as word characters, causing any multi-byte
character to be treated as a word and not segmented.

https://www.mercurial-scm.org/repo/hg/file/6.3.3/mercurial/patch.py#l51
This commit is contained in:
B Wilson 2023-03-29 12:49:23 +09:00 committed by xelxebar
parent 7aad2aea8a
commit 01a9ce0c71

View file

@ -44,7 +44,12 @@ pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
/// Returns `true` if `b` should be treated as part of a word.
///
/// Word bytes are ASCII letters, ASCII digits, `_`, and every byte in
/// `0x80..=0xff`. All bytes of a multi-byte UTF-8 sequence fall in that
/// upper range, so counting them as word bytes keeps a multi-byte character
/// together as a single unit instead of splitting it into individual bytes.
fn is_word_byte(b: u8) -> bool {
    // TODO: Make this configurable (probably higher up in the call stack)
    matches!(
        b,
        // Count 0x80..0xff as word bytes so multi-byte UTF-8 chars are
        // treated as a single unit.
        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' | b'\x80'..=b'\xff'
    )
}
pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
@ -675,6 +680,11 @@ mod tests {
assert_eq!(find_word_ranges(b" Abc"), vec![3..6]);
}
#[test]
fn test_find_word_ranges_multibyte() {
    // `⊢` is encoded as three bytes in UTF-8; all three must be reported as
    // one word range rather than being split per byte.
    assert_eq!(find_word_ranges("⊢".as_bytes()), vec![0..3]);
}
#[test]
fn test_find_lcs_empty() {
let empty: Vec<(usize, usize)> = vec![];