mirror of
https://github.com/martinvonz/jj.git
synced 2025-01-18 18:27:38 +00:00
diff: Treat multi-byte UTF-8 runes as word characters
Inline diffs on multi-byte UTF-8 characters would match individual bytes, causing garbled diffs in some cases. For example, replacing `⊢` with `⊣`, which differ in the final byte only, caused the diff to display a diff of the bytes instead of the character. This commit uses a workaround present in Mercurial by treating all bytes 0x80 and above as word characters, causing any multi-byte character to be treated as a word and not segmented. https://www.mercurial-scm.org/repo/hg/file/6.3.3/mercurial/patch.py#l51
This commit is contained in:
parent
7aad2aea8a
commit
01a9ce0c71
1 changed files with 11 additions and 1 deletions
|
@ -44,7 +44,12 @@ pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
|
|||
|
||||
fn is_word_byte(b: u8) -> bool {
|
||||
// TODO: Make this configurable (probably higher up in the call stack)
|
||||
matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_')
|
||||
matches!(
|
||||
b,
|
||||
// Count 0x80..0xff as word bytes so multi-byte UTF-8 chars are
|
||||
// treated as a single unit.
|
||||
b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' | b'\x80'..=b'\xff'
|
||||
)
|
||||
}
|
||||
|
||||
pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
|
||||
|
@ -675,6 +680,11 @@ mod tests {
|
|||
assert_eq!(find_word_ranges(b" Abc"), vec![3..6]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_word_ranges_multibyte() {
|
||||
assert_eq!(find_word_ranges("⊢".as_bytes()), vec![0..3])
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_lcs_empty() {
|
||||
let empty: Vec<(usize, usize)> = vec![];
|
||||
|
|
Loading…
Reference in a new issue