From 1c852b7fb9b3c51f866968aa65bd57f4719be13a Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Sat, 4 Mar 2023 17:02:55 +0900 Subject: [PATCH] cli: implement word wrapping function for bytes wrap_bytes() is similar to textwrap::wrap(), but can process arbitrary bytes. More importantly, it guarantees that byte offsets can be reconstructed from the split slices. This allows us to interleave push/pop_label()s with split text fragments. We could calculate byte offsets upfront, but using slice API is more convenient. That's why I didn't add inner function returning Vec. --- src/text_util.rs | 230 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) diff --git a/src/text_util.rs b/src/text_util.rs index 099e00a85..2133ddc64 100644 --- a/src/text_util.rs +++ b/src/text_util.rs @@ -45,3 +45,233 @@ pub fn write_indented( Ok(()) }) } + +/// Word with trailing whitespace. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct ByteFragment<'a> { + word: &'a [u8], + whitespace_len: usize, + word_width: usize, +} + +impl<'a> ByteFragment<'a> { + fn new(word: &'a [u8], whitespace_len: usize) -> Self { + // We don't care about the width of non-UTF-8 bytes, but should not panic. + let word_width = textwrap::core::display_width(&String::from_utf8_lossy(word)); + ByteFragment { + word, + whitespace_len, + word_width, + } + } + + fn offset_in(&self, text: &[u8]) -> usize { + byte_offset_from(text, self.word) + } +} + +impl textwrap::core::Fragment for ByteFragment<'_> { + fn width(&self) -> f64 { + self.word_width as f64 + } + + fn whitespace_width(&self) -> f64 { + self.whitespace_len as f64 + } + + fn penalty_width(&self) -> f64 { + 0.0 + } +} + +fn byte_offset_from(outer: &[u8], inner: &[u8]) -> usize { + let outer_start = outer.as_ptr() as usize; + let inner_start = inner.as_ptr() as usize; + assert!(outer_start <= inner_start); + assert!(inner_start + inner.len() <= outer_start + outer.len()); + inner_start - outer_start +} + +fn split_byte_line_to_words(line: &[u8]) -> Vec> { + let mut words = Vec::new(); + let mut tail = line; + while let Some(word_end) = tail.iter().position(|&c| c == b' ') { + let word = &tail[..word_end]; + let ws_end = tail[word_end + 1..] + .iter() + .position(|&c| c != b' ') + .map(|p| p + word_end + 1) + .unwrap_or(tail.len()); + words.push(ByteFragment::new(word, ws_end - word_end)); + tail = &tail[ws_end..]; + } + if !tail.is_empty() { + words.push(ByteFragment::new(tail, 0)); + } + words +} + +/// Wraps lines at the given width, returns a vector of lines (excluding "\n".) +/// +/// Existing newline characters will never be removed. For `str` content, you +/// can use `textwrap::refill()` to refill a pre-formatted text. +/// +/// Each line is a sub-slice of the given text, even if the line is empty. +/// +/// The wrapping logic is more restricted than the default of the `textwrap`. +/// Notably, this doesn't support hyphenation nor unicode line break. The +/// display width is calculated based on unicode property in the same manner +/// as `textwrap::wrap()`. +pub fn wrap_bytes(text: &[u8], width: usize) -> Vec<&[u8]> { + let mut split_lines = Vec::new(); + for line in text.split(|&c| c == b'\n') { + let words = split_byte_line_to_words(line); + let split = textwrap::wrap_algorithms::wrap_first_fit(&words, &[width as f64]); + split_lines.extend(split.iter().map(|words| match words { + [] => &line[..0], // Empty line + [a] => a.word, + [a, .., b] => { + let start = a.offset_in(line); + let end = b.offset_in(line) + b.word.len(); + &line[start..end] + } + })); + } + split_lines +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_split_byte_line_to_words() { + assert_eq!(split_byte_line_to_words(b""), vec![]); + assert_eq!( + split_byte_line_to_words(b"foo"), + vec![ByteFragment { + word: b"foo", + whitespace_len: 0, + word_width: 3 + }], + ); + assert_eq!( + split_byte_line_to_words(b" foo"), + vec![ + ByteFragment { + word: b"", + whitespace_len: 2, + word_width: 0 + }, + ByteFragment { + word: b"foo", + whitespace_len: 0, + word_width: 3 + }, + ], + ); + assert_eq!( + split_byte_line_to_words(b"foo "), + vec![ByteFragment { + word: b"foo", + whitespace_len: 2, + word_width: 3 + }], + ); + assert_eq!( + split_byte_line_to_words(b"a b foo bar "), + vec![ + ByteFragment { + word: b"a", + whitespace_len: 1, + word_width: 1 + }, + ByteFragment { + word: b"b", + whitespace_len: 2, + word_width: 1 + }, + ByteFragment { + word: b"foo", + whitespace_len: 1, + word_width: 3, + }, + ByteFragment { + word: b"bar", + whitespace_len: 1, + word_width: 3, + }, + ], + ); + } + + #[test] + fn test_wrap_bytes() { + assert_eq!(wrap_bytes(b"foo", 10), [b"foo".as_ref()]); + assert_eq!(wrap_bytes(b"foo bar", 10), [b"foo bar".as_ref()]); + assert_eq!( + wrap_bytes(b"foo bar baz", 10), + [b"foo bar".as_ref(), b"baz".as_ref()], + ); + + // Empty text is represented as [""] + assert_eq!(wrap_bytes(b"", 10), [b"".as_ref()]); + assert_eq!(wrap_bytes(b" ", 10), [b"".as_ref()]); + + // Whitespace in the middle should be preserved + assert_eq!( + wrap_bytes(b"foo bar baz", 8), + [b"foo bar".as_ref(), b"baz".as_ref()], + ); + assert_eq!( + wrap_bytes(b"foo bar x", 7), + [b"foo".as_ref(), b"bar x".as_ref()], + ); + assert_eq!( + wrap_bytes(b"foo bar \nx", 7), + [b"foo bar".as_ref(), b"x".as_ref()], + ); + assert_eq!( + wrap_bytes(b"foo bar\n x", 7), + [b"foo bar".as_ref(), b" x".as_ref()], + ); + assert_eq!( + wrap_bytes(b"foo bar x", 4), + [b"foo".as_ref(), b"bar".as_ref(), b"x".as_ref()], + ); + + // Ends with "\n" + assert_eq!(wrap_bytes(b"foo\n", 10), [b"foo".as_ref(), b"".as_ref()]); + assert_eq!(wrap_bytes(b"foo\n", 3), [b"foo".as_ref(), b"".as_ref()]); + assert_eq!(wrap_bytes(b"\n", 10), [b"".as_ref(), b"".as_ref()]); + + // Overflow + assert_eq!(wrap_bytes(b"foo x", 2), [b"foo".as_ref(), b"x".as_ref()]); + assert_eq!(wrap_bytes(b"x y", 0), [b"x".as_ref(), b"y".as_ref()]); + + // Invalid UTF-8 bytes should not cause panic + assert_eq!(wrap_bytes(b"foo\x80", 10), [b"foo\x80".as_ref()]); + } + + #[test] + fn test_wrap_bytes_slice_ptr() { + let text = b"\nfoo\n\nbar baz\n"; + let lines = wrap_bytes(text, 10); + assert_eq!( + lines, + [ + b"".as_ref(), + b"foo".as_ref(), + b"".as_ref(), + b"bar baz".as_ref(), + b"".as_ref() + ], + ); + // Each line should be a sub-slice of the source text + assert_eq!(lines[0].as_ptr(), text[0..].as_ptr()); + assert_eq!(lines[1].as_ptr(), text[1..].as_ptr()); + assert_eq!(lines[2].as_ptr(), text[5..].as_ptr()); + assert_eq!(lines[3].as_ptr(), text[6..].as_ptr()); + assert_eq!(lines[4].as_ptr(), text[14..].as_ptr()); + } +}