diff --git a/src/delta.rs b/src/delta.rs index 27b305122..5cda5266c 100644 --- a/src/delta.rs +++ b/src/delta.rs @@ -13,6 +13,7 @@ use crate::handlers::hunk_header::ParsedHunkHeader; use crate::handlers::{self, merge_conflict}; use crate::paint::Painter; use crate::style::DecorationStyle; +use crate::utils; #[derive(Clone, Debug, PartialEq)] pub enum State { @@ -181,10 +182,25 @@ impl<'a> StateMachine<'a> { } fn ingest_line(&mut self, raw_line_bytes: &[u8]) { - // TODO: retain raw_line as Cow - self.raw_line = String::from_utf8_lossy(raw_line_bytes).to_string(); + match String::from_utf8(raw_line_bytes.to_vec()) { + Ok(utf8) => self.ingest_line_utf8(utf8), + Err(_) => { + let raw_line = String::from_utf8_lossy(raw_line_bytes); + let truncated_len = utils::round_char_boundary::floor_char_boundary( + &raw_line, + self.config.max_line_length, + ); + self.raw_line = raw_line[..truncated_len].to_string(); + self.line = self.raw_line.clone(); + } + } + } + + fn ingest_line_utf8(&mut self, raw_line: String) { + self.raw_line = raw_line; // When a file has \r\n line endings, git sometimes adds ANSI escape sequences between the // \r and \n, in which case byte_lines does not remove the \r. Remove it now. + // TODO: Limit the number of characters we examine when looking for the \r? 
// Adapted from https://github.com/rust-lang/rust/pull/86497
// TODO: Remove when this is in the version of the Rust standard library that delta is building
// against.

/// Returns `true` if `b` is not a UTF-8 continuation byte, i.e. a character
/// may start at this byte.
#[inline]
const fn is_utf8_char_boundary(b: u8) -> bool {
    // This is bit magic equivalent to: b < 128 || b >= 192
    (b as i8) >= -0x40
}

/// Returns the largest byte offset `<= index` that lies on a char boundary of `s`.
///
/// If `index` is at or past the end of `s`, `s.len()` is returned. The result is
/// always a valid end position for slicing, so `&s[..floor_char_boundary(s, i)]`
/// does not panic.
#[inline]
pub fn floor_char_boundary(s: &str, index: usize) -> usize {
    if index >= s.len() {
        return s.len();
    }

    let bytes = s.as_bytes();
    let mut boundary = index;
    // Step back over continuation bytes. `index < s.len()` keeps the first access
    // in bounds, and byte 0 of a non-empty `str` always starts a character, so the
    // scan stops before `boundary` can underflow. No `unsafe` is required.
    while !is_utf8_char_boundary(bytes[boundary]) {
        boundary -= 1;
    }
    boundary
}