From d714a555fae2661984c0c22725a43547755925b7 Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Sun, 31 Oct 2021 13:23:03 -0400 Subject: [PATCH] Break on non-ASCII grapheme clusters in itemizer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Basically Yao Wei's idea from #7, but start using grapheme clusters instead of character indices in order to avoid breaking languages such as Hindi. Fixes #5 Co-authored-by: Yao Wei (魏銘廷) --- src/render/itemize.rs | 45 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/src/render/itemize.rs b/src/render/itemize.rs index 65203a2e..527db500 100644 --- a/src/render/itemize.rs +++ b/src/render/itemize.rs @@ -1,19 +1,28 @@ -use std::str::CharIndices; +use unicode_segmentation::*; pub struct ItemizeIterator<'a> { - char_iter: CharIndices<'a>, + grapheme_iter: GraphemeIndices<'a>, line: &'a str, + prev_grapheme: Option<(usize, &'a str)>, } impl<'a> ItemizeIterator<'a> { pub fn new(line: &'a str) -> Self { ItemizeIterator { - char_iter: line.char_indices(), + grapheme_iter: line.grapheme_indices(true), line, + prev_grapheme: None, } } } +/** + * Iterates through a line of text while itemizing it into the largest possible clusters of + * non-whitespace characters that can be drawn at once without risking column misalignment from + * ambiguous width characters. This means for ASCII where the size of non-whitespace is essentially + * guaranteed to be consistent, items will ideally be per-word to speed up rendering. For Unicode, + * items will be per-grapheme to ensure correct monospaced display. 
+ */ impl<'a> Iterator for ItemizeIterator<'a> { type Item = (usize, usize); @@ -21,13 +30,32 @@ impl<'a> Iterator for ItemizeIterator<'a> { let mut start_index = None; let end_index = loop { - if let Some((index, ch)) = self.char_iter.next() { - let is_whitespace = ch.is_whitespace(); + let grapheme_indice = self.prev_grapheme.take().or_else(|| self.grapheme_iter.next()); + if let Some((index, grapheme)) = grapheme_indice { + // Figure out if this grapheme is whitespace and/or ASCII in one iteration + let mut is_whitespace = true; + let mut is_ascii = true; + for c in grapheme.chars() { + if is_whitespace { + if c.is_whitespace() { + continue; + } + is_whitespace = false; + } + if !c.is_ascii() { + is_ascii = false; + break; + } + } if start_index.is_none() && !is_whitespace { start_index = Some(index); + if !is_ascii { + break index + grapheme.len(); + } } - if start_index.is_some() && is_whitespace { + if start_index.is_some() && (is_whitespace || !is_ascii) { + self.prev_grapheme = grapheme_indice; break index; } } else { @@ -49,10 +77,13 @@ mod tests { #[test] fn test_iterator() { - let mut iter = ItemizeIterator::new("Test line "); + let mut iter = ItemizeIterator::new("Test line 啊啊 ते "); assert_eq!(Some((0, 4)), iter.next()); assert_eq!(Some((6, 4)), iter.next()); + assert_eq!(Some((11, 3)), iter.next()); + assert_eq!(Some((14, 3)), iter.next()); + assert_eq!(Some((18, 6)), iter.next()); assert_eq!(None, iter.next()); } }