-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: create SymbolIterator for block parsing #106
Changes from all commits
a2ca1d2
998d291
f1dc373
de52811
5398be0
aba8224
88c3064
fbefb50
cd608b3
32778c9
1a5c5b0
b8d430b
6ad4a8b
c73286f
27d8d70
71171f3
57f5f72
16c2a60
1df4d76
f7cbbf8
6c3c28e
ee317d2
0d2c225
dd903f5
b74c089
45f4a1f
8487538
e1751f5
f31143b
3746027
f8bab51
b63b902
0ad2063
17e1956
b20952f
0dc18ad
85f46ff
6e12f23
7235dfb
02c4505
0d5c8ab
d489076
01a148b
d710917
a69de7a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,81 +1,74 @@ | ||
//! Scanner and helper types and traits for structurization of Unimarkup input. | ||
//! Functionality, iterators, helper types and traits to get [`Symbol`]s from `&str`. | ||
//! These [`Symbol`]s and iterators are used to convert the input into a Unimarkup document. | ||
|
||
use icu_segmenter::GraphemeClusterSegmenter; | ||
|
||
pub mod position; | ||
pub mod span; | ||
mod symbol; | ||
|
||
use icu_segmenter::GraphemeClusterSegmenter; | ||
use position::{Offset, Position}; | ||
pub use symbol::{Symbol, SymbolKind}; | ||
|
||
#[derive(Debug)] | ||
pub struct Scanner { | ||
segmenter: GraphemeClusterSegmenter, | ||
} | ||
|
||
impl Clone for Scanner { | ||
fn clone(&self) -> Self { | ||
let segmenter = GraphemeClusterSegmenter::new(); | ||
|
||
Self { segmenter } | ||
} | ||
} | ||
|
||
impl Default for Scanner { | ||
fn default() -> Self { | ||
let segmenter = GraphemeClusterSegmenter::new(); | ||
use position::{Offset, Position as SymPos}; | ||
pub use symbol::{iterator::*, Symbol, SymbolKind}; | ||
|
||
Self { segmenter } | ||
} | ||
} | ||
|
||
impl Scanner { | ||
pub fn scan_str<'s>(&self, input: &'s str) -> Vec<Symbol<'s>> { | ||
let mut symbols: Vec<Symbol> = Vec::new(); | ||
let mut curr_pos: Position = Position::default(); | ||
let mut prev_offset = 0; | ||
/// Scans given input and returns vector of [`Symbol`]s needed to convert the input to Unimarkup content. | ||
pub fn scan_str(input: &str) -> Vec<Symbol<'_>> { | ||
let segmenter = GraphemeClusterSegmenter::new(); | ||
|
||
// skip(1) to ignore break at start of input | ||
for offset in self.segmenter.segment_str(input).skip(1) { | ||
if let Some(grapheme) = input.get(prev_offset..offset) { | ||
let mut kind = SymbolKind::from(grapheme); | ||
let mut symbols: Vec<Symbol> = Vec::new(); | ||
let mut curr_pos: SymPos = SymPos::default(); | ||
let mut prev_offset = 0; | ||
|
||
let end_pos = if kind == SymbolKind::Newline { | ||
Position { | ||
line: (curr_pos.line + 1), | ||
..Default::default() | ||
} | ||
} else { | ||
Position { | ||
line: curr_pos.line, | ||
col_utf8: (curr_pos.col_utf8 + grapheme.len()), | ||
col_utf16: (curr_pos.col_utf16 + grapheme.encode_utf16().count()), | ||
col_grapheme: (curr_pos.col_grapheme + 1), | ||
} | ||
}; | ||
// skip(1) to ignore break at start of input | ||
for offset in segmenter.segment_str(input).skip(1) { | ||
if let Some(grapheme) = input.get(prev_offset..offset) { | ||
let mut kind = SymbolKind::from(grapheme); | ||
|
||
if curr_pos.col_utf8 == 1 && kind == SymbolKind::Newline { | ||
// newline at the start of line -> Blankline | ||
kind = SymbolKind::Blankline; | ||
let end_pos = if kind == SymbolKind::Newline { | ||
SymPos { | ||
line: (curr_pos.line + 1), | ||
..Default::default() | ||
} | ||
} else { | ||
SymPos { | ||
line: curr_pos.line, | ||
col_utf8: (curr_pos.col_utf8 + grapheme.len()), | ||
col_utf16: (curr_pos.col_utf16 + grapheme.encode_utf16().count()), | ||
col_grapheme: (curr_pos.col_grapheme + 1), | ||
} | ||
}; | ||
|
||
symbols.push(Symbol { | ||
input, | ||
kind, | ||
offset: Offset { | ||
start: prev_offset, | ||
end: offset, | ||
}, | ||
start: curr_pos, | ||
end: end_pos, | ||
}); | ||
|
||
curr_pos = end_pos; | ||
if curr_pos.col_utf8 == 1 && kind == SymbolKind::Newline { | ||
// newline at the start of line -> Blankline | ||
kind = SymbolKind::Blankline; | ||
} | ||
prev_offset = offset; | ||
} | ||
|
||
// last offset not needed, because break at EOI is always available | ||
symbols | ||
symbols.push(Symbol { | ||
input, | ||
kind, | ||
offset: Offset { | ||
start: prev_offset, | ||
end: offset, | ||
}, | ||
start: curr_pos, | ||
end: end_pos, | ||
}); | ||
|
||
curr_pos = end_pos; | ||
} | ||
prev_offset = offset; | ||
} | ||
|
||
symbols.push(Symbol { | ||
input, | ||
kind: SymbolKind::EOI, | ||
offset: Offset { | ||
start: prev_offset, | ||
end: prev_offset, | ||
}, | ||
start: curr_pos, | ||
end: curr_pos, | ||
}); | ||
|
||
// last offset not needed, because break at EOI is always available | ||
symbols | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
//! Contains matcher traits and types used to detect iterator end and strip prefixes. | ||
//! The available matcher traits are implemented for [`SymbolIterator`]. | ||
|
||
use std::rc::Rc; | ||
|
||
use itertools::{Itertools, PeekingNext}; | ||
|
||
use crate::scanner::SymbolKind; | ||
|
||
use super::SymbolIterator; | ||
|
||
/// Function type to notify an iterator if an end was reached. | ||
/// | ||
/// The closure is handed a `&mut dyn EndMatcher` and returns a `bool` (presumably `true` signals that the end was reached). | ||
/// It is wrapped in [`Rc`], so cloning the alias clones the pointer and not the closure. | ||
pub type IteratorEndFn = Rc<dyn (Fn(&mut dyn EndMatcher) -> bool)>; | ||
|
||
/// Function type to consume prefix sequences of a new line. | ||
/// | ||
/// The closure is handed a `&mut dyn PrefixMatcher` and returns a `bool` (presumably `true` signals that the prefix was matched and consumed). | ||
/// It is wrapped in [`Rc`], so cloning the alias clones the pointer and not the closure. | ||
pub type IteratorPrefixFn = Rc<dyn (Fn(&mut dyn PrefixMatcher) -> bool)>; | ||
|
||
/// Trait containing functions that are available inside the end matcher function. | ||
/// | ||
/// Implemented for [`SymbolIterator`]. | ||
pub trait EndMatcher { | ||
/// Returns `true` if the upcoming [`Symbol`] sequence is an empty line. | ||
/// Meaning that a line contains no [`Symbol`] or only [`SymbolKind::Whitespace`]. | ||
/// | ||
/// This check does not consume the empty line; use [`Self::consumed_is_empty_line()`] for that. | ||
/// | ||
/// **Note:** This is also `true` if a parent iterator stripped non-whitespace symbols, and the nested iterator only has whitespace symbols. | ||
/// | ||
/// [`Symbol`]: super::Symbol | ||
fn is_empty_line(&mut self) -> bool; | ||
|
||
/// Wrapper around [`Self::is_empty_line()`] that additionally consumes the matched empty line. | ||
/// Consuming means the related iterator advances over the matched empty line. | ||
/// | ||
/// **Note:** The iterator is only advanced if an empty line is matched. | ||
/// | ||
/// **Note:** The empty line is **not** included in the symbols returned by [`SymbolIterator::take_to_end()`]. | ||
fn consumed_is_empty_line(&mut self) -> bool; | ||
|
||
/// Returns `true` if the given [`Symbol`] sequence matches the upcoming one. | ||
/// | ||
/// This check does not consume the sequence; use [`Self::consumed_matches()`] for that. | ||
/// | ||
/// [`Symbol`]: super::Symbol | ||
fn matches(&mut self, sequence: &[SymbolKind]) -> bool; | ||
|
||
/// Wrapper around [`Self::matches()`] that additionally consumes the matched sequence. | ||
/// Consuming means the related iterator advances over the matched sequence. | ||
/// | ||
/// **Note:** The iterator is only advanced if the sequence is matched. | ||
/// | ||
/// **Note:** The matched sequence is **not** included in the symbols returned by [`SymbolIterator::take_to_end()`]. | ||
fn consumed_matches(&mut self, sequence: &[SymbolKind]) -> bool; | ||
|
||
/// Returns `true` if the iterator is at the given nesting depth. | ||
/// | ||
/// **Note:** Use [`SymbolIterator::curr_depth()`] to get the current depth of an iterator. | ||
fn at_depth(&self, depth: usize) -> bool; | ||
} | ||
|
||
/// Trait containing functions that are available inside the prefix matcher function. | ||
/// | ||
/// Implemented for [`SymbolIterator`]. | ||
pub trait PrefixMatcher { | ||
/// Consumes and returns `true` if the given [`Symbol`] sequence matches the upcoming one. | ||
/// Consuming means the related iterator advances over the matched sequence. | ||
/// | ||
/// **Note:** The iterator is only advanced if the sequence is matched. | ||
/// | ||
/// **Note:** The given sequence must **not** include any [`SymbolKind::Newline`], because matches are only considered per line. | ||
/// The [`SymbolIterator`] implementation checks this with a `debug_assert!`, so it is only enforced in debug builds. | ||
/// | ||
/// **Note:** The matched sequence is **not** included in the symbols returned by [`SymbolIterator::take_to_end()`]. | ||
/// | ||
/// [`Symbol`]: super::Symbol | ||
fn consumed_prefix(&mut self, sequence: &[SymbolKind]) -> bool; | ||
} | ||
|
||
impl<'input> EndMatcher for SymbolIterator<'input> { | ||
fn is_empty_line(&mut self) -> bool { | ||
// Note: Multiple matches may be set in the match closure, so we need to ensure that all start at the same index | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is an extreme nitpick, but I think convention is to use upper case for NOTE, FIXME, TODO etc. Uppercase versions get highlighted (at least in my editor) 🙈. You can decide to change this or leave it, just wanted to mention it 👀 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did not know that about Would have to look through the code so replace all There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm good question. Generally I wouldn't write any form of You can decide if and when you want to change this, it's not important part of this PR anyway. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think notes in doc-comments can be useful. It helps to highlight information that is especially relevant to a user. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I know, but just in general keep in mind that it's not necessary most of the time. If something is that important, maybe separate heading is a better option. Otherwise we can just explain it. I also use |
||
self.reset_peek(); | ||
|
||
// Peek at the first upcoming symbol; it is only taken if it ends a line (Newline, Blankline or EOI). | ||
let next = self | ||
.peeking_next(|s| { | ||
matches!( | ||
s.kind, | ||
SymbolKind::Newline | SymbolKind::Blankline | SymbolKind::EOI | ||
) | ||
}) | ||
.map(|s| s.kind); | ||
|
||
// After a Newline, the symbols that follow may only be whitespace up to the next line end. | ||
let is_empty_line = if Some(SymbolKind::Newline) == next { | ||
// `count()` drives the lazy `peeking_take_while`; the number itself is not used. | ||
let _whitespaces = self | ||
.peeking_take_while(|s| s.kind == SymbolKind::Whitespace) | ||
.count(); | ||
|
||
let new_line = self.peeking_next(|s| { | ||
matches!( | ||
s.kind, | ||
SymbolKind::Newline | SymbolKind::Blankline | SymbolKind::EOI | ||
) | ||
}); | ||
new_line.is_some() | ||
} else { | ||
// A Blankline or EOI directly ahead already counts as an empty line; `None` means no line end was peeked. | ||
next.is_some() | ||
}; | ||
|
||
is_empty_line | ||
} | ||
|
||
fn consumed_is_empty_line(&mut self) -> bool { | ||
// Run the check first; the iterator index is only moved below if it succeeded. | ||
let is_empty_line = self.is_empty_line(); | ||
|
||
if is_empty_line { | ||
self.set_index(self.peek_index()); // To consume peeked symbols | ||
} | ||
|
||
is_empty_line | ||
} | ||
|
||
fn matches(&mut self, sequence: &[SymbolKind]) -> bool { | ||
// Note: Multiple matches may be set in the match closure, so we need to ensure that all start at the same index | ||
self.reset_peek(); | ||
|
||
// Each kind must equal the next peeked symbol, in order; bail out on the first mismatch. | ||
for kind in sequence { | ||
if self.peeking_next(|s| s.kind == *kind).is_none() { | ||
return false; | ||
} | ||
} | ||
|
||
// Every kind matched; an empty `sequence` also ends up here and yields `true`. | ||
true | ||
} | ||
|
||
fn consumed_matches(&mut self, sequence: &[SymbolKind]) -> bool { | ||
// Run the check first; the iterator index is only moved below if the whole sequence matched. | ||
let matched = self.matches(sequence); | ||
|
||
if matched { | ||
self.set_index(self.peek_index()); // To consume peeked symbols | ||
} | ||
|
||
matched | ||
} | ||
|
||
fn at_depth(&self, depth: usize) -> bool { | ||
// NOTE(review): the trait docs point to `SymbolIterator::curr_depth()`, but `depth()` is called here; confirm both exist or align the doc link. | ||
self.depth() == depth | ||
} | ||
} | ||
|
||
impl<'input> PrefixMatcher for SymbolIterator<'input> { | ||
fn consumed_prefix(&mut self, sequence: &[SymbolKind]) -> bool { | ||
// Prefix matches are considered per line, so a Newline in the sequence is a caller error; this is checked in debug builds only. | ||
debug_assert!( | ||
!sequence.contains(&SymbolKind::Newline), | ||
"Newline symbol in prefix match is not allowed." | ||
); | ||
|
||
// Same behavior as `consumed_matches`: the iterator advances only if the whole sequence matches. | ||
self.consumed_matches(sequence) | ||
} | ||
} | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It could be a good idea to rename `Position` to `SymPos` in general, since that's what it actually is 🤔
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think `SymbolPosition` would be better in that case, and I would also change `Offset` to `SymbolOffset`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`SymbolPosition` is looooong 😆. It does read better though. Both options are fine for me, you're free to choose whatever you find better 👍🏻.
P.S. If you can't choose, then choose randomly 🤣
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Or we keep the names and move them into the `symbol` module?
I was thinking about this option, but then `scanner` becomes a bit useless?
But removing `scanner` by moving `symbol` up did not seem right to me.