diff --git a/CHANGELOG.md b/CHANGELOG.md
index 699ee507..1d0e58d7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,7 +12,26 @@

 #### Rust

-- `ChunkSize::from_offsets` was removed. This was only used to create an internal optimization, which turned out to not be very accurate anyway. It often required in tokenization implementations to do more work to calculate the size as well, which is no longer necessary. It should be simple to convert to the `ChunkSize::from_size` method (and likely simplify your code as well), which is now the only way to create a `ChunkSize`.
+- `ChunkSize` has been removed. It was a holdover from a previous internal optimization, which turned out not to be very accurate anyway.
+- This makes implementing a custom `ChunkSizer` much easier: you now only need to return the size of the chunk as a `usize`. Previously, tokenization implementations often had to do extra work to calculate the size, which is no longer necessary.
+
+##### Before
+
+```rust
+pub trait ChunkSizer {
+    // Required method
+    fn chunk_size(&self, chunk: &str, capacity: &ChunkCapacity) -> ChunkSize;
+}
+```
+
+##### After
+
+```rust
+pub trait ChunkSizer {
+    // Required method
+    fn size(&self, chunk: &str) -> usize;
+}
+```

 ## v0.13.3

diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs
index 2453aea8..0e58fe77 100644
--- a/bindings/python/src/lib.rs
+++ b/bindings/python/src/lib.rs
@@ -13,8 +13,8 @@ use pyo3::{
     pybacked::PyBackedStr,
 };
 use text_splitter::{
-    Characters, ChunkCapacity, ChunkCapacityError, ChunkConfig, ChunkConfigError, ChunkSize,
-    ChunkSizer, CodeSplitter, CodeSplitterError, MarkdownSplitter, TextSplitter,
+    Characters, ChunkCapacity, ChunkCapacityError, ChunkConfig, ChunkConfigError, ChunkSizer,
+    CodeSplitter, CodeSplitterError, MarkdownSplitter, TextSplitter,
 };
 use tiktoken_rs::{get_bpe_from_model, CoreBPE};
 use tokenizers::Tokenizer;
@@ -88,16 +88,13 @@ struct CustomCallback(PyObject);

 impl ChunkSizer for CustomCallback {
     /// Determine the size of a given chunk to use for validation
-    fn chunk_size(&self, chunk: &str, capacity: &ChunkCapacity) -> ChunkSize {
+    fn size(&self, chunk: &str) -> usize {
         Python::with_gil(|py| {
-            let size = self
-                .0
+            self.0
                 .call_bound(py, (chunk,), None)
                 .unwrap()
                 .extract::<usize>(py)
-                .unwrap();
-
-            ChunkSize::from_size(size, capacity)
+                .unwrap()
         })
     }
 }
diff --git a/src/chunk_size.rs b/src/chunk_size.rs
index 604cad86..cb3b139b 100644
--- a/src/chunk_size.rs
+++ b/src/chunk_size.rs
@@ -130,6 +130,13 @@ impl ChunkCapacity {
             Ordering::Equal
         }
     }
+
+    /// Generates a chunk size object based on the size provided by a sizer.
+    /// Calculates and stores whether or not it fits within the capacity.
+    #[must_use]
+    fn chunk_size(&self, size: usize) -> ChunkSize {
+        ChunkSize::new(self.fits(size), size)
+    }
 }

 impl From<usize> for ChunkCapacity {
@@ -198,14 +205,9 @@ pub struct ChunkSize {
 }

 impl ChunkSize {
-    /// Generate a chunk size from a given size. Will not be able to compute the
-    /// max byte offset that fits within the capacity.
     #[must_use]
-    pub fn from_size(size: usize, capacity: &ChunkCapacity) -> Self {
-        Self {
-            fits: capacity.fits(size),
-            size,
-        }
+    fn new(fits: Ordering, size: usize) -> Self {
+        Self { fits, size }
     }

     /// Determine whether the chunk size fits within the capacity or not
@@ -224,7 +226,7 @@ impl ChunkSize {

 /// Determines the size of a given chunk.
pub trait ChunkSizer { /// Determine the size of a given chunk to use for validation - fn chunk_size(&self, chunk: &str, capacity: &ChunkCapacity) -> ChunkSize; + fn size(&self, chunk: &str) -> usize; } /// Indicates there was an error with the chunk configuration. @@ -409,7 +411,7 @@ where *cache .entry(offset..(offset + chunk.len())) - .or_insert_with(|| self.chunk_config.sizer.chunk_size(chunk, &capacity)) + .or_insert_with(|| capacity.chunk_size(self.chunk_config.sizer.size(chunk))) } /// Check if the chunk is within the capacity. Chunk should be trimmed if necessary beforehand. @@ -479,14 +481,23 @@ mod tests { let chunk = "12345"; assert_eq!( - Characters.chunk_size(chunk, &4.into()).fits, + ChunkCapacity::from(4) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Greater ); assert_eq!( - Characters.chunk_size(chunk, &5.into()).fits, + ChunkCapacity::from(5) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Equal ); - assert_eq!(Characters.chunk_size(chunk, &6.into()).fits, Ordering::Less); + assert_eq!( + ChunkCapacity::from(6) + .chunk_size(Characters.size(chunk)) + .fits, + Ordering::Less + ); } #[test] @@ -494,19 +505,27 @@ mod tests { let chunk = "12345"; assert_eq!( - Characters.chunk_size(chunk, &(0..0).into()).fits, + ChunkCapacity::from(0..0) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Greater ); assert_eq!( - Characters.chunk_size(chunk, &(0..5).into()).fits, + ChunkCapacity::from(0..5) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Greater ); assert_eq!( - Characters.chunk_size(chunk, &(5..6).into()).fits, + ChunkCapacity::from(5..6) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Equal ); assert_eq!( - Characters.chunk_size(chunk, &(6..100).into()).fits, + ChunkCapacity::from(6..100) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Less ); } @@ -516,15 +535,21 @@ mod tests { let chunk = "12345"; assert_eq!( - Characters.chunk_size(chunk, &(0..).into()).fits, + ChunkCapacity::from(0..) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Equal ); assert_eq!( - Characters.chunk_size(chunk, &(5..).into()).fits, + ChunkCapacity::from(5..) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Equal ); assert_eq!( - Characters.chunk_size(chunk, &(6..).into()).fits, + ChunkCapacity::from(6..) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Less ); } @@ -534,7 +559,9 @@ mod tests { let chunk = "12345"; assert_eq!( - Characters.chunk_size(chunk, &(..).into()).fits, + ChunkCapacity::from(..) 
+ .chunk_size(Characters.size(chunk)) + .fits, Ordering::Equal ); } @@ -544,19 +571,27 @@ mod tests { let chunk = "12345"; assert_eq!( - Characters.chunk_size(chunk, &(0..=4).into()).fits, + ChunkCapacity::from(0..=4) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Greater ); assert_eq!( - Characters.chunk_size(chunk, &(5..=6).into()).fits, + ChunkCapacity::from(5..=6) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Equal ); assert_eq!( - Characters.chunk_size(chunk, &(4..=5).into()).fits, + ChunkCapacity::from(4..=5) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Equal ); assert_eq!( - Characters.chunk_size(chunk, &(6..=100).into()).fits, + ChunkCapacity::from(6..=100) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Less ); } @@ -566,15 +601,21 @@ mod tests { let chunk = "12345"; assert_eq!( - Characters.chunk_size(chunk, &(..0).into()).fits, + ChunkCapacity::from(..0) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Greater ); assert_eq!( - Characters.chunk_size(chunk, &(..5).into()).fits, + ChunkCapacity::from(..5) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Greater ); assert_eq!( - Characters.chunk_size(chunk, &(..6).into()).fits, + ChunkCapacity::from(..6) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Equal ); } @@ -584,15 +625,21 @@ mod tests { let chunk = "12345"; assert_eq!( - Characters.chunk_size(chunk, &(..=4).into()).fits, + ChunkCapacity::from(..=4) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Greater ); assert_eq!( - Characters.chunk_size(chunk, &(..=5).into()).fits, + ChunkCapacity::from(..=5) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Equal ); assert_eq!( - Characters.chunk_size(chunk, &(..=6).into()).fits, + ChunkCapacity::from(..=6) + .chunk_size(Characters.size(chunk)) + .fits, Ordering::Equal ); } @@ -604,9 +651,9 @@ mod tests { impl ChunkSizer for CountingSizer { // Return character version, but count calls - fn chunk_size(&self, chunk: &str, capacity: &ChunkCapacity) -> ChunkSize { + fn size(&self, chunk: &str) -> usize { self.calls.fetch_add(1, atomic::Ordering::SeqCst); - Characters.chunk_size(chunk, capacity) + Characters.size(chunk) } } @@ -668,18 +715,6 @@ mod tests { ); } - #[test] - fn test_chunk_size_from_size() { - let chunk_size = ChunkSize::from_size(10, &10.into()); - assert_eq!( - ChunkSize { - fits: Ordering::Equal, - size: 10, - }, - chunk_size - ); - } - #[test] fn basic_chunk_config() { let config = ChunkConfig::new(10); @@ -700,7 +735,7 @@ mod tests { struct BasicSizer; impl ChunkSizer for BasicSizer { - fn chunk_size(&self, _chunk: &str, _capacity: &ChunkCapacity) -> ChunkSize { + fn size(&self, _chunk: &str) -> usize { unimplemented!() } } diff --git a/src/chunk_size/characters.rs b/src/chunk_size/characters.rs index a3cbcc06..b868b48b 100644 --- a/src/chunk_size/characters.rs +++ b/src/chunk_size/characters.rs @@ -1,4 +1,4 @@ -use crate::{ChunkCapacity, ChunkSize, ChunkSizer}; +use crate::ChunkSizer; /// Used for splitting a piece of text into chunks based on the number of /// characters in each chunk. @@ -14,8 +14,8 @@ pub struct Characters; impl ChunkSizer for Characters { /// Determine the size of a given chunk to use for validation. 
- fn chunk_size(&self, chunk: &str, capacity: &ChunkCapacity) -> ChunkSize { - ChunkSize::from_size(chunk.chars().count(), capacity) + fn size(&self, chunk: &str) -> usize { + chunk.chars().count() } } @@ -25,8 +25,7 @@ mod tests { #[test] fn returns_size() { - let capacity = 10; - let offsets = Characters.chunk_size("eé", &capacity.into()); - assert_eq!(offsets, ChunkSize::from_size(2, &capacity.into())); + let offsets = Characters.size("eé"); + assert_eq!(offsets, 2); } } diff --git a/src/chunk_size/huggingface.rs b/src/chunk_size/huggingface.rs index b5e98640..2f908f11 100644 --- a/src/chunk_size/huggingface.rs +++ b/src/chunk_size/huggingface.rs @@ -1,6 +1,6 @@ use tokenizers::Tokenizer; -use crate::{ChunkCapacity, ChunkSize, ChunkSizer}; +use crate::ChunkSizer; impl ChunkSizer for &Tokenizer { /// Returns the number of tokens in a given text after tokenization. @@ -9,22 +9,20 @@ impl ChunkSizer for &Tokenizer { /// /// Will panic if you don't have a byte-level tokenizer and the splitter /// encounters text it can't tokenize. - fn chunk_size(&self, chunk: &str, capacity: &ChunkCapacity) -> ChunkSize { + fn size(&self, chunk: &str) -> usize { let encoding = self .encode(chunk, false) .expect("Unable to tokenize the following string {chunk}"); let pad_id = self.get_padding().map(|params| params.pad_id); - let size = encoding + encoding .get_ids() .iter() // Skip padding tokens at beginning and end so they don't count towards the chunk size .skip_while(|&id| pad_id.map_or(false, |pad_id| id == &pad_id)) .take_while(|&id| pad_id.map_or(true, |pad_id| id != &pad_id)) - .count(); - - ChunkSize::from_size(size, capacity) + .count() } } @@ -35,8 +33,8 @@ impl ChunkSizer for Tokenizer { /// /// Will panic if you don't have a byte-level tokenizer and the splitter /// encounters text it can't tokenize. 
- fn chunk_size(&self, chunk: &str, capacity: &ChunkCapacity) -> ChunkSize { - (&self).chunk_size(chunk, capacity) + fn size(&self, chunk: &str) -> usize { + (&self).size(chunk) } } @@ -47,9 +45,8 @@ mod tests { #[test] fn returns_size() { let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None).unwrap(); - let capacity = 10; - let offsets = tokenizer.chunk_size(" An apple a", &capacity.into()); - assert_eq!(offsets, ChunkSize::from_size(3, &capacity.into())); + let size = tokenizer.size(" An apple a"); + assert_eq!(size, 3); } #[test] @@ -57,16 +54,14 @@ mod tests { let tokenizer = tokenizers::Tokenizer::from_file("./tests/tokenizers/huggingface.json").unwrap(); - let capacity = 10; - let offsets = tokenizer.chunk_size("An apple a", &capacity.into()); - assert_eq!(offsets, ChunkSize::from_size(3, &capacity.into())); + let size = tokenizer.size("An apple a"); + assert_eq!(size, 3); } #[test] fn handles_padding() { let tokenizer = Tokenizer::from_pretrained("thenlper/gte-small", None).unwrap(); - let capacity = 10; - let offsets = tokenizer.chunk_size("An apple a", &capacity.into()); - assert_eq!(offsets, ChunkSize::from_size(3, &capacity.into())); + let size = tokenizer.size("An apple a"); + assert_eq!(size, 3); } } diff --git a/src/chunk_size/rust_tokenizers.rs b/src/chunk_size/rust_tokenizers.rs index dabf449a..9b12f9b6 100644 --- a/src/chunk_size/rust_tokenizers.rs +++ b/src/chunk_size/rust_tokenizers.rs @@ -9,22 +9,18 @@ use rust_tokenizers::{ vocab::Vocab, }; -use crate::{ChunkCapacity, ChunkSize, ChunkSizer}; +use crate::ChunkSizer; -fn chunk_size_from_offsets>( - tokenizer: &T, - chunk: &str, - capacity: &ChunkCapacity, -) -> ChunkSize { - ChunkSize::from_size(tokenizer.tokenize(chunk).len(), capacity) +fn chunk_size_from_offsets>(tokenizer: &T, chunk: &str) -> usize { + tokenizer.tokenize(chunk).len() } impl ChunkSizer for &BaseTokenizer where V: Vocab + Sync + Send, { - fn chunk_size(&self, chunk: &str, capacity: &ChunkCapacity) -> ChunkSize { - chunk_size_from_offsets(*self, chunk, capacity) + fn size(&self, chunk: &str) -> usize { + chunk_size_from_offsets(*self, chunk) } } @@ -32,22 +28,22 @@ impl ChunkSizer for BaseTokenizer where V: Vocab + Sync + Send, { - fn chunk_size(&self, chunk: &str, capacity: &ChunkCapacity) -> ChunkSize { - (&self).chunk_size(chunk, capacity) + fn size(&self, chunk: &str) -> usize { + (&self).size(chunk) } } macro_rules! 
impl_chunk_sizer { ($($t:ty),+) => { $(impl ChunkSizer for &$t { - fn chunk_size(&self, chunk: &str, capacity: &ChunkCapacity) -> ChunkSize { - chunk_size_from_offsets(*self, chunk, capacity) + fn size(&self, chunk: &str) -> usize { + chunk_size_from_offsets(*self, chunk) } } impl ChunkSizer for $t { - fn chunk_size(&self, chunk: &str, capacity: &ChunkCapacity) -> ChunkSize { - (&self).chunk_size(chunk, capacity) + fn size(&self, chunk: &str) -> usize { + (&self).size(chunk) } })+ } @@ -109,9 +105,8 @@ mod tests { "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", ); let tokenizer = BertTokenizer::from_file(vocab_path, false, false).unwrap(); - let capacity = 10; - let offsets = tokenizer.chunk_size(" An apple a", &capacity.into()); - assert_eq!(offsets, ChunkSize::from_size(3, &capacity.into())); + let size = tokenizer.size(" An apple a"); + assert_eq!(size, 3); } #[test] @@ -119,8 +114,8 @@ mod tests { let sizes = TokenizerOption::iter() .collect::>() .into_par_iter() - .map(|tokenizer| tokenizer.tokenizer().chunk_size(" An apple a", &10.into())); - assert!(sizes.all(|chunk_size| chunk_size.size > 0)); + .map(|tokenizer| tokenizer.tokenizer().size(" An apple a")); + assert!(sizes.all(|size| size > 0)); } #[derive(EnumIter)] diff --git a/src/chunk_size/tiktoken.rs b/src/chunk_size/tiktoken.rs index 9d09ef75..154465fe 100644 --- a/src/chunk_size/tiktoken.rs +++ b/src/chunk_size/tiktoken.rs @@ -1,18 +1,18 @@ use tiktoken_rs::CoreBPE; -use crate::{ChunkCapacity, ChunkSize, ChunkSizer}; +use crate::ChunkSizer; impl ChunkSizer for &CoreBPE { /// Returns the number of tokens in a given text after tokenization. - fn chunk_size(&self, chunk: &str, capacity: &ChunkCapacity) -> ChunkSize { - ChunkSize::from_size(self.encode_ordinary(chunk).len(), capacity) + fn size(&self, chunk: &str) -> usize { + self.encode_ordinary(chunk).len() } } impl ChunkSizer for CoreBPE { /// Returns the number of tokens in a given text after tokenization. 
- fn chunk_size(&self, chunk: &str, capacity: &ChunkCapacity) -> ChunkSize { - (&self).chunk_size(chunk, capacity) + fn size(&self, chunk: &str) -> usize { + (&self).size(chunk) } } @@ -25,8 +25,7 @@ mod tests { #[test] fn returns_offsets() { let tokenizer = cl100k_base().unwrap(); - let capacity = 10; - let offsets = tokenizer.chunk_size("An apple a", &capacity.into()); - assert_eq!(offsets, ChunkSize::from_size(3, &capacity.into())); + let size = tokenizer.size("An apple a"); + assert_eq!(size, 3); } } diff --git a/src/lib.rs b/src/lib.rs index d6cc2dc7..475be7d6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,8 +6,7 @@ mod splitter; mod trim; pub use chunk_size::{ - Characters, ChunkCapacity, ChunkCapacityError, ChunkConfig, ChunkConfigError, ChunkSize, - ChunkSizer, + Characters, ChunkCapacity, ChunkCapacityError, ChunkConfig, ChunkConfigError, ChunkSizer, }; #[cfg(feature = "markdown")] pub use splitter::MarkdownSplitter; diff --git a/src/splitter.rs b/src/splitter.rs index e593d2be..d1b9d347 100644 --- a/src/splitter.rs +++ b/src/splitter.rs @@ -5,7 +5,11 @@ use itertools::Itertools; use strum::IntoEnumIterator; use self::fallback::FallbackLevel; -use crate::{chunk_size::MemoizedChunkSizer, trim::Trim, ChunkConfig, ChunkSize, ChunkSizer}; +use crate::{ + chunk_size::{ChunkSize, MemoizedChunkSizer}, + trim::Trim, + ChunkConfig, ChunkSizer, +}; #[cfg(feature = "code")] mod code; diff --git a/src/splitter/text.rs b/src/splitter/text.rs index 842952c2..0f2c9825 100644 --- a/src/splitter/text.rs +++ b/src/splitter/text.rs @@ -156,7 +156,7 @@ mod tests { use fake::{Fake, Faker}; - use crate::{splitter::SemanticSplitRanges, ChunkCapacity, ChunkSize}; + use crate::splitter::SemanticSplitRanges; use super::*; @@ -219,8 +219,8 @@ mod tests { struct Str; impl ChunkSizer for Str { - fn chunk_size(&self, chunk: &str, capacity: &ChunkCapacity) -> ChunkSize { - ChunkSize::from_size(chunk.as_bytes().len(), capacity) + fn size(&self, chunk: &str) -> usize { + chunk.as_bytes().len() } } diff --git a/tests/snapshots.rs b/tests/snapshots.rs index a0ddab30..9cea3ede 100644 --- a/tests/snapshots.rs +++ b/tests/snapshots.rs @@ -10,7 +10,7 @@ use strum::{Display, EnumIter, IntoEnumIterator}; use text_splitter::CodeSplitter; #[cfg(feature = "markdown")] use text_splitter::MarkdownSplitter; -use text_splitter::{Characters, ChunkCapacity, ChunkConfig, ChunkSize, ChunkSizer, TextSplitter}; +use text_splitter::{Characters, ChunkConfig, ChunkSizer, TextSplitter}; #[cfg(feature = "tiktoken-rs")] use tiktoken_rs::{cl100k_base, CoreBPE}; #[cfg(feature = "tokenizers")] @@ -61,15 +61,15 @@ enum SizerOption { } impl ChunkSizer for SizerOption { - fn chunk_size(&self, chunk: &str, capacity: &ChunkCapacity) -> ChunkSize { + fn size(&self, chunk: &str) -> usize { match self { - Self::Characters => Characters.chunk_size(chunk, capacity), + Self::Characters => Characters.size(chunk), #[cfg(feature = "rust-tokenizers")] - Self::RustTokenizers => BERT_UNCASED_TOKENIZER.chunk_size(chunk, capacity), + Self::RustTokenizers => BERT_UNCASED_TOKENIZER.size(chunk), #[cfg(feature = "tokenizers")] - Self::Tokenizers => HUGGINGFACE_TOKENIZER.chunk_size(chunk, capacity), + Self::Tokenizers => HUGGINGFACE_TOKENIZER.size(chunk), #[cfg(feature = "tiktoken-rs")] - Self::TikToken => TIKTOKEN_TOKENIZER.chunk_size(chunk, capacity), + Self::TikToken => TIKTOKEN_TOKENIZER.size(chunk), } } } @@ -93,7 +93,7 @@ fn trim_false() { assert_eq!(chunks.join(""), text); for chunk in &chunks { - assert!(sizer.chunk_size(chunk, &capacity).fits().is_le()); + 
assert!(capacity.fits(sizer.size(chunk)).is_le()); } insta::assert_yaml_snapshot!( format!( @@ -123,7 +123,7 @@ fn trim() { let chunks = splitter.chunks(&text).collect::>(); for chunk in &chunks { - assert!(sizer.chunk_size(chunk, &capacity).fits().is_le()); + assert!(capacity.fits(sizer.size(chunk)).is_le()); } insta::assert_yaml_snapshot!( format!( @@ -150,7 +150,7 @@ fn range_trim_false() { assert_eq!(chunks.join(""), text); for chunk in &chunks { - assert!(Characters.chunk_size(chunk, &capacity).fits().is_le()); + assert!(capacity.fits(Characters.size(chunk)).is_le()); } insta::assert_yaml_snapshot!( format!( @@ -175,7 +175,7 @@ fn range_trim() { let chunks = splitter.chunks(&text).collect::>(); for chunk in &chunks { - assert!(Characters.chunk_size(chunk, &capacity).fits().is_le()); + assert!(capacity.fits(Characters.size(chunk)).is_le()); } insta::assert_yaml_snapshot!( format!( @@ -206,7 +206,7 @@ fn overlap_trim_false() { let chunks = splitter.chunks(&text).collect::>(); for chunk in &chunks { - assert!(sizer.chunk_size(chunk, &capacity).fits().is_le()); + assert!(capacity.fits(sizer.size(chunk)).is_le()); } insta::assert_yaml_snapshot!( format!( @@ -236,7 +236,7 @@ fn overlap_trim() { let chunks = splitter.chunks(&text).collect::>(); for chunk in &chunks { - assert!(sizer.chunk_size(chunk, &capacity).fits().is_le()); + assert!(capacity.fits(sizer.size(chunk)).is_le()); } insta::assert_yaml_snapshot!( format!( @@ -266,7 +266,7 @@ fn markdown_trim_false() { assert_eq!(chunks.join(""), text); for chunk in &chunks { - assert!(sizer.chunk_size(chunk, &capacity).fits().is_le()); + assert!(capacity.fits(sizer.size(chunk)).is_le()); } insta::assert_yaml_snapshot!( format!( @@ -293,7 +293,7 @@ fn markdown_trim() { let chunks = splitter.chunks(&text).collect::>(); for chunk in &chunks { - assert!(sizer.chunk_size(chunk, &capacity).fits().is_le()); + assert!(capacity.fits(sizer.size(chunk)).is_le()); } insta::assert_yaml_snapshot!( format!( @@ -325,7 +325,7 @@ fn markdown_overlap_trim_false() { let chunks = splitter.chunks(&text).collect::>(); for chunk in &chunks { - assert!(sizer.chunk_size(chunk, &capacity).fits().is_le()); + assert!(capacity.fits(sizer.size(chunk)).is_le()); } insta::assert_yaml_snapshot!( format!( @@ -356,7 +356,7 @@ fn markdown_overlap_trim() { let chunks = splitter.chunks(&text).collect::>(); for chunk in &chunks { - assert!(sizer.chunk_size(chunk, &capacity).fits().is_le()); + assert!(capacity.fits(sizer.size(chunk)).is_le()); } insta::assert_yaml_snapshot!( format!( @@ -386,7 +386,7 @@ fn code_trim_false() { assert_eq!(chunks.join(""), text); for chunk in &chunks { - assert!(sizer.chunk_size(chunk, &capacity).fits().is_le()); + assert!(capacity.fits(sizer.size(chunk)).is_le()); } insta::assert_yaml_snapshot!( format!( @@ -413,7 +413,7 @@ fn code_trim() { let chunks = splitter.chunks(&text).collect::>(); for chunk in &chunks { - assert!(sizer.chunk_size(chunk, &capacity).fits().is_le()); + assert!(capacity.fits(sizer.size(chunk)).is_le()); } insta::assert_yaml_snapshot!( format!( @@ -445,7 +445,7 @@ fn code_overlap_trim_false() { let chunks = splitter.chunks(&text).collect::>(); for chunk in &chunks { - assert!(sizer.chunk_size(chunk, &capacity).fits().is_le()); + assert!(capacity.fits(sizer.size(chunk)).is_le()); } insta::assert_yaml_snapshot!( format!( @@ -476,7 +476,7 @@ fn code_overlap_trim() { let chunks = splitter.chunks(&text).collect::>(); for chunk in &chunks { - assert!(sizer.chunk_size(chunk, &capacity).fits().is_le()); + 
assert!(capacity.fits(sizer.size(chunk)).is_le()); } insta::assert_yaml_snapshot!( format!(
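For anyone migrating a custom sizer to the new trait described in the CHANGELOG hunk above, here is a minimal sketch. `WordCount` is a hypothetical sizer invented for illustration, and the `ChunkConfig::new(..).with_sizer(..)` wiring assumes the crate's existing configuration API; neither is part of this diff.

```rust
use text_splitter::{ChunkConfig, ChunkSizer, TextSplitter};

/// Hypothetical sizer that measures chunks in whitespace-separated words.
struct WordCount;

impl ChunkSizer for WordCount {
    // The new trait only asks for a plain size; the splitter now checks
    // whether that size fits the capacity internally.
    fn size(&self, chunk: &str) -> usize {
        chunk.split_whitespace().count()
    }
}

fn main() {
    // The sizer reports plain usize sizes...
    assert_eq!(WordCount.size("An apple a day"), 4);

    // ...and can be swapped in for the default `Characters` sizer (assumed wiring).
    let splitter = TextSplitter::new(ChunkConfig::new(3).with_sizer(WordCount));
    for chunk in splitter.chunks("An apple a day keeps the doctor away") {
        assert!(WordCount.size(chunk) <= 3);
    }
}
```

Compared with the old `chunk_size` signature, the implementation no longer constructs a `ChunkSize` or inspects the capacity; that fit check now happens inside the splitter via `ChunkCapacity::fits`.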