Performance Optimization: Expanding binary search window #231

Merged · 7 commits · Jun 21, 2024

35 changes: 35 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,40 @@
# Changelog

## v0.14.0

### What's New

**Performance fixes for large documents.** The worst-case performance for certain documents was abysmal, leading to documents [that ran forever](https://github.com/benbrandt/text-splitter/issues/184). This release ensures that, in the worst case, the splitter no longer binary searches over the entire document, as it did before. That was prohibitively expensive, especially for the tokenizer implementations, and the search space now always has a safe upper bound.

For the "happy path", this new approach also brought big speed gains in the `CodeSplitter` (a 50%+ speed increase in some cases), marginal regressions in the `MarkdownSplitter`, and little difference in the `TextSplitter`. Overall, performance should now be more consistent across documents, since previously it wasn't uncommon for a document with certain formatting to hit the worst-case scenario.
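
The core idea, per the PR title, is to expand a search window geometrically and only binary search within that window, rather than over every remaining section of the document. The sketch below is illustrative only and not the crate's internal code: it assumes a monotonically non-decreasing `size_of(n)` that returns the size of a chunk built from the first `n` candidate sections, and finds the largest `n` that still fits within `capacity`.

```rust
/// Illustrative sketch of an "expanding window" search (not the crate's
/// internal API). `size_of(n)` is assumed to be monotonically non-decreasing
/// and to return the size of a chunk built from the first `n` sections.
fn max_fitting_sections(
    num_sections: usize,
    capacity: usize,
    size_of: impl Fn(usize) -> usize,
) -> usize {
    if num_sections == 0 {
        return 0;
    }

    // Gallop: double the window until the chunk no longer fits or we run out
    // of sections. This bounds the binary search instead of letting it span
    // the whole document.
    let mut hi = 1;
    while hi < num_sections && size_of(hi) <= capacity {
        hi = (hi * 2).min(num_sections);
    }
    // Everything up to half the current window is already known to fit.
    let mut lo = hi / 2;

    // Standard binary search, restricted to the bounded window [lo, hi].
    while lo < hi {
        let mid = lo + (hi - lo + 1) / 2;
        if size_of(mid) <= capacity {
            lo = mid;
        } else {
            hi = mid - 1;
        }
    }
    lo
}
```

With this scheme, the number of `size_of` calls grows with the size of the resulting chunk rather than with the length of the whole document, which is what keeps tokenizer-backed sizers from being invoked over enormous spans of text.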

### Breaking Changes

- Chunk output may be slightly different because of the changes to the search optimizations. The previous optimization occasionally caused the splitter to stop too soon. In most cases you should see no difference. The change is most pronounced in the `MarkdownSplitter` at very small chunk sizes, and in any splitter using `RustTokenizers`, because of its offset behavior.

#### Rust

- `ChunkSize` has been removed. This was a holdover from a previous internal optimization, which turned out not to be very accurate anyway.
  - This makes implementing a custom `ChunkSizer` much easier: you now only need to return the size of the chunk as a `usize`. Tokenization implementations often had to do extra work to calculate the size as well, which is no longer necessary (see the example after the trait definitions below).

##### Before

```rust
pub trait ChunkSizer {
    // Required method
    fn chunk_size(&self, chunk: &str, capacity: &ChunkCapacity) -> ChunkSize;
}
```

##### After

```rust
pub trait ChunkSizer {
    // Required method
    fn size(&self, chunk: &str) -> usize;
}
```
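
As a hypothetical example of the new trait (only the `ChunkSizer` name and the `size` signature come from this release; the struct and counting logic are illustrative), a custom sizer that measures chunks by character count could look like this:

```rust
use text_splitter::ChunkSizer;

/// Hypothetical sizer that measures a chunk by its number of `char`s.
struct CharCount;

impl ChunkSizer for CharCount {
    fn size(&self, chunk: &str) -> usize {
        chunk.chars().count()
    }
}
```

Such a sizer can then be plugged in the same way as in the benchmarks below, e.g. `TextSplitter::new(ChunkConfig::new(1000).with_sizer(CharCount))`.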

## v0.13.3

Fixes broken PyPI publish because of a bad dev dependency specification.
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -2,7 +2,7 @@
members = ["bindings/*"]

[workspace.package]
version = "0.13.3"
version = "0.14.0"
authors = ["Ben Brandt <benjamin.j.brandt@gmail.com>"]
edition = "2021"
description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens, and is callable from Rust and Python."
111 changes: 49 additions & 62 deletions benches/chunk_size.rs
@@ -1,9 +1,11 @@
#![allow(missing_docs)]

use std::path::PathBuf;
use std::{fs, path::PathBuf};

use ahash::AHashMap;
use cached_path::Cache;
use divan::AllocProfiler;
use once_cell::sync::Lazy;

#[global_allocator]
static ALLOC: AllocProfiler = AllocProfiler::system();
@@ -30,29 +32,54 @@ fn download_file_to_cache(src: &str) -> PathBuf {
.unwrap()
}

const TEXT_FILENAMES: &[&str] = &["romeo_and_juliet", "room_with_a_view"];
const MARKDOWN_FILENAMES: &[&str] = &["commonmark_spec"];
const CODE_FILENAMES: &[&str] = &["hashbrown_set_rs"];

static FILES: Lazy<AHashMap<&'static str, String>> = Lazy::new(|| {
let mut m = AHashMap::new();
for &name in TEXT_FILENAMES {
m.insert(
name,
fs::read_to_string(format!("tests/inputs/text/{name}.txt")).unwrap(),
);
}
for &name in MARKDOWN_FILENAMES {
m.insert(
name,
fs::read_to_string(format!("tests/inputs/markdown/{name}.md")).unwrap(),
);
}
for &name in CODE_FILENAMES {
m.insert(
name,
fs::read_to_string(format!("tests/inputs/code/{name}.txt")).unwrap(),
);
}
m
});

static BERT_TOKENIZER: Lazy<rust_tokenizers::tokenizer::BertTokenizer> = Lazy::new(|| {
let vocab_path = download_file_to_cache(
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
);
rust_tokenizers::tokenizer::BertTokenizer::from_file(vocab_path, false, false).unwrap()
});

#[divan::bench_group]
mod text {
use std::fs;

use divan::{black_box_drop, counter::BytesCount, Bencher};
use text_splitter::{ChunkConfig, ChunkSizer, TextSplitter};

use crate::CHUNK_SIZES;

const TEXT_FILENAMES: &[&str] = &["romeo_and_juliet", "room_with_a_view"];
use crate::{CHUNK_SIZES, FILES, TEXT_FILENAMES};

fn bench<S, G>(bencher: Bencher<'_, '_>, filename: &str, gen_splitter: G)
where
G: Fn() -> TextSplitter<S> + Sync,
S: ChunkSizer,
{
bencher
.with_inputs(|| {
(
gen_splitter(),
fs::read_to_string(format!("tests/inputs/text/{filename}.txt")).unwrap(),
)
})
.with_inputs(|| (gen_splitter(), FILES.get(filename).unwrap().clone()))
.input_counter(|(_, text)| BytesCount::of_str(text))
.bench_values(|(splitter, text)| {
splitter.chunks(&text).for_each(black_box_drop);
@@ -86,46 +113,29 @@ mod text {
#[cfg(feature = "rust-tokenizers")]
#[divan::bench(args = TEXT_FILENAMES, consts = CHUNK_SIZES)]
fn rust_tokenizers<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
use crate::download_file_to_cache;
use crate::BERT_TOKENIZER;

bench(bencher, filename, || {
let vocab_path = download_file_to_cache(
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
);
TextSplitter::new(
ChunkConfig::new(N).with_sizer(
rust_tokenizers::tokenizer::BertTokenizer::from_file(vocab_path, false, false)
.unwrap(),
),
)
TextSplitter::new(ChunkConfig::new(N).with_sizer(&*BERT_TOKENIZER))
});
}
}

#[cfg(feature = "markdown")]
#[divan::bench_group]
mod markdown {
use std::fs;

use divan::{black_box_drop, counter::BytesCount, Bencher};
use text_splitter::{ChunkConfig, ChunkSizer, MarkdownSplitter};

use crate::CHUNK_SIZES;

const MARKDOWN_FILENAMES: &[&str] = &["commonmark_spec"];
use crate::{CHUNK_SIZES, FILES, MARKDOWN_FILENAMES};

fn bench<S, G>(bencher: Bencher<'_, '_>, filename: &str, gen_splitter: G)
where
G: Fn() -> MarkdownSplitter<S> + Sync,
S: ChunkSizer,
{
bencher
.with_inputs(|| {
(
gen_splitter(),
fs::read_to_string(format!("tests/inputs/markdown/{filename}.md")).unwrap(),
)
})
.with_inputs(|| (gen_splitter(), FILES.get(filename).unwrap().clone()))
.input_counter(|(_, text)| BytesCount::of_str(text))
.bench_values(|(splitter, text)| {
splitter.chunks(&text).for_each(black_box_drop);
@@ -159,46 +169,29 @@ mod markdown {
#[cfg(feature = "rust-tokenizers")]
#[divan::bench(args = MARKDOWN_FILENAMES, consts = CHUNK_SIZES)]
fn rust_tokenizers<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
use crate::download_file_to_cache;
use crate::BERT_TOKENIZER;

bench(bencher, filename, || {
let vocab_path = download_file_to_cache(
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
);
MarkdownSplitter::new(
ChunkConfig::new(N).with_sizer(
rust_tokenizers::tokenizer::BertTokenizer::from_file(vocab_path, false, false)
.unwrap(),
),
)
MarkdownSplitter::new(ChunkConfig::new(N).with_sizer(&*BERT_TOKENIZER))
});
}
}

#[cfg(feature = "code")]
#[divan::bench_group]
mod code {
use std::fs;

use divan::{black_box_drop, counter::BytesCount, Bencher};
use text_splitter::{ChunkConfig, ChunkSizer, CodeSplitter};

use crate::CHUNK_SIZES;

const CODE_FILENAMES: &[&str] = &["hashbrown_set_rs"];
use crate::{CHUNK_SIZES, CODE_FILENAMES, FILES};

fn bench<S, G>(bencher: Bencher<'_, '_>, filename: &str, gen_splitter: G)
where
G: Fn() -> CodeSplitter<S> + Sync,
S: ChunkSizer,
{
bencher
.with_inputs(|| {
(
gen_splitter(),
fs::read_to_string(format!("tests/inputs/code/{filename}.txt")).unwrap(),
)
})
.with_inputs(|| (gen_splitter(), FILES.get(filename).unwrap().clone()))
.input_counter(|(_, text)| BytesCount::of_str(text))
.bench_values(|(splitter, text)| {
splitter.chunks(&text).for_each(black_box_drop);
@@ -240,18 +233,12 @@ mod code {
#[cfg(feature = "rust-tokenizers")]
#[divan::bench(args = CODE_FILENAMES, consts = CHUNK_SIZES)]
fn rust_tokenizers<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
use crate::download_file_to_cache;
use crate::BERT_TOKENIZER;

bench(bencher, filename, || {
let vocab_path = download_file_to_cache(
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
);
CodeSplitter::new(
tree_sitter_rust::language(),
ChunkConfig::new(N).with_sizer(
rust_tokenizers::tokenizer::BertTokenizer::from_file(vocab_path, false, false)
.unwrap(),
),
ChunkConfig::new(N).with_sizer(&*BERT_TOKENIZER),
)
.unwrap()
});