Commit

New attempt at finding best effort binary search window

benbrandt committed Jun 21, 2024
1 parent c11dad3 commit 26998f1

Showing 3 changed files with 145 additions and 88 deletions.
111 changes: 49 additions & 62 deletions benches/chunk_size.rs
@@ -1,9 +1,11 @@
#![allow(missing_docs)]

use std::path::PathBuf;
use std::{fs, path::PathBuf};

use ahash::AHashMap;
use cached_path::Cache;
use divan::AllocProfiler;
use once_cell::sync::Lazy;

#[global_allocator]
static ALLOC: AllocProfiler = AllocProfiler::system();
@@ -30,29 +32,54 @@ fn download_file_to_cache(src: &str) -> PathBuf {
.unwrap()
}

const TEXT_FILENAMES: &[&str] = &["romeo_and_juliet", "room_with_a_view"];
const MARKDOWN_FILENAMES: &[&str] = &["commonmark_spec"];
const CODE_FILENAMES: &[&str] = &["hashbrown_set_rs"];

static FILES: Lazy<AHashMap<&'static str, String>> = Lazy::new(|| {
let mut m = AHashMap::new();
for &name in TEXT_FILENAMES {
m.insert(
name,
fs::read_to_string(format!("tests/inputs/text/{name}.txt")).unwrap(),
);
}
for &name in MARKDOWN_FILENAMES {
m.insert(
name,
fs::read_to_string(format!("tests/inputs/markdown/{name}.md")).unwrap(),
);
}
for &name in CODE_FILENAMES {
m.insert(
name,
fs::read_to_string(format!("tests/inputs/code/{name}.txt")).unwrap(),
);
}
m
});

static BERT_TOKENIZER: Lazy<rust_tokenizers::tokenizer::BertTokenizer> = Lazy::new(|| {
let vocab_path = download_file_to_cache(
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
);
rust_tokenizers::tokenizer::BertTokenizer::from_file(vocab_path, false, false).unwrap()
});

#[divan::bench_group]
mod text {
use std::fs;

use divan::{black_box_drop, counter::BytesCount, Bencher};
use text_splitter::{ChunkConfig, ChunkSizer, TextSplitter};

use crate::CHUNK_SIZES;

const TEXT_FILENAMES: &[&str] = &["romeo_and_juliet", "room_with_a_view"];
use crate::{CHUNK_SIZES, FILES, TEXT_FILENAMES};

fn bench<S, G>(bencher: Bencher<'_, '_>, filename: &str, gen_splitter: G)
where
G: Fn() -> TextSplitter<S> + Sync,
S: ChunkSizer,
{
bencher
.with_inputs(|| {
(
gen_splitter(),
fs::read_to_string(format!("tests/inputs/text/{filename}.txt")).unwrap(),
)
})
.with_inputs(|| (gen_splitter(), FILES.get(filename).unwrap().clone()))
.input_counter(|(_, text)| BytesCount::of_str(text))
.bench_values(|(splitter, text)| {
splitter.chunks(&text).for_each(black_box_drop);
@@ -86,46 +113,29 @@ mod text {
#[cfg(feature = "rust-tokenizers")]
#[divan::bench(args = TEXT_FILENAMES, consts = CHUNK_SIZES)]
fn rust_tokenizers<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
use crate::download_file_to_cache;
use crate::BERT_TOKENIZER;

bench(bencher, filename, || {
let vocab_path = download_file_to_cache(
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
);
TextSplitter::new(
ChunkConfig::new(N).with_sizer(
rust_tokenizers::tokenizer::BertTokenizer::from_file(vocab_path, false, false)
.unwrap(),
),
)
TextSplitter::new(ChunkConfig::new(N).with_sizer(&*BERT_TOKENIZER))
});
}
}

#[cfg(feature = "markdown")]
#[divan::bench_group]
mod markdown {
use std::fs;

use divan::{black_box_drop, counter::BytesCount, Bencher};
use text_splitter::{ChunkConfig, ChunkSizer, MarkdownSplitter};

use crate::CHUNK_SIZES;

const MARKDOWN_FILENAMES: &[&str] = &["commonmark_spec"];
use crate::{CHUNK_SIZES, FILES, MARKDOWN_FILENAMES};

fn bench<S, G>(bencher: Bencher<'_, '_>, filename: &str, gen_splitter: G)
where
G: Fn() -> MarkdownSplitter<S> + Sync,
S: ChunkSizer,
{
bencher
.with_inputs(|| {
(
gen_splitter(),
fs::read_to_string(format!("tests/inputs/markdown/{filename}.md")).unwrap(),
)
})
.with_inputs(|| (gen_splitter(), FILES.get(filename).unwrap().clone()))
.input_counter(|(_, text)| BytesCount::of_str(text))
.bench_values(|(splitter, text)| {
splitter.chunks(&text).for_each(black_box_drop);
@@ -159,46 +169,29 @@ mod markdown {
#[cfg(feature = "rust-tokenizers")]
#[divan::bench(args = MARKDOWN_FILENAMES, consts = CHUNK_SIZES)]
fn rust_tokenizers<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
use crate::download_file_to_cache;
use crate::BERT_TOKENIZER;

bench(bencher, filename, || {
let vocab_path = download_file_to_cache(
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
);
MarkdownSplitter::new(
ChunkConfig::new(N).with_sizer(
rust_tokenizers::tokenizer::BertTokenizer::from_file(vocab_path, false, false)
.unwrap(),
),
)
MarkdownSplitter::new(ChunkConfig::new(N).with_sizer(&*BERT_TOKENIZER))
});
}
}

#[cfg(feature = "code")]
#[divan::bench_group]
mod code {
use std::fs;

use divan::{black_box_drop, counter::BytesCount, Bencher};
use text_splitter::{ChunkConfig, ChunkSizer, CodeSplitter};

use crate::CHUNK_SIZES;

const CODE_FILENAMES: &[&str] = &["hashbrown_set_rs"];
use crate::{CHUNK_SIZES, CODE_FILENAMES, FILES};

fn bench<S, G>(bencher: Bencher<'_, '_>, filename: &str, gen_splitter: G)
where
G: Fn() -> CodeSplitter<S> + Sync,
S: ChunkSizer,
{
bencher
.with_inputs(|| {
(
gen_splitter(),
fs::read_to_string(format!("tests/inputs/code/{filename}.txt")).unwrap(),
)
})
.with_inputs(|| (gen_splitter(), FILES.get(filename).unwrap().clone()))
.input_counter(|(_, text)| BytesCount::of_str(text))
.bench_values(|(splitter, text)| {
splitter.chunks(&text).for_each(black_box_drop);
@@ -240,18 +233,12 @@ mod code {
#[cfg(feature = "rust-tokenizers")]
#[divan::bench(args = CODE_FILENAMES, consts = CHUNK_SIZES)]
fn rust_tokenizers<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
use crate::download_file_to_cache;
use crate::BERT_TOKENIZER;

bench(bencher, filename, || {
let vocab_path = download_file_to_cache(
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
);
CodeSplitter::new(
tree_sitter_rust::language(),
ChunkConfig::new(N).with_sizer(
rust_tokenizers::tokenizer::BertTokenizer::from_file(vocab_path, false, false)
.unwrap(),
),
ChunkConfig::new(N).with_sizer(&*BERT_TOKENIZER),
)
.unwrap()
});
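
The refactor above stops re-reading each input file and rebuilding the BERT tokenizer during every benchmark setup: the inputs and the tokenizer now live in once_cell::sync::Lazy statics that are built once and borrowed by every bench group (for example with_sizer(&*BERT_TOKENIZER)). A minimal standalone sketch of that sharing pattern follows; the INPUTS static and its single entry are illustrative placeholders, not code from this commit.

use ahash::AHashMap;
use once_cell::sync::Lazy;

// Built on first access, then shared by every caller that borrows it.
static INPUTS: Lazy<AHashMap<&'static str, String>> = Lazy::new(|| {
    let mut m = AHashMap::new();
    // Placeholder entry; the real benches read files from tests/inputs/.
    m.insert("example", String::from("some benchmark input"));
    m
});

fn main() {
    // `&*INPUTS` dereferences the Lazy wrapper and borrows the inner value,
    // the same shape as `with_sizer(&*BERT_TOKENIZER)` in the diff above.
    let inputs: &AHashMap<&'static str, String> = &*INPUTS;
    println!("{}", inputs.get("example").unwrap());
}
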
6 changes: 4 additions & 2 deletions src/chunk_size.rs
@@ -434,8 +434,9 @@ where
&mut self,
offset: usize,
levels_with_first_chunk: impl Iterator<Item = (L, &'text str)>,
) -> Option<L> {
) -> (Option<L>, Option<usize>) {
let mut semantic_level = None;
let mut max_offset = None;

// We assume that larger levels are also longer. We can skip lower levels if going to a higher level would result in a shorter text
let levels_with_first_chunk =
Expand All @@ -451,13 +452,14 @@ where
let chunk_size = self.check_capacity(offset, str, false);
// If this no longer fits, we use the level we are at.
if chunk_size.fits.is_gt() {
max_offset = Some(offset + str.len());
break;
}
// Otherwise break up the text with the next level
semantic_level = Some(level);
}

semantic_level
(semantic_level, max_offset)
}

/// Clear the cached values. Once we've moved the cursor,
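
The signature change above also returns a max_offset: the end offset of the first level whose chunk no longer fits. Per the commit message, this gives the caller an upper bound for a best-effort binary search window. The caller is not part of this diff, so the following is only a hypothetical Rust sketch of how such a bound could narrow a search over candidate end offsets; best_fit_end, candidates, and fits are illustrative names, not the crate's API.

// Hypothetical sketch: binary search for the largest candidate end offset that
// still fits, ignoring anything past a known-too-large max_offset.
fn best_fit_end(
    candidates: &[usize],
    max_offset: Option<usize>,
    fits: impl Fn(usize) -> bool,
) -> Option<usize> {
    // Drop candidates past the first offset already known to exceed the chunk size.
    let upper = match max_offset {
        Some(max) => candidates.partition_point(|&end| end <= max),
        None => candidates.len(),
    };
    let window = &candidates[..upper];

    // Standard binary search for the last element satisfying the predicate.
    let (mut lo, mut hi) = (0, window.len());
    while lo < hi {
        let mid = lo + (hi - lo) / 2;
        if fits(window[mid]) {
            lo = mid + 1;
        } else {
            hi = mid;
        }
    }
    lo.checked_sub(1).map(|i| window[i])
}

fn main() {
    let candidates = [10, 20, 30, 40, 50];
    // Pretend anything ending at or before byte 35 fits the chunk size,
    // and that byte 45 was already observed to be too large.
    assert_eq!(best_fit_end(&candidates, Some(45), |end| end <= 35), Some(30));
}
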
(diff for the third changed file not shown)
