Commit

New attempt at finding best effort binary search window

benbrandt committed Jun 21, 2024
1 parent c11dad3 commit 26998f1

Showing 3 changed files with 145 additions and 88 deletions.
111 changes: 49 additions & 62 deletions benches/chunk_size.rs
@@ -1,9 +1,11 @@
#![allow(missing_docs)]

use std::path::PathBuf;
use std::{fs, path::PathBuf};

use ahash::AHashMap;
use cached_path::Cache;
use divan::AllocProfiler;
use once_cell::sync::Lazy;

#[global_allocator]
static ALLOC: AllocProfiler = AllocProfiler::system();
@@ -30,29 +32,54 @@ fn download_file_to_cache(src: &str) -> PathBuf {
.unwrap()
}

const TEXT_FILENAMES: &[&str] = &["romeo_and_juliet", "room_with_a_view"];
const MARKDOWN_FILENAMES: &[&str] = &["commonmark_spec"];
const CODE_FILENAMES: &[&str] = &["hashbrown_set_rs"];

static FILES: Lazy<AHashMap<&'static str, String>> = Lazy::new(|| {
let mut m = AHashMap::new();
for &name in TEXT_FILENAMES {
m.insert(
name,
fs::read_to_string(format!("tests/inputs/text/{name}.txt")).unwrap(),
);
}
for &name in MARKDOWN_FILENAMES {
m.insert(
name,
fs::read_to_string(format!("tests/inputs/markdown/{name}.md")).unwrap(),
);
}
for &name in CODE_FILENAMES {
m.insert(
name,
fs::read_to_string(format!("tests/inputs/code/{name}.txt")).unwrap(),
);
}
m
});

static BERT_TOKENIZER: Lazy<rust_tokenizers::tokenizer::BertTokenizer> = Lazy::new(|| {
let vocab_path = download_file_to_cache(
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
);
rust_tokenizers::tokenizer::BertTokenizer::from_file(vocab_path, false, false).unwrap()
});

#[divan::bench_group]
mod text {
use std::fs;

use divan::{black_box_drop, counter::BytesCount, Bencher};
use text_splitter::{ChunkConfig, ChunkSizer, TextSplitter};

use crate::CHUNK_SIZES;

const TEXT_FILENAMES: &[&str] = &["romeo_and_juliet", "room_with_a_view"];
use crate::{CHUNK_SIZES, FILES, TEXT_FILENAMES};

fn bench<S, G>(bencher: Bencher<'_, '_>, filename: &str, gen_splitter: G)
where
G: Fn() -> TextSplitter<S> + Sync,
S: ChunkSizer,
{
bencher
.with_inputs(|| {
(
gen_splitter(),
fs::read_to_string(format!("tests/inputs/text/{filename}.txt")).unwrap(),
)
})
.with_inputs(|| (gen_splitter(), FILES.get(filename).unwrap().clone()))
.input_counter(|(_, text)| BytesCount::of_str(text))
.bench_values(|(splitter, text)| {
splitter.chunks(&text).for_each(black_box_drop);
@@ -86,46 +113,29 @@ mod text {
#[cfg(feature = "rust-tokenizers")]
#[divan::bench(args = TEXT_FILENAMES, consts = CHUNK_SIZES)]
fn rust_tokenizers<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
use crate::download_file_to_cache;
use crate::BERT_TOKENIZER;

bench(bencher, filename, || {
let vocab_path = download_file_to_cache(
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
);
TextSplitter::new(
ChunkConfig::new(N).with_sizer(
rust_tokenizers::tokenizer::BertTokenizer::from_file(vocab_path, false, false)
.unwrap(),
),
)
TextSplitter::new(ChunkConfig::new(N).with_sizer(&*BERT_TOKENIZER))
});
}
}

#[cfg(feature = "markdown")]
#[divan::bench_group]
mod markdown {
use std::fs;

use divan::{black_box_drop, counter::BytesCount, Bencher};
use text_splitter::{ChunkConfig, ChunkSizer, MarkdownSplitter};

use crate::CHUNK_SIZES;

const MARKDOWN_FILENAMES: &[&str] = &["commonmark_spec"];
use crate::{CHUNK_SIZES, FILES, MARKDOWN_FILENAMES};

fn bench<S, G>(bencher: Bencher<'_, '_>, filename: &str, gen_splitter: G)
where
G: Fn() -> MarkdownSplitter<S> + Sync,
S: ChunkSizer,
{
bencher
.with_inputs(|| {
(
gen_splitter(),
fs::read_to_string(format!("tests/inputs/markdown/{filename}.md")).unwrap(),
)
})
.with_inputs(|| (gen_splitter(), FILES.get(filename).unwrap().clone()))
.input_counter(|(_, text)| BytesCount::of_str(text))
.bench_values(|(splitter, text)| {
splitter.chunks(&text).for_each(black_box_drop);
@@ -159,46 +169,29 @@ mod markdown {
#[cfg(feature = "rust-tokenizers")]
#[divan::bench(args = MARKDOWN_FILENAMES, consts = CHUNK_SIZES)]
fn rust_tokenizers<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
use crate::download_file_to_cache;
use crate::BERT_TOKENIZER;

bench(bencher, filename, || {
let vocab_path = download_file_to_cache(
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
);
MarkdownSplitter::new(
ChunkConfig::new(N).with_sizer(
rust_tokenizers::tokenizer::BertTokenizer::from_file(vocab_path, false, false)
.unwrap(),
),
)
MarkdownSplitter::new(ChunkConfig::new(N).with_sizer(&*BERT_TOKENIZER))
});
}
}

#[cfg(feature = "code")]
#[divan::bench_group]
mod code {
use std::fs;

use divan::{black_box_drop, counter::BytesCount, Bencher};
use text_splitter::{ChunkConfig, ChunkSizer, CodeSplitter};

use crate::CHUNK_SIZES;

const CODE_FILENAMES: &[&str] = &["hashbrown_set_rs"];
use crate::{CHUNK_SIZES, CODE_FILENAMES, FILES};

fn bench<S, G>(bencher: Bencher<'_, '_>, filename: &str, gen_splitter: G)
where
G: Fn() -> CodeSplitter<S> + Sync,
S: ChunkSizer,
{
bencher
.with_inputs(|| {
(
gen_splitter(),
fs::read_to_string(format!("tests/inputs/code/{filename}.txt")).unwrap(),
)
})
.with_inputs(|| (gen_splitter(), FILES.get(filename).unwrap().clone()))
.input_counter(|(_, text)| BytesCount::of_str(text))
.bench_values(|(splitter, text)| {
splitter.chunks(&text).for_each(black_box_drop);
@@ -240,18 +233,12 @@ mod code {
#[cfg(feature = "rust-tokenizers")]
#[divan::bench(args = CODE_FILENAMES, consts = CHUNK_SIZES)]
fn rust_tokenizers<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
use crate::download_file_to_cache;
use crate::BERT_TOKENIZER;

bench(bencher, filename, || {
let vocab_path = download_file_to_cache(
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
);
CodeSplitter::new(
tree_sitter_rust::language(),
ChunkConfig::new(N).with_sizer(
rust_tokenizers::tokenizer::BertTokenizer::from_file(vocab_path, false, false)
.unwrap(),
),
ChunkConfig::new(N).with_sizer(&*BERT_TOKENIZER),
)
.unwrap()
});
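
The refactor above stops re-reading each input file and rebuilding the BERT tokenizer during every benchmark setup: the inputs and the tokenizer now live in once_cell::sync::Lazy statics that are built once and borrowed by every bench group (for example with_sizer(&*BERT_TOKENIZER)). A minimal standalone sketch of that sharing pattern follows; the INPUTS static and its single entry are illustrative placeholders, not code from this commit.

use ahash::AHashMap;
use once_cell::sync::Lazy;

// Built on first access, then shared by every caller that borrows it.
static INPUTS: Lazy<AHashMap<&'static str, String>> = Lazy::new(|| {
    let mut m = AHashMap::new();
    // Placeholder entry; the real benches read files from tests/inputs/.
    m.insert("example", String::from("some benchmark input"));
    m
});

fn main() {
    // `&*INPUTS` dereferences the Lazy wrapper and borrows the inner value,
    // the same shape as `with_sizer(&*BERT_TOKENIZER)` in the diff above.
    let inputs: &AHashMap<&'static str, String> = &*INPUTS;
    println!("{}", inputs.get("example").unwrap());
}
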
6 changes: 4 additions & 2 deletions src/chunk_size.rs
@@ -434,8 +434,9 @@ where
&mut self,
offset: usize,
levels_with_first_chunk: impl Iterator<Item = (L, &'text str)>,
) -> Option<L> {
) -> (Option<L>, Option<usize>) {
let mut semantic_level = None;
let mut max_offset = None;

// We assume that larger levels are also longer. We can skip lower levels if going to a higher level would result in a shorter text
let levels_with_first_chunk =
Expand All @@ -451,13 +452,14 @@ where
let chunk_size = self.check_capacity(offset, str, false);
// If this no longer fits, we use the level we are at.
if chunk_size.fits.is_gt() {
max_offset = Some(offset + str.len());
break;
}
// Otherwise break up the text with the next level
semantic_level = Some(level);
}

semantic_level
(semantic_level, max_offset)
}

/// Clear the cached values. Once we've moved the cursor,
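
The signature change above also returns a max_offset: the end offset of the first level whose chunk no longer fits. Per the commit message, this gives the caller an upper bound for a best-effort binary search window. The caller is not part of this diff, so the following is only a hypothetical Rust sketch of how such a bound could narrow a search over candidate end offsets; best_fit_end, candidates, and fits are illustrative names, not the crate's API.

// Hypothetical sketch: binary search for the largest candidate end offset that
// still fits, ignoring anything past a known-too-large max_offset.
fn best_fit_end(
    candidates: &[usize],
    max_offset: Option<usize>,
    fits: impl Fn(usize) -> bool,
) -> Option<usize> {
    // Drop candidates past the first offset already known to exceed the chunk size.
    let upper = match max_offset {
        Some(max) => candidates.partition_point(|&end| end <= max),
        None => candidates.len(),
    };
    let window = &candidates[..upper];

    // Standard binary search for the last element satisfying the predicate.
    let (mut lo, mut hi) = (0, window.len());
    while lo < hi {
        let mid = lo + (hi - lo) / 2;
        if fits(window[mid]) {
            lo = mid + 1;
        } else {
            hi = mid;
        }
    }
    lo.checked_sub(1).map(|i| window[i])
}

fn main() {
    let candidates = [10, 20, 30, 40, 50];
    // Pretend anything ending at or before byte 35 fits the chunk size,
    // and that byte 45 was already observed to be too large.
    assert_eq!(best_fit_end(&candidates, Some(45), |end| end <= 35), Some(30));
}
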
(diff for the third changed file not shown)
