From f70459efc863c65421f7ac3962fe088220ce2b32 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20Wei=C3=9F?=
Date: Thu, 1 Feb 2024 10:18:00 +0100
Subject: [PATCH 1/5] Add print output for verification

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 3c7147ff..a5078708 100644
--- a/README.md
+++ b/README.md
@@ -46,6 +46,7 @@ let splitter = TextSplitter::new(tokenizer)
     .with_trim_chunks(true);
 
 let chunks = splitter.chunks("your document text", max_tokens);
+println!("{}", chunks.count());
 ```
 
 ### With Tiktoken Tokenizer
@@ -65,6 +66,7 @@ let splitter = TextSplitter::new(tokenizer)
     .with_trim_chunks(true);
 
 let chunks = splitter.chunks("your document text", max_tokens);
+println!("{}", chunks.count());
 ```
 
 ### Using a Range for Chunk Capacity
@@ -85,6 +87,7 @@ let max_characters = 500..2000;
 let splitter = TextSplitter::default().with_trim_chunks(true);
 
 let chunks = splitter.chunks("your document text", max_characters);
+println!("{}", chunks.count());
 ```
 
 ## Method
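A note on the verification line this patch adds: `chunks()` returns a lazy iterator, so `count()` consumes it and the chunks themselves are gone afterwards. If the chunks are still needed after printing, a sketch like the following (using only the API already shown in the README; the capacity value is arbitrary) collects first and reports the length instead:

```rust
use text_splitter::TextSplitter;

fn main() {
    let max_characters = 1000;
    let splitter = TextSplitter::default().with_trim_chunks(true);

    // Collect the lazy iterator so the chunks survive being counted.
    let chunks: Vec<&str> = splitter
        .chunks("your document text", max_characters)
        .collect();
    println!("{}", chunks.len());
}
```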
From 909c1de0359d721a42670942bf4790dd52ed0c5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20Wei=C3=9F?=
Date: Thu, 1 Feb 2024 10:26:03 +0100
Subject: [PATCH 2/5] Specify requirements for running examples

---
 README.md | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index a5078708..ef120121 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,20 @@ let chunks = splitter.chunks("your document text", max_characters);
 ```
 
 ### With Huggingface Tokenizer
 
-Requires the `tokenizers` feature to be activated.
+
+<details>
+<summary>
+Requires the `tokenizers` feature to be activated and direct declaration of the dependency. The example below, using `from_pretrained()`, also requires tokenizers's `http` feature enabled.
+
+Click to show Cargo.toml.
+</summary>
+
+```toml
+[dependencies]
+text-splitter = { version = "0.6", features = ["tokenizers"] }
+tokenizers = { version = "0.15", features = ["http"] }
+```
+</details>
 
 ```rust
 use text_splitter::TextSplitter;
@@ -50,8 +63,18 @@ let splitter = TextSplitter::new(tokenizer)
 println!("{}", chunks.count());
 ```
 
 ### With Tiktoken Tokenizer
 
-Requires the `tiktoken-rs` feature to be activated.
+<details>
+<summary>
+Requires the `tiktoken-rs` feature to be activated and direct declaration of the dependency.
+
+Click to show Cargo.toml.
+</summary>
+```toml
+text-splitter = { version = "0.6", features = ["tiktoken-rs"] }
+tiktoken-rs = "0.5"
+```
+</details>
 
 ```rust
 use text_splitter::TextSplitter;
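The `http` requirement called out in this patch applies only to `from_pretrained()`, which downloads the tokenizer definition at runtime. When the tokenizer JSON already exists locally, a sketch along these lines (the file path is hypothetical) needs only the base `tokenizers` dependency, without the `http` feature:

```rust
use text_splitter::TextSplitter;
use tokenizers::Tokenizer;

fn main() {
    // from_file() reads a local tokenizer.json, so no network access
    // (and no `http` feature) is required.
    let tokenizer = Tokenizer::from_file("path/to/tokenizer.json")
        .expect("failed to load tokenizer");
    let max_tokens = 1000;
    let splitter = TextSplitter::new(tokenizer).with_trim_chunks(true);

    let chunks = splitter.chunks("your document text", max_tokens);
    println!("{}", chunks.count());
}
```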
From c90d598f46af77e755e12f705b998946a9be3e28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20Wei=C3=9F?=
Date: Thu, 1 Feb 2024 16:06:02 +0100
Subject: [PATCH 3/5] Improve wording

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ef120121..5818a49e 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@ let chunks = splitter.chunks("your document text", max_characters);
 
 <details>
 <summary>
-Requires the `tokenizers` feature to be activated and direct declaration of the dependency. The example below, using `from_pretrained()`, also requires tokenizers's `http` feature enabled.
+Requires the `tokenizers` feature to be activated and adding `tokenizers` to dependencies. The example below, using `from_pretrained()`, also requires tokenizers `http` feature to be enabled.
 
 Click to show Cargo.toml.
 </summary>
@@ -65,7 +65,7 @@ println!("{}", chunks.count());
 ### With Tiktoken Tokenizer
 
 <details>
 <summary>
-Requires the `tiktoken-rs` feature to be activated and direct declaration of the dependency.
+Requires the `tiktoken-rs` feature to be activated and adding `tiktoken-rs` to dependencies.
 
 Click to show Cargo.toml.

From 2d2497ab0b4f256f43cc0b3f8823810984d5917c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20Wei=C3=9F?=
Date: Thu, 1 Feb 2024 16:08:01 +0100
Subject: [PATCH 4/5] Add missing print

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 5818a49e..b853bb09 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@ let splitter = TextSplitter::default()
     .with_trim_chunks(true);
 
 let chunks = splitter.chunks("your document text", max_characters);
+println!("{}", chunks.count());
 ```
 
 ### With Huggingface Tokenizer
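Patch 4 brings the plain character-based example in line with the others. The same verification also works with the range capacity from the README's last example: in that mode a chunk may end at a good semantic boundary once the lower bound is reached, but will not grow past the upper bound. A sketch (range values taken from the README; the per-chunk loop is illustrative):

```rust
use text_splitter::TextSplitter;

fn main() {
    // Chunks may end once they reach 500 characters, but never exceed 2000.
    let max_characters = 500..2000;
    let splitter = TextSplitter::default().with_trim_chunks(true);

    for chunk in splitter.chunks("your document text", max_characters) {
        println!("{} chars", chunk.chars().count());
    }
}
```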
From 57c1d5f30c1df08ca96c1fea150dcf2482d079ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20Wei=C3=9F?=
Date: Thu, 1 Feb 2024 16:08:30 +0100
Subject: [PATCH 5/5] Improve formatting; the "expand" arrow is in an awkward
 position otherwise

---
 README.md | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index b853bb09..37c9ff22 100644
--- a/README.md
+++ b/README.md
@@ -32,11 +32,9 @@ println!("{}", chunks.count());
 ```
 
 ### With Huggingface Tokenizer
 
-
+Requires the `tokenizers` feature to be activated and adding `tokenizers` to dependencies. The example below, using `from_pretrained()`, also requires tokenizers `http` feature to be enabled.
 <details>
 <summary>
-Requires the `tokenizers` feature to be activated and adding `tokenizers` to dependencies. The example below, using `from_pretrained()`, also requires tokenizers `http` feature to be enabled.
-
 Click to show Cargo.toml.
 </summary>
@@ -64,10 +62,11 @@ println!("{}", chunks.count());
 ```
 
 ### With Tiktoken Tokenizer
 
- + Requires the `tiktoken-rs` feature to be activated and adding `tiktoken-rs` to dependencies. +
+ Click to show Cargo.toml.