Python: Update to text-splitter 0.4.2 (#31)
benbrandt authored Jul 2, 2023
1 parent 2f5f718 commit fed9dde
Showing 4 changed files with 68 additions and 37 deletions.
6 changes: 6 additions & 0 deletions bindings/python/CHANGELOG.md
@@ -1,5 +1,11 @@
# Changelog

## v0.2.2

### What's New

- Update to v0.4.2 of `text-splitter` to support `tiktoken-rs@0.5.0`

## v0.2.1

### What's New
65 changes: 31 additions & 34 deletions bindings/python/Cargo.lock

Some generated files are not rendered by default.

6 changes: 3 additions & 3 deletions bindings/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "semantic-text-splitter"
-version = "0.2.1"
+version = "0.2.2"
 authors = ["Ben Brandt <benjamin.j.brandt@gmail.com>"]
 edition = "2021"
 description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens (when used with large language models)."
@@ -15,8 +15,8 @@ crate-type = ["cdylib"]
 
 [dependencies]
 pyo3 = { version = "0.19.0", features = ["abi3-py37"] }
-text-splitter = { version = "0.4.1", features = ["tiktoken-rs", "tokenizers"] }
-tiktoken-rs = "0.4.2"
+text-splitter = { version = "0.4.2", features = ["tiktoken-rs", "tokenizers"] }
+tiktoken-rs = "0.5.0"
 tokenizers = { version = "0.13.3", default_features = false, features = [
     "onig",
 ] }
28 changes: 28 additions & 0 deletions bindings/python/README.md
@@ -21,6 +21,34 @@ splitter = CharacterTextSplitter(trim_chunks=False)
chunks = splitter.chunks("your document text", max_characters)
```

### With Huggingface Tokenizer

```python
from semantic_text_splitter import HuggingFaceTextSplitter
from tokenizers import Tokenizer

# Maximum number of tokens in a chunk
max_tokens = 1000
# Optionally, you can also have the splitter not trim whitespace for you
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=False)

chunks = splitter.chunks("your document text", max_tokens)
```

### With Tiktoken Tokenizer

```python
from semantic_text_splitter import TiktokenTextSplitter

# Maximum number of tokens in a chunk
max_tokens = 1000
# Optionally, you can also have the splitter not trim whitespace for you
splitter = TiktokenTextSplitter("gpt-3.5-turbo", trim_chunks=False)

chunks = splitter.chunks("your document text", max_tokens)
```

### Using a Range for Chunk Capacity

You also have the option of specifying your chunk capacity as a range.
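
A minimal sketch of what that looks like with the character splitter, assuming the capacity argument of `chunks` also accepts a `(min, max)` tuple (the exact shape of the range argument in v0.2.2 is an assumption here, not confirmed by this diff):

```python
from semantic_text_splitter import CharacterTextSplitter

splitter = CharacterTextSplitter()

# Hypothetical range capacity: each chunk will be at least 200
# characters long and never more than 1000.
chunks = splitter.chunks("your document text", (200, 1000))
```

In the underlying `text-splitter` crate, a range capacity means a chunk is returned once its length falls within the range, so chunks land between the minimum and maximum rather than being packed strictly to the maximum.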
