Skip to content

Commit

Permalink
Chore // Adds nimble options to validate chunking options (#19)
Browse files Browse the repository at this point in the history
* adds nimble options; adds tests for invalid opts

* removes tuple requirement
  • Loading branch information
stuartjohnpage committed Apr 12, 2024
1 parent cbbb0de commit d8a3806
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 9 deletions.
54 changes: 45 additions & 9 deletions lib/text_chunker.ex
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,46 @@ defmodule TextChunker do
* **Customizable Splitting:** Allows the splitting strategy to be customized via the `:strategy` option.
* **Size and Overlap Control:** Provides options for `:chunk_size` and `:chunk_overlap`.
* **Metadata Tracking:** Generates `Chunk` structs containing byte range information.
**Supported Options**
* `:chunk_size` (positive integer, default: 2000) - Maximum size in code point length for each chunk.
* `:chunk_overlap` (non-negative integer, default: 200) - Number of overlapping code points between consecutive chunks to preserve context.
* `:strategy` (module, default: `RecursiveChunk`) - A module implementing a `split/2` function that takes the text and the options. Currently only `RecursiveChunk` is supported.
* `:format` (atom, default: `:plaintext`) - The format of the input text. Used to determine where to split the text in some strategies.
"""
alias TextChunker.Strategies.RecursiveChunk

# Strategy modules accepted for the `:strategy` option.
@supported_strategies [RecursiveChunk]

# Input formats accepted for the `:format` option. The order is kept stable
# because the list is referenced by the `{:in, ...}` type in the schema below.
@supported_formats ~w(
  doc
  docx
  epub
  latex
  odt
  pdf
  rtf
  markdown
  plaintext
  elixir
  ruby
  php
  python
  vue
  javascript
  typescript
)a

# NimbleOptions schema used to validate the options given to `split/2`.
# Every key is marked as required.
@opts_schema [
  strategy: [
    required: true,
    type: {:in, @supported_strategies}
  ],
  chunk_overlap: [
    required: true,
    type: :non_neg_integer
  ],
  chunk_size: [
    required: true,
    type: :pos_integer
  ],
  format: [
    required: true,
    type: {:in, @supported_formats}
  ]
]

@default_opts [
chunk_size: 2000,
chunk_overlap: 200,
Expand All @@ -20,13 +57,6 @@ defmodule TextChunker do
@doc """
Splits the provided text into a list of `%Chunk{}` structs.
## Options
* `:chunk_size` (integer, default: 2000) - Maximum size in code point length for each chunk.
* `:chunk_overlap` (integer, default: 200) - Number of overlapping code points between consecutive chunks to preserve context.
* `:strategy` (function, default: `&RecursiveChunk.split/2`) - A function taking two arguments (text and options) and returning a list of `%Chunk{}` structs. Currently only `&RecursiveChunk.split/2` is fully supported.
* `:format` (atom, default: `:plaintext`) - The format of the input text. Used to determine where to split the text in some strategies.
## Examples
```elixir
Expand All @@ -39,10 +69,16 @@ defmodule TextChunker do
# => Generates many smaller chunks with significant overlap
"""
@spec split(binary(), keyword()) :: [Chunk.t()] | {:error, String.t()}
def split(text, opts \\ []) do
  # Caller-supplied options take precedence over the module defaults.
  merged_opts = Keyword.merge(@default_opts, opts)

  # Validate before dispatching: the strategy is only invoked once the options
  # have passed the schema, so an invalid option is reported as an
  # `{:error, message}` tuple rather than reaching the strategy module.
  case NimbleOptions.validate(merged_opts, @opts_schema) do
    {:ok, validated_opts} ->
      # Dispatch on the validated list so the module that was checked is the
      # module that gets called.
      validated_opts[:strategy].split(text, validated_opts)

    {:error, %NimbleOptions.ValidationError{message: message}} ->
      {:error, message}
  end
end
end
1 change: 1 addition & 0 deletions mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ defmodule TextChunker.MixProject do
[
{:styler, "~> 0.7", only: [:dev, :test], runtime: false},
{:ex_doc, "~> 0.29", only: [:dev, :test], runtime: false},
{:nimble_options, "~> 1.0"}
]
end

Expand Down
1 change: 1 addition & 0 deletions mix.lock
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"makeup": {:hex, :makeup, "1.1.1", "fa0bc768698053b2b3869fa8a62616501ff9d11a562f3ce39580d60860c3a55e", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "5dc62fbdd0de44de194898b6710692490be74baa02d9d108bc29f007783b0b48"},
"makeup_elixir": {:hex, :makeup_elixir, "0.16.2", "627e84b8e8bf22e60a2579dad15067c755531fea049ae26ef1020cad58fe9578", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "41193978704763f6bbe6cc2758b84909e62984c7752b3784bd3c218bb341706b"},
"makeup_erlang": {:hex, :makeup_erlang, "0.1.5", "e0ff5a7c708dda34311f7522a8758e23bfcd7d8d8068dc312b5eb41c6fd76eba", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "94d2e986428585a21516d7d7149781480013c56e30c6a233534bedf38867a59a"},
"nimble_options": {:hex, :nimble_options, "1.1.0", "3b31a57ede9cb1502071fade751ab0c7b8dbe75a9a4c2b5bbb0943a690b63172", [:mix], [], "hexpm", "8bbbb3941af3ca9acc7835f5655ea062111c9c27bcac53e004460dfd19008a99"},
"nimble_parsec": {:hex, :nimble_parsec, "1.4.0", "51f9b613ea62cfa97b25ccc2c1b4216e81df970acd8e16e8d1bdc58fef21370d", [:mix], [], "hexpm", "9c565862810fb383e9838c1dd2d7d2c437b3d13b267414ba6af33e50d2d1cf28"},
"styler": {:hex, :styler, "0.11.9", "2595393b94e660cd6e8b582876337cc50ff047d184ccbed42fdad2bfd5d78af5", [:mix], [], "hexpm", "8b7806ba1fdc94d0a75127c56875f91db89b75117fcc67572661010c13e1f259"},
}
45 changes: 45 additions & 0 deletions test/recursive_chunk_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -355,4 +355,49 @@ defmodule TextChunkerTest do
assert result == expected_result
end
end

# Each case passes a single invalid option to `TextChunker.split/2` and checks
# that an `{:error, message}` tuple with the exact validation message comes back.
describe "rejects unsupported options" do
  test "rejects a chunk_overlap of -1" do
    assert {:error, message} = TextChunker.split("this should fail", chunk_overlap: -1)

    assert message ==
             "invalid value for :chunk_overlap option: expected non negative integer, got: -1"
  end

  test "rejects a chunk_size of 0" do
    assert {:error, message} = TextChunker.split("this should fail", chunk_size: 0)

    assert message ==
             "invalid value for :chunk_size option: expected positive integer, got: 0"
  end

  test "rejects an unsupported format" do
    assert {:error, message} = TextChunker.split("this should fail", format: :made_up_format)

    assert message ==
             "invalid value for :format option: expected one of [:doc, :docx, :epub, :latex, :odt, :pdf, :rtf, :markdown, :plaintext, :elixir, :ruby, :php, :python, :vue, :javascript, :typescript], got: :made_up_format"
  end

  test "rejects a strategy that is not currently supported" do
    assert {:error, message} = TextChunker.split("this should fail", strategy: UnsupportedModule)

    assert message ==
             "invalid value for :strategy option: expected one of [TextChunker.Strategies.RecursiveChunk], got: UnsupportedModule"
  end
end
end

0 comments on commit d8a3806

Please sign in to comment.