From d8a3806f26f00c9e8fc1e347c4b05cab014e8782 Mon Sep 17 00:00:00 2001 From: Stuart Page <38261603+stuartjohnpage@users.noreply.github.com> Date: Fri, 12 Apr 2024 12:22:39 -0500 Subject: [PATCH] Chore // Adds nimble options to validate chunking options (#19) * adds nimble options; adds tests for invalid opts * removes tuple requirement --- lib/text_chunker.ex | 54 +++++++++++++++++++++++++++++------ mix.exs | 1 + mix.lock | 1 + test/recursive_chunk_test.exs | 45 +++++++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 9 deletions(-) diff --git a/lib/text_chunker.ex b/lib/text_chunker.ex index 6d0f8bd..c695cba 100644 --- a/lib/text_chunker.ex +++ b/lib/text_chunker.ex @@ -7,9 +7,46 @@ defmodule TextChunker do * **Customizable Splitting:** Allows the splitting strategy to be customized via the `:strategy` option. * **Size and Overlap Control:** Provides options for `:chunk_size` and `:chunk_overlap`. * **Metadata Tracking:** Generates `Chunk` structs containing byte range information. + + **Supported Options** + * `:chunk_size` (positive integer, default: 2000) - Maximum size in code point length for each chunk. + * `:chunk_overlap` (non-negative integer, default: 200) - Number of overlapping code points between consecutive chunks to preserve context. + * `:strategy` (module, default: `RecursiveChunk`) - A module implementing the split function. Currently only `RecursiveChunk` is supported. + * `:format` (atom, default: `:plaintext`) - The format of the input text. Used to determine where to split the text in some strategies. 
""" alias TextChunker.Strategies.RecursiveChunk + @supported_strategies [RecursiveChunk] + + @supported_formats [ + :doc, + :docx, + :epub, + :latex, + :odt, + :pdf, + :rtf, + :markdown, + :plaintext, + :elixir, + :ruby, + :php, + :python, + :vue, + :javascript, + :typescript + ] + + @opts_schema [ + strategy: [required: true, type: {:in, @supported_strategies}], + chunk_overlap: [required: true, type: :non_neg_integer], + chunk_size: [required: true, type: :pos_integer], + format: [ + required: true, + type: {:in, @supported_formats} + ] + ] + @default_opts [ chunk_size: 2000, chunk_overlap: 200, @@ -20,13 +57,6 @@ defmodule TextChunker do @doc """ Splits the provided text into a list of `%Chunk{}` structs. - ## Options - - * `:chunk_size` (integer, default: 2000) - Maximum size in code point length for each chunk. - * `:chunk_overlap` (integer, default: 200) - Number of overlapping code points between consecutive chunks to preserve context. - * `:strategy` (function, default: `&RecursiveChunk.split/2`) - A function taking two arguments (text and options) and returning a list of `%Chunk{}` structs. Currently only `&RecursiveChunk.split/2` is fully supported. - * `:format` (atom, default: `:plaintext`) - The format of the input text. Used to determine where to split the text in some strategies. 
- ## Examples ```elixir @@ -39,10 +69,16 @@ defmodule TextChunker do # => Generates many smaller chunks with significant overlap """ - @spec split(binary(), keyword()) :: [Chunk.t()] + @spec split(binary(), keyword()) :: [Chunk.t()] | {:error, String.t()} def split(text, opts \\ []) do opts = Keyword.merge(@default_opts, opts) - opts[:strategy].split(text, opts) + case NimbleOptions.validate(opts, @opts_schema) do + {:ok, args} -> + opts[:strategy].split(text, args) + + {:error, %NimbleOptions.ValidationError{message: message}} -> + {:error, message} + end end end diff --git a/mix.exs b/mix.exs index 58eafb1..58e9181 100644 --- a/mix.exs +++ b/mix.exs @@ -37,6 +37,7 @@ defmodule TextChunker.MixProject do [ {:styler, "~> 0.7", only: [:dev, :test], runtime: false}, {:ex_doc, "~> 0.29", only: [:dev, :test], runtime: false}, + {:nimble_options, "~> 1.0"} ] end diff --git a/mix.lock b/mix.lock index 31a2c90..0c8fb52 100644 --- a/mix.lock +++ b/mix.lock @@ -4,6 +4,7 @@ "makeup": {:hex, :makeup, "1.1.1", "fa0bc768698053b2b3869fa8a62616501ff9d11a562f3ce39580d60860c3a55e", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "5dc62fbdd0de44de194898b6710692490be74baa02d9d108bc29f007783b0b48"}, "makeup_elixir": {:hex, :makeup_elixir, "0.16.2", "627e84b8e8bf22e60a2579dad15067c755531fea049ae26ef1020cad58fe9578", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "41193978704763f6bbe6cc2758b84909e62984c7752b3784bd3c218bb341706b"}, "makeup_erlang": {:hex, :makeup_erlang, "0.1.5", "e0ff5a7c708dda34311f7522a8758e23bfcd7d8d8068dc312b5eb41c6fd76eba", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "94d2e986428585a21516d7d7149781480013c56e30c6a233534bedf38867a59a"}, + "nimble_options": {:hex, :nimble_options, "1.1.0", 
"3b31a57ede9cb1502071fade751ab0c7b8dbe75a9a4c2b5bbb0943a690b63172", [:mix], [], "hexpm", "8bbbb3941af3ca9acc7835f5655ea062111c9c27bcac53e004460dfd19008a99"}, "nimble_parsec": {:hex, :nimble_parsec, "1.4.0", "51f9b613ea62cfa97b25ccc2c1b4216e81df970acd8e16e8d1bdc58fef21370d", [:mix], [], "hexpm", "9c565862810fb383e9838c1dd2d7d2c437b3d13b267414ba6af33e50d2d1cf28"}, "styler": {:hex, :styler, "0.11.9", "2595393b94e660cd6e8b582876337cc50ff047d184ccbed42fdad2bfd5d78af5", [:mix], [], "hexpm", "8b7806ba1fdc94d0a75127c56875f91db89b75117fcc67572661010c13e1f259"}, } diff --git a/test/recursive_chunk_test.exs b/test/recursive_chunk_test.exs index 9942592..70b4af9 100644 --- a/test/recursive_chunk_test.exs +++ b/test/recursive_chunk_test.exs @@ -355,4 +355,49 @@ defmodule TextChunkerTest do assert result == expected_result end end + + describe "rejects unsupported options" do + test "rejects a chunk_overlap of -1" do + opts = [ + chunk_overlap: -1 + ] + + result = TextChunker.split("this should fail", opts) + assert result == {:error, "invalid value for :chunk_overlap option: expected non negative integer, got: -1"} + end + + test "rejects a chunk_size of 0" do + opts = [ + chunk_size: 0 + ] + + result = TextChunker.split("this should fail", opts) + assert result == {:error, "invalid value for :chunk_size option: expected positive integer, got: 0"} + end + + test "rejects an unsupported format" do + opts = [ + format: :made_up_format + ] + + result = TextChunker.split("this should fail", opts) + + assert result == { + :error, + "invalid value for :format option: expected one of [:doc, :docx, :epub, :latex, :odt, :pdf, :rtf, :markdown, :plaintext, :elixir, :ruby, :php, :python, :vue, :javascript, :typescript], got: :made_up_format" + } + end + + test "rejects a strategy that is not currently supported" do + opts = [ + strategy: UnsupportedModule + ] + + result = TextChunker.split("this should fail", opts) + + assert result == + {:error, + "invalid value for :strategy option: 
expected one of [TextChunker.Strategies.RecursiveChunk], got: UnsupportedModule"} + end + end end