Skip to content

Commit

Permalink
Python splitting // adds python separators (#9)
Browse files Browse the repository at this point in the history
* adds python separators

* added newline

* fixed tests
  • Loading branch information
stuartjohnpage authored Mar 8, 2024
1 parent fd05321 commit ab728bb
Show file tree
Hide file tree
Showing 4 changed files with 188 additions and 2 deletions.
11 changes: 11 additions & 0 deletions lib/text_chunker/strategies/recursive_chunk/separators.ex
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,17 @@ defmodule TextChunker.Strategies.RecursiveChunk.Separators do
]
end

# Separator strings used when chunking Python source (the `:python` format).
# The list runs from the coarsest boundary to the finest: a `class` definition
# at the start of a line, a `def` at the start of a line, a tab-indented `def`,
# a blank line, a single newline, and finally a single space.
# NOTE(review): presumably the recursive chunker tries these in list order, so
# the ordering is part of the behaviour — confirm against the strategy module.
# NOTE(review): only tab-indented methods ("\n\tdef ") have a dedicated entry;
# space-indented methods have none and appear to be split on "\n\n" or "\n"
# instead — confirm this is intended.
def get_separators(:python) do
[
"\nclass ",
"\ndef ",
"\n\tdef ",
"\n\n",
"\n",
" "
]
end

def get_separators(:vue) do
[
"<script",
Expand Down
127 changes: 125 additions & 2 deletions test/recursive_chunk_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ defmodule TextChunkerTest do

@moduletag timeout: :infinity

describe "plaintext chunker" do
describe "chunker with plaintext separators" do
test "splits multiple sentences correctly" do
opts = [
chunk_size: 50,
Expand Down Expand Up @@ -203,7 +203,7 @@ defmodule TextChunkerTest do
end
end

describe "markdown chunker" do
describe "chunker with markdown separators" do
test "splits a simple markdown file" do
opts = [
chunk_size: 100,
Expand Down Expand Up @@ -232,4 +232,127 @@ defmodule TextChunkerTest do
assert result == expected_result
end
end

describe "chunker with python separators" do
  # Chunks the Python fixture with chunk_size: 100 and chunk_overlap: 0.
  # None of the expected chunks repeats text from its neighbour.
  test "splits a simple python file sensibly with no overlap" do
    {:ok, source} = File.read("test/support/fixtures/document_fixtures/test_code.py")

    chunks =
      TestHelpers.extract_text_from_chunks(
        TextChunker.split(source, chunk_size: 100, chunk_overlap: 0, format: :python)
      )

    expected = [
      "class PetShop:\n \"\"\"Represents a pet shop with inventory and sales functionality.\"\"\"",
      "\n\n def __init__(self, name):\n self.name = name\n self.inventory = {}",
      "\n\n def add_pet(self, pet_type, quantity):",
      "\n \"\"\"Adds a specified quantity of a pet type to the inventory.\"\"\"",
      "\n if pet_type in self.inventory:\n self.inventory[pet_type] += quantity",
      "\n else:\n self.inventory[pet_type] = quantity",
      "\n\n def sell_pet(self, pet_type, quantity):",
      "\n \"\"\"Sells a specified quantity of a pet type.\"\"\"",
      "\n if pet_type in self.inventory and self.inventory[pet_type] >= quantity:",
      "\n self.inventory[pet_type] -= quantity\n return True\n else:",
      "\n return False",
      "\n\n def get_pet_count(self, pet_type):",
      "\n \"\"\"Returns the current count of a specific pet type.\"\"\"",
      "\n return self.inventory.get(pet_type, 0)\n"
    ]

    assert chunks == expected
  end

  # Same fixture and chunk size, but with chunk_overlap: 50. Several expected
  # chunks begin with text that also closes the preceding chunk.
  test "splits a simple python file sensibly with overlap" do
    {:ok, source} = File.read("test/support/fixtures/document_fixtures/test_code.py")

    chunks =
      TestHelpers.extract_text_from_chunks(
        TextChunker.split(source, chunk_size: 100, chunk_overlap: 50, format: :python)
      )

    expected = [
      "class PetShop:\n \"\"\"Represents a pet shop with inventory and sales functionality.\"\"\"",
      "\n\n def __init__(self, name):\n self.name = name\n self.inventory = {}",
      "\n\n def add_pet(self, pet_type, quantity):",
      "\n \"\"\"Adds a specified quantity of a pet type to the inventory.\"\"\"",
      "\n if pet_type in self.inventory:\n self.inventory[pet_type] += quantity",
      "\n self.inventory[pet_type] += quantity\n else:",
      "\n else:\n self.inventory[pet_type] = quantity",
      "\n\n def sell_pet(self, pet_type, quantity):",
      "\n def sell_pet(self, pet_type, quantity):\n \"\"\"Sells a specified quantity of a pet type.\"\"\"",
      "\n if pet_type in self.inventory and self.inventory[pet_type] >= quantity:",
      "\n self.inventory[pet_type] -= quantity\n return True\n else:",
      "\n return True\n else:\n return False",
      "\n\n def get_pet_count(self, pet_type):",
      "\n \"\"\"Returns the current count of a specific pet type.\"\"\"",
      "\n return self.inventory.get(pet_type, 0)\n"
    ]

    assert chunks == expected
  end
end

describe "chunker with javascript separators" do
  # Chunks the JavaScript fixture with chunk_size: 100 and chunk_overlap: 0.
  # None of the expected chunks repeats text from its neighbour.
  test "splits a simple javascript file sensibly with no overlap" do
    {:ok, source} = File.read("test/support/fixtures/document_fixtures/test_code.js")

    chunks =
      TestHelpers.extract_text_from_chunks(
        TextChunker.split(source, chunk_size: 100, chunk_overlap: 0, format: :javascript)
      )

    expected = [
      "class PetShop {\n constructor(name) {\n this.name = name;\n this.inventory = {};\n }",
      "\n\n addPet(petType, quantity) {\n ",
      " if (this.inventory[petType]) {\n this.inventory[petType] += quantity;\n } else {",
      "\n this.inventory[petType] = quantity;\n }\n }",
      "\n\n sellPet(petType, quantity) {\n ",
      " if (this.inventory[petType] && this.inventory[petType] >= quantity) {",
      "\n this.inventory[petType] -= quantity;\n return true;\n } else {",
      "\n return false;\n }\n }",
      "\n\n getPetCount(petType) {\n return this.inventory[petType] || 0; \n }\n}\n"
    ]

    assert chunks == expected
  end

  # Same fixture and chunk size, but with chunk_overlap: 50. Some expected
  # chunks begin with text that also closes the preceding chunk.
  test "splits a simple javascript file sensibly with overlap" do
    {:ok, source} = File.read("test/support/fixtures/document_fixtures/test_code.js")

    chunks =
      TestHelpers.extract_text_from_chunks(
        TextChunker.split(source, chunk_size: 100, chunk_overlap: 50, format: :javascript)
      )

    expected = [
      "class PetShop {\n constructor(name) {\n this.name = name;\n this.inventory = {};\n }",
      "\n\n addPet(petType, quantity) {\n ",
      " if (this.inventory[petType]) {\n this.inventory[petType] += quantity;\n } else {",
      "\n } else {\n this.inventory[petType] = quantity;\n }\n }",
      "\n\n sellPet(petType, quantity) {\n ",
      " if (this.inventory[petType] && this.inventory[petType] >= quantity) {",
      "\n this.inventory[petType] -= quantity;\n return true;\n } else {",
      "\n return true;\n } else {\n return false;\n }\n }",
      "\n\n getPetCount(petType) {\n return this.inventory[petType] || 0; \n }\n}\n"
    ]

    assert chunks == expected
  end
end
end
27 changes: 27 additions & 0 deletions test/support/fixtures/document_fixtures/test_code.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
class PetShop {
constructor(name) {
this.name = name;
this.inventory = {};
}

addPet(petType, quantity) {
if (this.inventory[petType]) {
this.inventory[petType] += quantity;
} else {
this.inventory[petType] = quantity;
}
}

sellPet(petType, quantity) {
if (this.inventory[petType] && this.inventory[petType] >= quantity) {
this.inventory[petType] -= quantity;
return true;
} else {
return false;
}
}

getPetCount(petType) {
return this.inventory[petType] || 0;
}
}
25 changes: 25 additions & 0 deletions test/support/fixtures/document_fixtures/test_code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
class PetShop:
"""Represents a pet shop with inventory and sales functionality."""

def __init__(self, name):
self.name = name
self.inventory = {}

def add_pet(self, pet_type, quantity):
"""Adds a specified quantity of a pet type to the inventory."""
if pet_type in self.inventory:
self.inventory[pet_type] += quantity
else:
self.inventory[pet_type] = quantity

def sell_pet(self, pet_type, quantity):
"""Sells a specified quantity of a pet type."""
if pet_type in self.inventory and self.inventory[pet_type] >= quantity:
self.inventory[pet_type] -= quantity
return True
else:
return False

def get_pet_count(self, pet_type):
"""Returns the current count of a specific pet type."""
return self.inventory.get(pet_type, 0)

0 comments on commit ab728bb

Please sign in to comment.