Skip to content

Commit

Permalink
feat: allow users to set chunk size and overlap for indices (#…
Browse files Browse the repository at this point in the history
…524)

* use tzlocal to get the local time

* delete tmp folder

* update date_created and date_updated with current timezone

* pass precommit

* update date_created field default to use local time

* add chunk size and chunk overlap param for indices

* refactor code to pass pre-commit

* fix: minor logic updates

---------

Co-authored-by: Tadashi <tadashi@cinnamon.is>
  • Loading branch information
cin-cris and taprosoft authored Dec 4, 2024
1 parent a1fecfa commit 32732c3
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 2 deletions.
2 changes: 2 additions & 0 deletions libs/ktem/ktem/index/file/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ class BaseFileIndexIndexing(BaseComponent):
FSPath = Param(help="The file storage path")
user_id = Param(help="The user id")
private = Param(False, help="Whether this is private index")
chunk_size = Param(help="Chunk size for this index")
chunk_overlap = Param(help="Chunk overlap for this index")

def run(
self, file_paths: str | Path | list[str | Path], *args, **kwargs
Expand Down
21 changes: 21 additions & 0 deletions libs/ktem/ktem/index/file/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,25 @@ def get_admin_settings(cls):
"choices": [("Yes", True), ("No", False)],
"info": "If private, files will not be accessible across users.",
},
"chunk_size": {
"name": "Size of chunk (number of tokens)",
"value": 0,
"component": "number",
"info": (
"Number of tokens of each text segment. "
"Set 0 to use developer setting."
),
},
"chunk_overlap": {
"name": "Number of overlapping tokens between chunks",
"value": 0,
"component": "number",
"info": (
"Number of tokens that consecutive text segments "
"should overlap with each other. "
"Set 0 to use developer setting."
),
},
}

def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
Expand All @@ -423,6 +442,8 @@ def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
obj.FSPath = self._fs_path
obj.user_id = user_id
obj.private = self.config.get("private", False)
obj.chunk_size = self.config.get("chunk_size", 0)
obj.chunk_overlap = self.config.get("chunk_overlap", 0)

return obj

Expand Down
10 changes: 8 additions & 2 deletions libs/ktem/ktem/index/file/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,7 +729,11 @@ def route(self, file_path: str | Path) -> IndexPipeline:
Can subclass this method for a more elaborate pipeline routing strategy.
"""
_, chunk_size, chunk_overlap = dev_settings()

_, dev_chunk_size, dev_chunk_overlap = dev_settings()

chunk_size = self.chunk_size or dev_chunk_size
chunk_overlap = self.chunk_overlap or dev_chunk_overlap

# check if file_path is a URL
if self.is_url(file_path):
Expand All @@ -744,12 +748,14 @@ def route(self, file_path: str | Path) -> IndexPipeline:
"the suitable pipeline for this file type in the settings."
)

print(f"Chunk size: {chunk_size}, chunk overlap: {chunk_overlap}")

print("Using reader", reader)
pipeline: IndexPipeline = IndexPipeline(
loader=reader,
splitter=TokenSplitter(
chunk_size=chunk_size or 1024,
chunk_overlap=chunk_overlap if chunk_overlap is not None else 256,
chunk_overlap=chunk_overlap or 256,
separator="\n\n",
backup_separators=["\n", ".", "\u200B"],
),
Expand Down

0 comments on commit 32732c3

Please sign in to comment.