Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed settings to unstructured to make largest chunks the proper size #1029

Merged
merged 1 commit into from
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
938 changes: 464 additions & 474 deletions django_app/poetry.lock

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion django_app/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,8 @@ build-backend = "poetry.core.masonry.api"
DJANGO_SETTINGS_MODULE = "redbox_app.settings"
testpaths = "tests"
norecursedirs = "tests/test_ai.py"

env_override_existing_values = 1
env_files = [
".env.test",
".env"
]
11 changes: 6 additions & 5 deletions redbox-core/redbox/chains/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
from io import BytesIO
from functools import partial

from annotated_types import doc
from langchain.vectorstores import VectorStore
from langchain_core.documents.base import Document
from langchain_core.runnables import RunnableLambda, chain, Runnable

from redbox.loader.loaders import UnstructuredChunkLoader
from redbox.models.settings import Settings
from redbox.loader.base import BaseRedboxFileLoader


if TYPE_CHECKING:
Expand All @@ -26,20 +27,20 @@ def log_chunks(chunks: list[Document]):
return chunks


def document_loader(document_loader_type: type[BaseRedboxFileLoader], s3_client: S3Client, env: Settings) -> Runnable:
def document_loader(document_loader: UnstructuredChunkLoader, s3_client: S3Client, env: Settings) -> Runnable:
@chain
def wrapped(file_name: str):
file_bytes = s3_client.get_object(Bucket=env.bucket_name, Key=file_name)["Body"].read()
return document_loader_type(file_name=file_name, file_bytes=BytesIO(file_bytes), env=env).lazy_load()
return document_loader.lazy_load(file_name=file_name, file_bytes=BytesIO(file_bytes))

return wrapped


def ingest_from_loader(
document_loader_type: type[BaseRedboxFileLoader], s3_client: S3Client, vectorstore: VectorStore, env: Settings
loader: UnstructuredChunkLoader, s3_client: S3Client, vectorstore: VectorStore, env: Settings
) -> Runnable:
return (
document_loader(document_loader_type=document_loader_type, s3_client=s3_client, env=env)
document_loader(document_loader=loader, s3_client=s3_client, env=env)
| RunnableLambda(list)
| log_chunks
| RunnableLambda(partial(vectorstore.add_documents, create_index_if_not_exists=False)) # type: ignore[arg-type]
Expand Down
4 changes: 0 additions & 4 deletions redbox-core/redbox/loader/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +0,0 @@
from .loaders import UnstructuredTitleLoader
from .loaders import UnstructuredLargeChunkLoader

__all__ = ["UnstructuredTitleLoader", "UnstructuredLargeChunkLoader"]
17 changes: 0 additions & 17 deletions redbox-core/redbox/loader/base.py

This file was deleted.

19 changes: 16 additions & 3 deletions redbox-core/redbox/loader/ingester.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@

from redbox.chains.components import get_embeddings
from redbox.chains.ingest import ingest_from_loader
from redbox.loader import UnstructuredLargeChunkLoader, UnstructuredTitleLoader
from redbox.loader.loaders import UnstructuredChunkLoader
from redbox.models import Settings
from redbox.models.file import ChunkResolution

if TYPE_CHECKING:
from mypy_boto3_s3.client import S3Client
Expand Down Expand Up @@ -43,14 +44,26 @@ def ingest_file(file_name: str) -> str | None:
es.indices.create(index=es_index_name, ignore=[400])

chunk_ingest_chain = ingest_from_loader(
document_loader_type=UnstructuredTitleLoader,
loader=UnstructuredChunkLoader(
chunk_resolution=ChunkResolution.normal,
env=env,
min_chunk_size=env.worker_ingest_min_chunk_size,
max_chunk_size=env.worker_ingest_max_chunk_size,
overlap_chars=0
),
s3_client=env.s3_client(),
vectorstore=get_elasticsearch_store(es, es_index_name),
env=env,
)

large_chunk_ingest_chain = ingest_from_loader(
document_loader_type=UnstructuredLargeChunkLoader,
loader=UnstructuredChunkLoader(
chunk_resolution=ChunkResolution.largest,
env=env,
min_chunk_size=env.worker_ingest_largest_chunk_size,
max_chunk_size=env.worker_ingest_largest_chunk_size,
overlap_chars=env.worker_ingest_largest_chunk_overlap
),
s3_client=env.s3_client(),
vectorstore=get_elasticsearch_store_without_embeddings(es, es_index_name),
env=env,
Expand Down
75 changes: 19 additions & 56 deletions redbox-core/redbox/loader/loaders.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from collections.abc import Iterator
from datetime import UTC, datetime
from io import BytesIO
from typing import TYPE_CHECKING

import requests
import tiktoken
from langchain_core.documents import Document

from redbox.models.file import ChunkResolution, ChunkMetadata
from redbox.loader.base import BaseRedboxFileLoader
from redbox.models.settings import Settings

encoding = tiktoken.get_encoding("cl100k_base")

Expand All @@ -17,76 +18,38 @@
S3Client = object


class UnstructuredLargeChunkLoader(BaseRedboxFileLoader):
class UnstructuredChunkLoader:
"""Load, partition and chunk a document using local unstructured library"""

def lazy_load(self) -> Iterator[Document]: # <-- Does not take any arguments
"""A lazy loader that reads a file line by line.

When you're implementing lazy load methods, you should use a generator
to yield documents one by one.
"""

url = f"http://{self.env.unstructured_host}:8000/general/v0/general"
files = {
"files": (self.file_name, self.file_bytes),
}
response = requests.post(
url,
files=files,
data={
"strategy": "fast",
"max_characters": self.env.worker_ingest_largest_chunk_size,
"overlap": self.env.worker_ingest_largest_chunk_overlap,
"overlap_all": True,
},
)

if response.status_code != 200:
raise ValueError(response.text)

elements = response.json()
def __init__(self, chunk_resolution: ChunkResolution, env: Settings, min_chunk_size: int, max_chunk_size: int, overlap_chars: int = 0, overlap_all_chunks: bool = True):
self.chunk_resolution = chunk_resolution
self.env = env
self._min_chunk_size = min_chunk_size
self._max_chunk_size = max_chunk_size
self._overlap_chars = overlap_chars
self._overlap_all_chunks = overlap_all_chunks

if not elements:
raise ValueError("Unstructured failed to extract text for this file")

for i, raw_chunk in enumerate(elements):
yield Document(
page_content=raw_chunk["text"],
metadata=ChunkMetadata(
index=i,
file_name=raw_chunk["metadata"].get("filename"),
page_number=raw_chunk["metadata"].get("page_number"),
created_datetime=datetime.now(UTC),
token_count=len(encoding.encode(raw_chunk["text"])),
chunk_resolution=ChunkResolution.largest,
).model_dump(),
)


class UnstructuredTitleLoader(BaseRedboxFileLoader):
"""Load, partition and chunk a document using local unstructured library"""

def lazy_load(self) -> Iterator[Document]: # <-- Does not take any arguments
def lazy_load(self, file_name: str, file_bytes: BytesIO) -> Iterator[Document]:
"""A lazy loader that reads a file line by line.

When you're implementing lazy load methods, you should use a generator
to yield documents one by one.
"""

url = f"http://{self.env.unstructured_host}:8000/general/v0/general"

files = {
"files": (self.file_name, self.file_bytes),
"files": (file_name, file_bytes),
}
response = requests.post(
url,
files=files,
data={
"chunking_strategy": "by_title",
"strategy": "fast",
"combine_under_n_chars": self.env.worker_ingest_min_chunk_size,
"max_characters": self.env.worker_ingest_max_chunk_size,
"chunking_strategy": "by_title",
"max_characters": self._max_chunk_size,
"combine_under_n_chars": self._min_chunk_size,
"overlap": self._overlap_chars,
"overlap_all": self._overlap_all_chunks,
},
)

Expand All @@ -107,6 +70,6 @@ def lazy_load(self) -> Iterator[Document]: # <-- Does not take any arguments
page_number=raw_chunk["metadata"].get("page_number"),
created_datetime=datetime.now(UTC),
token_count=len(encoding.encode(raw_chunk["text"])),
chunk_resolution=ChunkResolution.normal,
chunk_resolution=self.chunk_resolution,
).model_dump(),
)
)
34 changes: 20 additions & 14 deletions redbox-core/tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,7 @@
from redbox.loader import ingester
from redbox.loader.ingester import ingest_file
from redbox.chains.ingest import document_loader, ingest_from_loader
from redbox.loader.base import BaseRedboxFileLoader
from redbox.loader.loaders import UnstructuredTitleLoader
from redbox.loader.loaders import UnstructuredLargeChunkLoader
from redbox.loader.loaders import UnstructuredChunkLoader
from redbox.models.settings import Settings
from redbox.retriever.queries import make_query_filter
from redbox.models.file import ChunkResolution
Expand Down Expand Up @@ -49,12 +47,8 @@ def make_file_query(file_name: str, resolution: ChunkResolution | None = None) -


@patch("redbox.loader.loaders.requests.post")
@pytest.mark.parametrize(
"document_loader_type",
[UnstructuredTitleLoader, UnstructuredLargeChunkLoader],
)
def test_document_loader(
mock_post: MagicMock, document_loader_type: type[BaseRedboxFileLoader], s3_client: S3Client, env: Settings
mock_post: MagicMock, s3_client: S3Client, env: Settings
):
"""
Given that I have written a text File to s3
Expand All @@ -77,26 +71,31 @@ def test_document_loader(
},
}
]
loader = UnstructuredChunkLoader(
chunk_resolution=ChunkResolution.normal,
env=env,
min_chunk_size=env.worker_ingest_min_chunk_size,
max_chunk_size=env.worker_ingest_max_chunk_size
)

# Upload file and call
file = file_to_s3("html/example.html", s3_client, env)
loader = document_loader(document_loader_type, s3_client, env)
loader = document_loader(loader, s3_client, env)
chunks = list(loader.invoke(file))

assert len(chunks) > 0


@patch("redbox.loader.loaders.requests.post")
@pytest.mark.parametrize(
"document_loader_type, resolution, has_embeddings",
"resolution, has_embeddings",
[
(UnstructuredTitleLoader, ChunkResolution.normal, True),
(UnstructuredLargeChunkLoader, ChunkResolution.largest, False),
(ChunkResolution.normal, True),
(ChunkResolution.largest, False),
],
)
def test_ingest_from_loader(
mock_post: MagicMock,
document_loader_type: type[BaseRedboxFileLoader],
resolution: ChunkResolution,
has_embeddings: bool,
monkeypatch: MonkeyPatch,
Expand Down Expand Up @@ -127,13 +126,20 @@ def test_ingest_from_loader(
}
]

loader = UnstructuredChunkLoader(
chunk_resolution=resolution,
env=env,
min_chunk_size=env.worker_ingest_min_chunk_size,
max_chunk_size=env.worker_ingest_max_chunk_size
)

# Mock embeddings
monkeypatch.setattr(ingester, "get_embeddings", lambda _: FakeEmbeddings(size=3072))

# Upload file and call
file_name = file_to_s3(filename="html/example.html", s3_client=s3_client, env=env)
ingest_chain = ingest_from_loader(
document_loader_type=document_loader_type, s3_client=s3_client, vectorstore=es_vector_store, env=env
loader=loader, s3_client=s3_client, vectorstore=es_vector_store, env=env
)

_ = ingest_chain.invoke(file_name)
Expand Down
Loading