
fix: improve indexing of multiple documents (#8)
lsorber authored Aug 17, 2024
1 parent 107db7e commit fa2bad8
Showing 5 changed files with 44 additions and 49 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -12,7 +12,7 @@ RAGLite is a Python package for Retrieval-Augmented Generation (RAG) with SQLite
 4. 📖 PDF to Markdown conversion on top of [pdftext](https://github.com/VikParuchuri/pdftext) and [pypdfium2](https://github.com/pypdfium2-team/pypdfium2)
 5. ✂️ Optimal [level 4 semantic chunking](https://medium.com/@anuragmishra_27746/five-levels-of-chunking-strategies-in-rag-notes-from-gregs-video-7b735895694d) by solving a [binary integer programming problem](https://en.wikipedia.org/wiki/Integer_programming)
 6. 📌 Markdown-based [contextual chunk headings](https://d-star.ai/solving-the-out-of-context-chunk-problem-for-rag)
-7. 🌈 Sub-chunk matching with [multi-vector chunk retrieval](https://python.langchain.com/v0.2/docs/how_to/multi_vector/)
+7. 🌈 Combined sentence-level and chunk-level matching with [multi-vector chunk retrieval](https://python.langchain.com/v0.2/docs/how_to/multi_vector/)
 8. 🌀 Optimal [closed-form linear query adapter](src/raglite/_query_adapter.py) by solving an [orthogonal Procrustes problem](https://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem)
 9. 🔍 [Hybrid search](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) that combines [SQLite's BM25 full-text search](https://sqlite.org/fts5.html) with [PyNNDescent's ANN vector search](https://github.com/lmcinnes/pynndescent)
 10. ✍️ Optional support for conversion of any input document to Markdown with [Pandoc](https://github.com/jgm/pandoc)
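The reworded feature describes the general multi-vector idea: each chunk is represented by several vectors (its sentence-level embeddings alongside a chunk-level one), and a query that matches any of them surfaces the whole chunk. Below is a minimal, illustrative sketch of that idea with made-up chunk ids and random vectors standing in for real embeddings; it is not RAGLite's search code.

import numpy as np

# Hypothetical chunk id -> that chunk's sentence-level vectors.
chunk_vectors = {
    "chunk-0": np.random.rand(3, 8),
    "chunk-1": np.random.rand(5, 8),
}
query = np.random.rand(8)
query /= np.linalg.norm(query)

# A chunk's score is its best-matching sentence vector (cosine similarity).
scores = {
    chunk_id: float(np.max((vectors / np.linalg.norm(vectors, axis=1, keepdims=True)) @ query))
    for chunk_id, vectors in chunk_vectors.items()
}
best_chunk = max(scores, key=scores.get)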
13 changes: 10 additions & 3 deletions src/raglite/_config.py
@@ -37,10 +37,17 @@ def default_llm() -> Llama:
 @lru_cache(maxsize=1)
 def default_embedder() -> Llama:
     """Get default embedder."""
+    # Select the best available embedder for the given accelerator.
+    if llama_supports_gpu_offload():
+        repo_id = "ChristianAzinn/snowflake-arctic-embed-l-gguf"  # https://github.com/Snowflake-Labs/arctic-embed
+        filename = "*f16.GGUF"
+    else:
+        repo_id = "yishan-wang/snowflake-arctic-embed-m-v1.5-Q8_0-GGUF"  # https://github.com/Snowflake-Labs/arctic-embed
+        filename = "*q8_0.gguf"
     # Load the embedder.
     embedder = Llama.from_pretrained(
-        repo_id="yishan-wang/snowflake-arctic-embed-m-v1.5-Q8_0-GGUF",  # https://github.com/Snowflake-Labs/arctic-embed
-        filename="*q8_0.gguf",
+        repo_id=repo_id,
+        filename=filename,
         n_ctx=0,  # 0 = Use the model's context size (default is 512).
         n_gpu_layers=-1,  # -1 = Offload all layers to the GPU (default is 0).
         verbose=False,
@@ -62,7 +69,7 @@ class RAGLiteConfig:
     embedder_batch_size: int = 128
     embedder_dtype: npt.DTypeLike = np.float16
     embedder_normalize: bool = True
-    multi_vector_weight: float = 0.5  # Between 0 (chunk embedding) and 1 (sentence embedding).
+    sentence_embedding_weight: float = 0.5  # Between 0 (chunk level) and 1 (sentence level).
     # Chunker config used to partition documents into chunks.
     chunk_max_size: int = 1440  # Max number of characters per chunk.
     chunk_sentence_window_size: int = 3
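With this change, default_embedder() picks the larger f16 Snowflake Arctic model when llama.cpp reports GPU offload support and falls back to the Q8_0 model otherwise, and the multi-vector weight is renamed to sentence_embedding_weight. A small sketch of setting the renamed knob, assuming RAGLiteConfig accepts its fields as keyword arguments (as a dataclass would) and the remaining fields keep their defaults:

from raglite import RAGLiteConfig

# 0.0 weights only the contextualised chunk embedding (global context),
# 1.0 weights only the sentence embeddings (local context).
config = RAGLiteConfig(sentence_embedding_weight=0.5)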
60 changes: 28 additions & 32 deletions src/raglite/_index.py
@@ -1,5 +1,6 @@
 """Index documents."""

+from copy import deepcopy
 from functools import partial
 from pathlib import Path

@@ -20,37 +21,31 @@
 def _create_chunk_records(
     document_id: str,
     chunks: list[str],
-    multi_vector_embeddings: list[FloatMatrix],
+    sentence_embeddings: list[FloatMatrix],
     config: RAGLiteConfig,
 ) -> list[Chunk]:
-    """Process chunks into headings, body and improved multi-vector embeddings."""
+    """Process chunks into chunk records comprising headings, body, and a multi-vector embedding."""
     # Create the chunk records.
-    chunk_records = []
-    contextualized_chunks = []
-    headings = ""
+    chunk_records, headings = [], ""
     for i, chunk in enumerate(chunks):
-        # Create and append the contextualised chunk, which includes the current Markdown headings.
-        contextualized_chunks.append(headings + "\n\n" + chunk)
         # Create and append the chunk record.
-        chunk_record = Chunk.from_body(
-            document_id=document_id, index=i, body=chunk, headings=headings
-        )
-        chunk_records.append(chunk_record)
+        record = Chunk.from_body(document_id=document_id, index=i, body=chunk, headings=headings)
+        chunk_records.append(record)
         # Update the Markdown headings with those of this chunk.
-        headings = chunk_record.extract_headings()
-    # Embed the contextualised chunks.
-    contextualized_embeddings = embed_strings(contextualized_chunks, config=config)
-    # Update the chunk records with improved multi-vector embeddings that combine its multi-vector
-    # embedding with its contextualised chunk embedding.
-    for chunk_record, multi_vector_embedding, contextualized_embedding in zip(
-        chunk_records, multi_vector_embeddings, contextualized_embeddings, strict=True
+        headings = record.extract_headings()
+    # Embed the contextualised chunks, which include the current Markdown headings.
+    contextualized_embeddings = embed_strings([str(chunk) for chunk in chunks], config=config)
+    # Set the chunk's multi-vector embedding as a linear combination of its sentence embeddings
+    # (for local context) and an embedding of the contextualised chunk (for global context).
+    for record, sentence_embedding, contextualized_embedding in zip(
+        chunk_records, sentence_embeddings, contextualized_embeddings, strict=True
     ):
         chunk_embedding = (
-            config.multi_vector_weight * multi_vector_embedding
-            + (1 - config.multi_vector_weight) * contextualized_embedding[np.newaxis, :]
+            config.sentence_embedding_weight * sentence_embedding
+            + (1 - config.sentence_embedding_weight) * contextualized_embedding[np.newaxis, :]
         )
         chunk_embedding = chunk_embedding / np.linalg.norm(chunk_embedding, axis=1, keepdims=True)
-        chunk_record.multi_vector_embedding = chunk_embedding
+        record.multi_vector_embedding = chunk_embedding
     return chunk_records


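The loop above blends each chunk's per-sentence embeddings (local context) with one embedding of the contextualised chunk (global context), then renormalises every row. A standalone NumPy sketch of that step, with random vectors standing in for real embeddings:

import numpy as np

sentence_embedding = np.random.rand(3, 4)      # one row per sentence in the chunk
contextualized_embedding = np.random.rand(4)   # one vector for the contextualised chunk
weight = 0.5                                   # sentence_embedding_weight

chunk_embedding = (
    weight * sentence_embedding
    + (1 - weight) * contextualized_embedding[np.newaxis, :]
)
# Renormalise so every row is a unit vector again.
chunk_embedding /= np.linalg.norm(chunk_embedding, axis=1, keepdims=True)
assert chunk_embedding.shape == (3, 4)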
@@ -72,7 +67,7 @@ def insert_document(
         sentences = split_sentences(doc, max_len=config.chunk_max_size)
         pbar.update(1)
         pbar.set_description("Splitting chunks")
-        chunks, multi_vector_embeddings = split_chunks(
+        chunks, sentence_embeddings = split_chunks(
             sentences,
             max_size=config.chunk_max_size,
             sentence_window_size=config.chunk_sentence_window_size,
@@ -88,7 +83,7 @@ def insert_document(
             session.commit()
         # Create the chunk records.
         chunk_records = _create_chunk_records(
-            document_record.id, chunks, multi_vector_embeddings, config
+            document_record.id, chunks, sentence_embeddings, config
         )
         # Store the chunk records.
         for chunk_record in tqdm(
@@ -124,21 +119,22 @@ def update_vector_index(config: RAGLiteConfig | None = None) -> None:
         unit="chunk",
         dynamic_ncols=True,
     ) as pbar:
+        # Fit or update the ANN index.
         pbar.update(num_chunks_indexed)
         if num_chunks_unindexed == 0:
             return
         X_unindexed = np.vstack([chunk.multi_vector_embedding for chunk in unindexed_chunks])  # noqa: N806
         if num_chunks_indexed == 0:
-            vector_search_chunk_index.index = NNDescent(
-                X_unindexed, metric=config.vector_search_index_metric
-            )
-            vector_search_chunk_index.index.prepare()
+            nndescent = NNDescent(X_unindexed, metric=config.vector_search_index_metric)
         else:
-            vector_search_chunk_index.index.update(X_unindexed)  # type: ignore[union-attr]
-            vector_search_chunk_index.index.prepare()  # type: ignore[union-attr]
-            vector_search_chunk_index.chunk_sizes.extend(
-                [chunk.multi_vector_embedding.shape[0] for chunk in unindexed_chunks]
-            )
+            nndescent = deepcopy(vector_search_chunk_index.index)
+            nndescent.update(X_unindexed)
+        nndescent.prepare()
+        # Mark the vector search chunk index as dirty.
+        vector_search_chunk_index.index = nndescent
+        vector_search_chunk_index.chunk_sizes = vector_search_chunk_index.chunk_sizes + [
+            chunk.multi_vector_embedding.shape[0] for chunk in unindexed_chunks
+        ]
     # Store the updated vector search chunk index.
     session.add(vector_search_chunk_index)
     session.commit()
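The rewritten update_vector_index builds a fresh NNDescent index on the first run, and on later runs deep-copies the stored index, adds only the new vectors, and reassigns the attribute, presumably so the ORM sees a changed value rather than an in-place mutation. A minimal sketch of that PyNNDescent pattern in isolation, using random data; the final query() call is only there to show the prepared index is usable:

from copy import deepcopy

import numpy as np
from pynndescent import NNDescent

X_initial = np.random.rand(1_000, 128).astype(np.float32)
X_new = np.random.rand(100, 128).astype(np.float32)

# First run: build and prepare the index from scratch.
index = NNDescent(X_initial, metric="cosine")
index.prepare()

# Later runs: copy, add the new vectors, and prepare again before reassigning.
updated = deepcopy(index)
updated.update(X_new)
updated.prepare()

neighbors, distances = updated.query(np.random.rand(1, 128).astype(np.float32), k=5)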
6 changes: 3 additions & 3 deletions src/raglite/_split_chunks.py
@@ -17,7 +17,7 @@ def split_chunks(
     sentence_window_size: int = 3,
     embed: Callable[[list[str]], FloatMatrix] = embed_strings,
 ) -> tuple[list[str], list[FloatMatrix]]:
-    """Split sentences into optimal semantic chunks."""
+    """Split sentences into optimal semantic chunks with corresponding sentence embeddings."""
     # Window the sentences.
     whisker_size = (sentence_window_size - 1) // 2
     windows = [
@@ -96,8 +96,8 @@ def split_chunks(
         "".join(sentences[i:j])
         for i, j in zip([0, *partition_indices], [*partition_indices, len(sentences)], strict=True)
     ]
-    multi_vector_embeddings = [
+    sentence_embeddings = [
         window_embeddings[i:j]
         for i, j in zip([0, *partition_indices], [*partition_indices, len(sentences)], strict=True)
     ]
-    return chunks, multi_vector_embeddings
+    return chunks, sentence_embeddings
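The renamed return value pairs each chunk with the embeddings of its own sentences: the same partition indices slice both the sentence list and the per-sentence embedding matrix. A toy illustration of that slicing idiom, with random embeddings and a hand-picked partition index:

import numpy as np

sentences = ["A. ", "B. ", "C. ", "D. "]
window_embeddings = np.random.rand(4, 8)  # one embedding per (windowed) sentence
partition_indices = [2]                   # split after the second sentence

bounds = list(zip([0, *partition_indices], [*partition_indices, len(sentences)], strict=True))
chunks = ["".join(sentences[i:j]) for i, j in bounds]
sentence_embeddings = [window_embeddings[i:j] for i, j in bounds]

assert chunks == ["A. B. ", "C. D. "]
assert [e.shape for e in sentence_embeddings] == [(2, 8), (2, 8)]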
12 changes: 2 additions & 10 deletions tests/test_basic.py
@@ -2,22 +2,14 @@

 from pathlib import Path

-from raglite import (
-    RAGLiteConfig,
-    hybrid_search,
-    insert_document,
-    retrieve_segments,
-    update_vector_index,
-)
+from raglite import RAGLiteConfig, hybrid_search, insert_document, retrieve_segments


 def test_insert_index_search(simple_config: RAGLiteConfig) -> None:
     """Test inserting a document, updating the vector index, and searching for a query."""
-    # Insert a document.
+    # Insert a document and update the index.
     doc_path = Path(__file__).parent / "specrel.pdf"  # Einstein's special relativity paper.
     insert_document(doc_path, config=simple_config)
-    # Update the vector index with the new document.
-    update_vector_index(config=simple_config)
     # Search for a query.
     query = "What does it mean for two events to be simultaneous?"
     chunk_rowids, scores = hybrid_search(query, config=simple_config)
