
fix: improve indexing of multiple documents (#8)
lsorber authored Aug 17, 2024
1 parent 107db7e commit fa2bad8
Showing 5 changed files with 44 additions and 49 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -12,7 +12,7 @@ RAGLite is a Python package for Retrieval-Augmented Generation (RAG) with SQLite
 4. 📖 PDF to Markdown conversion on top of [pdftext](https://github.com/VikParuchuri/pdftext) and [pypdfium2](https://github.com/pypdfium2-team/pypdfium2)
 5. ✂️ Optimal [level 4 semantic chunking](https://medium.com/@anuragmishra_27746/five-levels-of-chunking-strategies-in-rag-notes-from-gregs-video-7b735895694d) by solving a [binary integer programming problem](https://en.wikipedia.org/wiki/Integer_programming)
 6. 📌 Markdown-based [contextual chunk headings](https://d-star.ai/solving-the-out-of-context-chunk-problem-for-rag)
-7. 🌈 Sub-chunk matching with [multi-vector chunk retrieval](https://python.langchain.com/v0.2/docs/how_to/multi_vector/)
+7. 🌈 Combined sentence-level and chunk-level matching with [multi-vector chunk retrieval](https://python.langchain.com/v0.2/docs/how_to/multi_vector/)
 8. 🌀 Optimal [closed-form linear query adapter](src/raglite/_query_adapter.py) by solving an [orthogonal Procrustes problem](https://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem)
 9. 🔍 [Hybrid search](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) that combines [SQLite's BM25 full-text search](https://sqlite.org/fts5.html) with [PyNNDescent's ANN vector search](https://github.com/lmcinnes/pynndescent)
 10. ✍️ Optional support for conversion of any input document to Markdown with [Pandoc](https://github.com/jgm/pandoc)
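The reworded feature describes the general multi-vector idea: each chunk is represented by several vectors (its sentence-level embeddings alongside a chunk-level one), and a query that matches any of them surfaces the whole chunk. Below is a minimal, illustrative sketch of that idea with made-up chunk ids and random vectors standing in for real embeddings; it is not RAGLite's search code.

import numpy as np

# Hypothetical chunk id -> that chunk's sentence-level vectors.
chunk_vectors = {
    "chunk-0": np.random.rand(3, 8),
    "chunk-1": np.random.rand(5, 8),
}
query = np.random.rand(8)
query /= np.linalg.norm(query)

# A chunk's score is its best-matching sentence vector (cosine similarity).
scores = {
    chunk_id: float(np.max((vectors / np.linalg.norm(vectors, axis=1, keepdims=True)) @ query))
    for chunk_id, vectors in chunk_vectors.items()
}
best_chunk = max(scores, key=scores.get)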
13 changes: 10 additions & 3 deletions src/raglite/_config.py
@@ -37,10 +37,17 @@ def default_llm() -> Llama:
 @lru_cache(maxsize=1)
 def default_embedder() -> Llama:
     """Get default embedder."""
+    # Select the best available embedder for the given accelerator.
+    if llama_supports_gpu_offload():
+        repo_id = "ChristianAzinn/snowflake-arctic-embed-l-gguf"  # https://github.com/Snowflake-Labs/arctic-embed
+        filename = "*f16.GGUF"
+    else:
+        repo_id = "yishan-wang/snowflake-arctic-embed-m-v1.5-Q8_0-GGUF"  # https://github.com/Snowflake-Labs/arctic-embed
+        filename = "*q8_0.gguf"
     # Load the embedder.
     embedder = Llama.from_pretrained(
-        repo_id="yishan-wang/snowflake-arctic-embed-m-v1.5-Q8_0-GGUF",  # https://github.com/Snowflake-Labs/arctic-embed
-        filename="*q8_0.gguf",
+        repo_id=repo_id,
+        filename=filename,
         n_ctx=0,  # 0 = Use the model's context size (default is 512).
         n_gpu_layers=-1,  # -1 = Offload all layers to the GPU (default is 0).
         verbose=False,
@@ -62,7 +69,7 @@ class RAGLiteConfig:
     embedder_batch_size: int = 128
     embedder_dtype: npt.DTypeLike = np.float16
     embedder_normalize: bool = True
-    multi_vector_weight: float = 0.5  # Between 0 (chunk embedding) and 1 (sentence embedding).
+    sentence_embedding_weight: float = 0.5  # Between 0 (chunk level) and 1 (sentence level).
     # Chunker config used to partition documents into chunks.
     chunk_max_size: int = 1440  # Max number of characters per chunk.
     chunk_sentence_window_size: int = 3
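With this change, default_embedder() picks the larger f16 Snowflake Arctic model when llama.cpp reports GPU offload support and falls back to the Q8_0 model otherwise, and the multi-vector weight is renamed to sentence_embedding_weight. A small sketch of setting the renamed knob, assuming RAGLiteConfig accepts its fields as keyword arguments (as a dataclass would) and the remaining fields keep their defaults:

from raglite import RAGLiteConfig

# 0.0 weights only the contextualised chunk embedding (global context),
# 1.0 weights only the sentence embeddings (local context).
config = RAGLiteConfig(sentence_embedding_weight=0.5)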
60 changes: 28 additions & 32 deletions src/raglite/_index.py
@@ -1,5 +1,6 @@
 """Index documents."""

+from copy import deepcopy
 from functools import partial
 from pathlib import Path

@@ -20,37 +21,31 @@
 def _create_chunk_records(
     document_id: str,
     chunks: list[str],
-    multi_vector_embeddings: list[FloatMatrix],
+    sentence_embeddings: list[FloatMatrix],
     config: RAGLiteConfig,
 ) -> list[Chunk]:
-    """Process chunks into headings, body and improved multi-vector embeddings."""
+    """Process chunks into chunk records comprising headings, body, and a multi-vector embedding."""
     # Create the chunk records.
-    chunk_records = []
-    contextualized_chunks = []
-    headings = ""
+    chunk_records, headings = [], ""
     for i, chunk in enumerate(chunks):
-        # Create and append the contextualised chunk, which includes the current Markdown headings.
-        contextualized_chunks.append(headings + "\n\n" + chunk)
         # Create and append the chunk record.
-        chunk_record = Chunk.from_body(
-            document_id=document_id, index=i, body=chunk, headings=headings
-        )
-        chunk_records.append(chunk_record)
+        record = Chunk.from_body(document_id=document_id, index=i, body=chunk, headings=headings)
+        chunk_records.append(record)
         # Update the Markdown headings with those of this chunk.
-        headings = chunk_record.extract_headings()
-    # Embed the contextualised chunks.
-    contextualized_embeddings = embed_strings(contextualized_chunks, config=config)
-    # Update the chunk records with improved multi-vector embeddings that combine its multi-vector
-    # embedding with its contextualised chunk embedding.
-    for chunk_record, multi_vector_embedding, contextualized_embedding in zip(
-        chunk_records, multi_vector_embeddings, contextualized_embeddings, strict=True
+        headings = record.extract_headings()
+    # Embed the contextualised chunks, which include the current Markdown headings.
+    contextualized_embeddings = embed_strings([str(chunk) for chunk in chunks], config=config)
+    # Set the chunk's multi-vector embedding as a linear combination of its sentence embeddings
+    # (for local context) and an embedding of the contextualised chunk (for global context).
+    for record, sentence_embedding, contextualized_embedding in zip(
+        chunk_records, sentence_embeddings, contextualized_embeddings, strict=True
     ):
         chunk_embedding = (
-            config.multi_vector_weight * multi_vector_embedding
-            + (1 - config.multi_vector_weight) * contextualized_embedding[np.newaxis, :]
+            config.sentence_embedding_weight * sentence_embedding
+            + (1 - config.sentence_embedding_weight) * contextualized_embedding[np.newaxis, :]
         )
         chunk_embedding = chunk_embedding / np.linalg.norm(chunk_embedding, axis=1, keepdims=True)
-        chunk_record.multi_vector_embedding = chunk_embedding
+        record.multi_vector_embedding = chunk_embedding
     return chunk_records


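The loop above blends each chunk's per-sentence embeddings (local context) with one embedding of the contextualised chunk (global context), then renormalises every row. A standalone NumPy sketch of that step, with random vectors standing in for real embeddings:

import numpy as np

sentence_embedding = np.random.rand(3, 4)      # one row per sentence in the chunk
contextualized_embedding = np.random.rand(4)   # one vector for the contextualised chunk
weight = 0.5                                   # sentence_embedding_weight

chunk_embedding = (
    weight * sentence_embedding
    + (1 - weight) * contextualized_embedding[np.newaxis, :]
)
# Renormalise so every row is a unit vector again.
chunk_embedding /= np.linalg.norm(chunk_embedding, axis=1, keepdims=True)
assert chunk_embedding.shape == (3, 4)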
@@ -72,7 +67,7 @@ def insert_document(
         sentences = split_sentences(doc, max_len=config.chunk_max_size)
         pbar.update(1)
         pbar.set_description("Splitting chunks")
-        chunks, multi_vector_embeddings = split_chunks(
+        chunks, sentence_embeddings = split_chunks(
             sentences,
             max_size=config.chunk_max_size,
             sentence_window_size=config.chunk_sentence_window_size,
@@ -88,7 +83,7 @@ def insert_document(
             session.commit()
         # Create the chunk records.
         chunk_records = _create_chunk_records(
-            document_record.id, chunks, multi_vector_embeddings, config
+            document_record.id, chunks, sentence_embeddings, config
         )
         # Store the chunk records.
         for chunk_record in tqdm(
@@ -124,21 +119,22 @@ def update_vector_index(config: RAGLiteConfig | None = None) -> None:
         unit="chunk",
         dynamic_ncols=True,
     ) as pbar:
+        # Fit or update the ANN index.
         pbar.update(num_chunks_indexed)
         if num_chunks_unindexed == 0:
             return
         X_unindexed = np.vstack([chunk.multi_vector_embedding for chunk in unindexed_chunks])  # noqa: N806
         if num_chunks_indexed == 0:
-            vector_search_chunk_index.index = NNDescent(
-                X_unindexed, metric=config.vector_search_index_metric
-            )
-            vector_search_chunk_index.index.prepare()
+            nndescent = NNDescent(X_unindexed, metric=config.vector_search_index_metric)
         else:
-            vector_search_chunk_index.index.update(X_unindexed)  # type: ignore[union-attr]
-            vector_search_chunk_index.index.prepare()  # type: ignore[union-attr]
-            vector_search_chunk_index.chunk_sizes.extend(
-                [chunk.multi_vector_embedding.shape[0] for chunk in unindexed_chunks]
-            )
+            nndescent = deepcopy(vector_search_chunk_index.index)
+            nndescent.update(X_unindexed)
+        nndescent.prepare()
+        # Mark the vector search chunk index as dirty.
+        vector_search_chunk_index.index = nndescent
+        vector_search_chunk_index.chunk_sizes = vector_search_chunk_index.chunk_sizes + [
+            chunk.multi_vector_embedding.shape[0] for chunk in unindexed_chunks
+        ]
     # Store the updated vector search chunk index.
     session.add(vector_search_chunk_index)
     session.commit()
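The rewritten update_vector_index builds a fresh NNDescent index on the first run, and on later runs deep-copies the stored index, adds only the new vectors, and reassigns the attribute, presumably so the ORM sees a changed value rather than an in-place mutation. A minimal sketch of that PyNNDescent pattern in isolation, using random data; the final query() call is only there to show the prepared index is usable:

from copy import deepcopy

import numpy as np
from pynndescent import NNDescent

X_initial = np.random.rand(1_000, 128).astype(np.float32)
X_new = np.random.rand(100, 128).astype(np.float32)

# First run: build and prepare the index from scratch.
index = NNDescent(X_initial, metric="cosine")
index.prepare()

# Later runs: copy, add the new vectors, and prepare again before reassigning.
updated = deepcopy(index)
updated.update(X_new)
updated.prepare()

neighbors, distances = updated.query(np.random.rand(1, 128).astype(np.float32), k=5)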
6 changes: 3 additions & 3 deletions src/raglite/_split_chunks.py
@@ -17,7 +17,7 @@ def split_chunks(
     sentence_window_size: int = 3,
     embed: Callable[[list[str]], FloatMatrix] = embed_strings,
 ) -> tuple[list[str], list[FloatMatrix]]:
-    """Split sentences into optimal semantic chunks."""
+    """Split sentences into optimal semantic chunks with corresponding sentence embeddings."""
     # Window the sentences.
     whisker_size = (sentence_window_size - 1) // 2
     windows = [
@@ -96,8 +96,8 @@ def split_chunks(
         "".join(sentences[i:j])
         for i, j in zip([0, *partition_indices], [*partition_indices, len(sentences)], strict=True)
     ]
-    multi_vector_embeddings = [
+    sentence_embeddings = [
         window_embeddings[i:j]
         for i, j in zip([0, *partition_indices], [*partition_indices, len(sentences)], strict=True)
     ]
-    return chunks, multi_vector_embeddings
+    return chunks, sentence_embeddings
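The renamed return value pairs each chunk with the embeddings of its own sentences: the same partition indices slice both the sentence list and the per-sentence embedding matrix. A toy illustration of that slicing idiom, with random embeddings and a hand-picked partition index:

import numpy as np

sentences = ["A. ", "B. ", "C. ", "D. "]
window_embeddings = np.random.rand(4, 8)  # one embedding per (windowed) sentence
partition_indices = [2]                   # split after the second sentence

bounds = list(zip([0, *partition_indices], [*partition_indices, len(sentences)], strict=True))
chunks = ["".join(sentences[i:j]) for i, j in bounds]
sentence_embeddings = [window_embeddings[i:j] for i, j in bounds]

assert chunks == ["A. B. ", "C. D. "]
assert [e.shape for e in sentence_embeddings] == [(2, 8), (2, 8)]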
12 changes: 2 additions & 10 deletions tests/test_basic.py
@@ -2,22 +2,14 @@

 from pathlib import Path

-from raglite import (
-    RAGLiteConfig,
-    hybrid_search,
-    insert_document,
-    retrieve_segments,
-    update_vector_index,
-)
+from raglite import RAGLiteConfig, hybrid_search, insert_document, retrieve_segments


 def test_insert_index_search(simple_config: RAGLiteConfig) -> None:
     """Test inserting a document, updating the vector index, and searching for a query."""
-    # Insert a document.
+    # Insert a document and update the index.
     doc_path = Path(__file__).parent / "specrel.pdf"  # Einstein's special relativity paper.
     insert_document(doc_path, config=simple_config)
-    # Update the vector index with the new document.
-    update_vector_index(config=simple_config)
     # Search for a query.
     query = "What does it mean for two events to be simultaneous?"
     chunk_rowids, scores = hybrid_search(query, config=simple_config)
