Skip to content

Commit

Permalink
Allow base_store to be used directly with MultiVectorRetriever (langchain-ai#14202)
Browse files Browse the repository at this point in the history

Allow users to pass a generic `BaseStore[str, bytes]` to
MultiVectorRetriever via the new `base_store` argument, removing the need
to call the `create_kv_docstore` function themselves. The wrapping of the
byte store into a document store now happens internally.

@rlancemartin @eyurtsev

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
  • Loading branch information
2 people authored and aymeric-roucher committed Dec 11, 2023
1 parent 83b9d86 commit bb34116
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 48 deletions.
78 changes: 35 additions & 43 deletions docs/docs/modules/data_connection/retrievers/multi_vector.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@
"# The retriever (empty to start)\n",
"retriever = MultiVectorRetriever(\n",
" vectorstore=vectorstore,\n",
" docstore=store,\n",
" base_store=store,\n",
" id_key=id_key,\n",
")\n",
"import uuid\n",
Expand Down Expand Up @@ -143,7 +143,7 @@
{
"data": {
"text/plain": [
"Document(page_content='Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.', metadata={'doc_id': '455205f7-bb7d-4c36-b442-d1d6f9f701ed', 'source': '../../state_of_the_union.txt'})"
"Document(page_content='Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.', metadata={'doc_id': '59899493-92a0-41cb-b6ba-a854730ad74a', 'source': '../../state_of_the_union.txt'})"
]
},
"execution_count": 8,
Expand Down Expand Up @@ -188,7 +188,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 10,
"id": "36739460-a737-4a8e-b70f-50bf8c8eaae7",
"metadata": {},
"outputs": [
Expand All @@ -198,7 +198,7 @@
"9875"
]
},
"execution_count": 15,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -223,7 +223,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"id": "1433dff4",
"metadata": {},
"outputs": [],
Expand All @@ -238,7 +238,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"id": "35b30390",
"metadata": {},
"outputs": [],
Expand All @@ -253,7 +253,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 13,
"id": "41a2a738",
"metadata": {},
"outputs": [],
Expand All @@ -263,7 +263,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"id": "7ac5e4b1",
"metadata": {},
"outputs": [],
Expand All @@ -276,7 +276,7 @@
"# The retriever (empty to start)\n",
"retriever = MultiVectorRetriever(\n",
" vectorstore=vectorstore,\n",
" docstore=store,\n",
" base_store=store,\n",
" id_key=id_key,\n",
")\n",
"doc_ids = [str(uuid.uuid4()) for _ in docs]"
Expand Down Expand Up @@ -338,7 +338,7 @@
{
"data": {
"text/plain": [
"Document(page_content=\"The document is a transcript of a speech given by the President of the United States. The President discusses several important issues and initiatives, including the nomination of a Supreme Court Justice, border security and immigration reform, protecting women's rights, advancing LGBTQ+ equality, bipartisan legislation, addressing the opioid epidemic and mental health, supporting veterans, investigating the health effects of burn pits on military personnel, ending cancer, and the strength and resilience of the American people.\", metadata={'doc_id': '79fa2e9f-28d9-4372-8af3-2caf4f1de312'})"
"Document(page_content=\"The document is a speech given by the President of the United States. The President discusses various important issues and goals for the country, including nominating a Supreme Court Justice, securing the border and fixing the immigration system, protecting women's rights, supporting veterans, addressing the opioid epidemic, improving mental health care, and ending cancer. The President emphasizes the unity and strength of the American people and expresses optimism for the future of the nation.\", metadata={'doc_id': '8fdf4009-628c-400d-949c-1d3f4daf1e66'})"
]
},
"execution_count": 19,
Expand Down Expand Up @@ -393,7 +393,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 22,
"id": "5219b085",
"metadata": {},
"outputs": [],
Expand All @@ -418,7 +418,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 23,
"id": "523deb92",
"metadata": {},
"outputs": [],
Expand All @@ -429,7 +429,7 @@
" {\"doc\": lambda x: x.page_content}\n",
" # Only asking for 3 hypothetical questions, but this could be adjusted\n",
" | ChatPromptTemplate.from_template(\n",
" \"Generate a list of 3 hypothetical questions that the below document could be used to answer:\\n\\n{doc}\"\n",
" \"Generate a list of exactly 3 hypothetical questions that the below document could be used to answer:\\n\\n{doc}\"\n",
" )\n",
" | ChatOpenAI(max_retries=0, model=\"gpt-4\").bind(\n",
" functions=functions, function_call={\"name\": \"hypothetical_questions\"}\n",
Expand All @@ -440,19 +440,19 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 24,
"id": "11d30554",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[\"What was the author's initial impression of philosophy as a field of study, and how did it change when they got to college?\",\n",
" 'Why did the author decide to switch their focus to Artificial Intelligence (AI)?',\n",
" \"What led to the author's disillusionment with the field of AI as it was practiced at the time?\"]"
"[\"What were the author's initial areas of interest before college?\",\n",
" \"What was the author's experience with programming in his early years?\",\n",
" 'Why did the author switch his focus from AI to Lisp?']"
]
},
"execution_count": 33,
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -463,7 +463,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 25,
"id": "3eb2e48c",
"metadata": {},
"outputs": [],
Expand All @@ -473,7 +473,7 @@
},
{
"cell_type": "code",
"execution_count": 67,
"execution_count": 26,
"id": "b2cd6e75",
"metadata": {},
"outputs": [],
Expand All @@ -488,15 +488,15 @@
"# The retriever (empty to start)\n",
"retriever = MultiVectorRetriever(\n",
" vectorstore=vectorstore,\n",
" docstore=store,\n",
" base_store=store,\n",
" id_key=id_key,\n",
")\n",
"doc_ids = [str(uuid.uuid4()) for _ in docs]"
]
},
{
"cell_type": "code",
"execution_count": 68,
"execution_count": 27,
"id": "18831b3b",
"metadata": {},
"outputs": [],
Expand All @@ -510,7 +510,7 @@
},
{
"cell_type": "code",
"execution_count": 69,
"execution_count": 28,
"id": "224b24c5",
"metadata": {},
"outputs": [],
Expand All @@ -521,7 +521,7 @@
},
{
"cell_type": "code",
"execution_count": 70,
"execution_count": 29,
"id": "7b442b90",
"metadata": {},
"outputs": [],
Expand All @@ -531,20 +531,20 @@
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": 30,
"id": "089b5ad0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content=\"What is the President's stance on immigration reform?\", metadata={'doc_id': '505d73e3-8350-46ec-a58e-3af032f04ab3'}),\n",
" Document(page_content=\"What is the President's stance on immigration reform?\", metadata={'doc_id': '1c9618f0-7660-4b4f-a37c-509cbbbf6dba'}),\n",
" Document(page_content=\"What is the President's stance on immigration reform?\", metadata={'doc_id': '82c08209-b904-46a8-9532-edd2380950b7'}),\n",
" Document(page_content='What measures is the President proposing to protect the rights of LGBTQ+ Americans?', metadata={'doc_id': '82c08209-b904-46a8-9532-edd2380950b7'})]"
"[Document(page_content='What made Robert Morris advise the author to leave Y Combinator?', metadata={'doc_id': '740e484e-d67c-45f7-989d-9928aaf51c28'}),\n",
" Document(page_content=\"How did the author's mother's illness affect his decision to leave Y Combinator?\", metadata={'doc_id': '740e484e-d67c-45f7-989d-9928aaf51c28'}),\n",
" Document(page_content='What led the author to start publishing essays online?', metadata={'doc_id': '675ccee3-ce0b-4d5d-892c-b8942370babd'}),\n",
" Document(page_content='What measures are being taken to secure the border and fix the immigration system?', metadata={'doc_id': '2d51f010-969e-48a9-9e82-6b12bc7ab3d4'})]"
]
},
"execution_count": 71,
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -555,7 +555,7 @@
},
{
"cell_type": "code",
"execution_count": 72,
"execution_count": 31,
"id": "7594b24e",
"metadata": {},
"outputs": [],
Expand All @@ -565,32 +565,24 @@
},
{
"cell_type": "code",
"execution_count": 73,
"execution_count": 32,
"id": "4c120c65",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9194"
"9844"
]
},
"execution_count": 73,
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(retrieved_docs[0].page_content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "616cfeeb",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand All @@ -609,7 +601,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.10.5"
}
},
"nbformat": 4,
Expand Down
33 changes: 28 additions & 5 deletions libs/langchain/langchain/retrievers/multi_vector.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from enum import Enum
from typing import List
from typing import List, Optional

from langchain_core.documents import Document
from langchain_core.pydantic_v1 import Field
from langchain_core.retrievers import BaseRetriever
from langchain_core.stores import BaseStore
from langchain_core.vectorstores import VectorStore

from langchain.callbacks.manager import CallbackManagerForRetrieverRun
from langchain.storage._lc_store import create_kv_docstore


class SearchType(str, Enum):
Expand All @@ -27,12 +27,35 @@ class MultiVectorRetriever(BaseRetriever):
and their embedding vectors"""
docstore: BaseStore[str, Document]
"""The storage layer for the parent documents"""
id_key: str = "doc_id"
search_kwargs: dict = Field(default_factory=dict)
id_key: str
search_kwargs: dict
"""Keyword arguments to pass to the search function."""
search_type: SearchType = SearchType.similarity
search_type: SearchType
"""Type of search to perform (similarity / mmr)"""

def __init__(
    self,
    *,
    vectorstore: VectorStore,
    docstore: Optional[BaseStore[str, Document]] = None,
    base_store: Optional[BaseStore[str, bytes]] = None,
    id_key: str = "doc_id",
    search_kwargs: Optional[dict] = None,
    search_type: SearchType = SearchType.similarity,
):
    """Initialize the retriever with a vector store and a parent-document store.

    Exactly one storage argument is needed: either a ready-made ``docstore``
    or a raw ``base_store`` that is wrapped via ``create_kv_docstore``.

    Args:
        vectorstore: Vector store forwarded unchanged to the base initializer.
        docstore: Store mapping string keys to ``Document`` objects. Used
            only when ``base_store`` is not supplied.
        base_store: Store mapping string keys to ``bytes``. When supplied it
            is wrapped with ``create_kv_docstore`` and takes precedence over
            ``docstore``.
        id_key: Forwarded unchanged to the base initializer; defaults to
            ``"doc_id"``.
        search_kwargs: Keyword arguments for the search function. ``None``
            is replaced by a fresh empty dict so instances never share one.
        search_type: Type of search to perform; defaults to
            ``SearchType.similarity``.

    Raises:
        ValueError: If neither ``base_store`` nor ``docstore`` is provided.
    """
    if base_store is not None:
        # A byte store wins over any explicitly passed docstore.
        docstore = create_kv_docstore(base_store)
    elif docstore is None:
        # ValueError is still caught by callers handling ``Exception``.
        raise ValueError(
            "You must pass either a `base_store` or a `docstore` parameter."
        )

    super().__init__(
        vectorstore=vectorstore,
        docstore=docstore,
        id_key=id_key,
        search_kwargs=search_kwargs if search_kwargs is not None else {},
        search_type=search_type,
    )

def _get_relevant_documents(
self, query: str, *, run_manager: CallbackManagerForRetrieverRun
) -> List[Document]:
Expand Down

0 comments on commit bb34116

Please sign in to comment.