Skip to content

Commit

Permalink
Support similarity search by vector (in FAISS) (#961)
Browse files Browse the repository at this point in the history
Alternate implementation to PR #960. Again, only FAISS is implemented.
If accepted, this can be added to the other vectorstores, or they can be
left raising NotImplementedError. Suggestions welcome.
  • Loading branch information
seanaedmiston authored Feb 16, 2023
1 parent 05ad399 commit f0a2585
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 8 deletions.
20 changes: 20 additions & 0 deletions docs/modules/utils/combine_docs_examples/vectorstores.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,26 @@
"docs_and_scores[0]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "d5170563",
"metadata": {},
"source": [
"You can also search for documents similar to a given embedding vector with `similarity_search_by_vector`, which takes an embedding vector as its parameter instead of a query string."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7675b0aa",
"metadata": {},
"outputs": [],
"source": [
"embedding_vector = embeddings.embed_query(query)\n",
"docs_and_scores = docsearch.similarity_search_by_vector(embedding_vector)"
]
},
{
"cell_type": "markdown",
"id": "b386dbb8",
Expand Down
32 changes: 32 additions & 0 deletions langchain/vectorstores/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,20 @@ def similarity_search(
) -> List[Document]:
"""Return docs most similar to query."""

def similarity_search_by_vector(
    self, embedding: List[float], k: int = 4, **kwargs: Any
) -> List[Document]:
    """Look up the documents closest to a precomputed embedding vector.

    This version provides no search logic of its own and always raises.

    Args:
        embedding: Embedding vector to find similar documents for.
        k: Number of Documents to return. Defaults to 4.
        **kwargs: Extra keyword arguments; not used here.

    Returns:
        List of Documents most similar to the embedding vector.

    Raises:
        NotImplementedError: Always, in this implementation.
    """
    raise NotImplementedError

def max_marginal_relevance_search(
self, query: str, k: int = 4, fetch_k: int = 20
) -> List[Document]:
Expand All @@ -49,6 +63,24 @@ def max_marginal_relevance_search(
"""
raise NotImplementedError

def max_marginal_relevance_search_by_vector(
    self, embedding: List[float], k: int = 4, fetch_k: int = 20
) -> List[Document]:
    """Select documents for an embedding vector by maximal marginal relevance.

    Maximal marginal relevance balances similarity to the query against
    diversity among the documents that are selected. This version provides
    no search logic of its own and always raises.

    Args:
        embedding: Embedding vector to find similar documents for.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch and hand to the MMR algorithm.

    Returns:
        List of Documents selected by maximal marginal relevance.

    Raises:
        NotImplementedError: Always, in this implementation.
    """
    raise NotImplementedError

@classmethod
def from_documents(
cls,
Expand Down
67 changes: 59 additions & 8 deletions langchain/vectorstores/faiss.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ def add_texts(
self.index_to_docstore_id.update(index_to_id)
return [_id for _, _id, _ in full_info]

def similarity_search_with_score(
self, query: str, k: int = 4
def similarity_search_with_score_by_vector(
self, embedding: List[float], k: int = 4
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query.
Expand All @@ -104,7 +104,6 @@ def similarity_search_with_score(
Returns:
List of Documents most similar to the query and score for each
"""
embedding = self.embedding_function(query)
scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k)
docs = []
for j, i in enumerate(indices[0]):
Expand All @@ -118,6 +117,37 @@ def similarity_search_with_score(
docs.append((doc, scores[0][j]))
return docs

def similarity_search_with_score(
    self, query: str, k: int = 4
) -> List[Tuple[Document, float]]:
    """Find the documents closest to a text query, together with their scores.

    The query is turned into an embedding with ``self.embedding_function``
    and the lookup is delegated to
    ``similarity_search_with_score_by_vector``.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.

    Returns:
        List of (Document, score) pairs for the documents most similar
        to the query.
    """
    query_vector = self.embedding_function(query)
    return self.similarity_search_with_score_by_vector(query_vector, k)

def similarity_search_by_vector(
    self, embedding: List[float], k: int = 4, **kwargs: Any
) -> List[Document]:
    """Find the documents closest to a precomputed embedding vector.

    Delegates to ``similarity_search_with_score_by_vector`` and drops the
    score from each result.

    Args:
        embedding: Embedding vector to find similar documents for.
        k: Number of Documents to return. Defaults to 4.
        **kwargs: Extra keyword arguments; not used here.

    Returns:
        List of Documents most similar to the embedding.
    """
    scored_matches = self.similarity_search_with_score_by_vector(embedding, k)
    documents = []
    for document, _score in scored_matches:
        documents.append(document)
    return documents

def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
Expand All @@ -133,27 +163,28 @@ def similarity_search(
docs_and_scores = self.similarity_search_with_score(query, k)
return [doc for doc, _ in docs_and_scores]

def max_marginal_relevance_search(
self, query: str, k: int = 4, fetch_k: int = 20
def max_marginal_relevance_search_by_vector(
self, embedding: List[float], k: int = 4, fetch_k: int = 20
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
query: Text to look up documents similar to.
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
Returns:
List of Documents selected by maximal marginal relevance.
"""
embedding = self.embedding_function(query)
_, indices = self.index.search(np.array([embedding], dtype=np.float32), fetch_k)
# -1 happens when not enough docs are returned.
embeddings = [self.index.reconstruct(int(i)) for i in indices[0] if i != -1]
mmr_selected = maximal_marginal_relevance(embedding, embeddings, k=k)
mmr_selected = maximal_marginal_relevance(
np.array([embedding], dtype=np.float32), embeddings, k=k
)
selected_indices = [indices[0][i] for i in mmr_selected]
docs = []
for i in selected_indices:
Expand All @@ -164,6 +195,26 @@ def max_marginal_relevance_search(
docs.append(doc)
return docs

def max_marginal_relevance_search(
    self, query: str, k: int = 4, fetch_k: int = 20
) -> List[Document]:
    """Select documents for a text query by maximal marginal relevance.

    Maximal marginal relevance balances similarity to the query against
    diversity among the documents that are selected. The query is turned
    into an embedding with ``self.embedding_function`` and the selection is
    delegated to ``max_marginal_relevance_search_by_vector``.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch and hand to the MMR algorithm.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    query_vector = self.embedding_function(query)
    return self.max_marginal_relevance_search_by_vector(query_vector, k, fetch_k)

@classmethod
def from_texts(
cls,
Expand Down
18 changes: 18 additions & 0 deletions tests/integration_tests/vectorstores/test_faiss.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,24 @@ def test_faiss() -> None:
assert output == [Document(page_content="foo")]


def test_faiss_vector_sim() -> None:
    """Check that a FAISS search by embedding vector returns the matching text."""
    texts = ["foo", "bar", "baz"]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings())
    ids_by_position = docsearch.index_to_docstore_id
    # The docstore should hold one Document per input text, keyed by the id
    # that the index assigned to that text's position.
    expected_docstore = InMemoryDocstore(
        {
            ids_by_position[position]: Document(page_content=text)
            for position, text in enumerate(texts)
        }
    )
    assert docsearch.docstore.__dict__ == expected_docstore.__dict__
    # Search with the embedding of "foo" rather than the string itself.
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.similarity_search_by_vector(query_vec, k=1)
    assert output == [Document(page_content="foo")]


def test_faiss_with_metadatas() -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
Expand Down

0 comments on commit f0a2585

Please sign in to comment.