Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docs: Chroma docstrings update #22001

Merged
merged 4 commits into from
May 22, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
210 changes: 145 additions & 65 deletions libs/partners/chroma/langchain_chroma/vectorstores.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,11 @@ def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:


def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
"""Row-wise cosine similarity between two equal-width matrices."""
"""Row-wise cosine similarity between two equal-width matrices.

Raises:
ValueError: If the number of columns in X and Y are not the same.
"""
if len(X) == 0 or len(Y) == 0:
return np.array([])

Expand Down Expand Up @@ -80,7 +84,21 @@ def maximal_marginal_relevance(
lambda_mult: float = 0.5,
k: int = 4,
) -> List[int]:
"""Calculate maximal marginal relevance."""
"""Calculate maximal marginal relevance.

Args:
query_embedding: Query embedding.
embedding_list: List of embeddings to select from.
lambda_mult: Number between 0 and 1 that determines the degree
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
k: Number of Documents to return. Defaults to 4.

Returns:
List of indices of embeddings selected by maximal marginal relevance.
"""

if min(k, len(embedding_list)) <= 0:
return []
if query_embedding.ndim == 1:
Expand Down Expand Up @@ -136,8 +154,21 @@ def __init__(
relevance_score_fn: Optional[Callable[[float], float]] = None,
create_collection_if_not_exists: Optional[bool] = True,
) -> None:
"""Initialize with a Chroma client."""
"""Initialize with a Chroma client.

Args:
collection_name: Name of the collection to create.
embedding_function: Embedding class object. Used to embed texts.
persist_director: Directory to persist the collection.
client_settings: Chroma client settings
collection_metadata: Collection configurations.
client: Chroma client. Documentation:
https://docs.trychroma.com/reference/js-client#class:-chromaclient
relevance_score_fn: Function to calculate relevance score from distance.
Used only in `similarity_search_with_relevance_scores`
create_collection_if_not_exists: Whether to create collection
if it doesn't exist. Defaults to True.
"""
if client is not None:
self._client_settings = client_settings
self._client = client
Expand Down Expand Up @@ -204,7 +235,23 @@ def __query_collection(
where_document: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> Union[List[Document], chromadb.QueryResult]:
"""Query the chroma collection."""
"""Query the chroma collection.

Args:
query_texts: List of query texts.
query_embeddings: List of query embeddings.
n_results: Number of results to return. Defaults to 4.
where: dict used to filter results by
e.g. {"color" : "red", "price": 4.20}.
where_document: dict used to filter by the documents.
E.g. {$contains: {"text": "hello"}}.

Returns:
List of `n_results` nearest neighbor embeddings for provided
query_embeddings or query_texts.

See more: https://docs.trychroma.com/reference/py-collection#query
"""
return self._collection.query(
query_texts=query_texts,
query_embeddings=query_embeddings, # type: ignore
Expand All @@ -229,12 +276,16 @@ def add_images(
"""Run more images through the embeddings and add to the vectorstore.

Args:
uris List[str]: File path to the image.
metadatas (Optional[List[dict]], optional): Optional list of metadatas.
ids (Optional[List[str]], optional): Optional list of IDs.
uris: File path to the image.
metadatas: Optional list of metadatas.
When querying, you can filter on this metadata.
ids: Optional list of IDs.

Returns:
List[str]: List of IDs of the added images.
List of IDs of the added images.

Raises:
ValueError: When metadata is incorrect.
"""
# Map from uris to b64 encoded strings
b64_texts = [self.encode_image(uri=uri) for uri in uris]
Expand Down Expand Up @@ -312,14 +363,18 @@ def add_texts(
"""Run more texts through the embeddings and add to the vectorstore.

Args:
texts (Iterable[str]): Texts to add to the vectorstore.
metadatas (Optional[List[dict]], optional): Optional list of metadatas.
ids (Optional[List[str]], optional): Optional list of IDs.
texts: Texts to add to the vectorstore.
metadatas: Optional list of metadatas.
When querying, you can filter on this metadata.
ids: Optional list of IDs.

Returns:
List[str]: List of IDs of the added texts.
List of IDs of the added texts.

Raises:
ValueError: When metadata is incorrect.
"""
# TODO: Handle the case where the user doesn't provide ids on the Collection

if ids is None:
ids = [str(uuid.uuid4()) for _ in texts]
embeddings = None
Expand Down Expand Up @@ -391,12 +446,12 @@ def similarity_search(
"""Run similarity search with Chroma.

Args:
query (str): Query text to search for.
k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
query: Query text to search for.
k: Number of results to return. Defaults to 4.
filter: Filter by metadata. Defaults to None.

Returns:
List[Document]: List of documents most similar to the query text.
List of documents most similar to the query text.
"""
docs_and_scores = self.similarity_search_with_score(
query, k, filter=filter, **kwargs
Expand All @@ -412,10 +467,14 @@ def similarity_search_by_vector(
**kwargs: Any,
) -> List[Document]:
"""Return docs most similar to embedding vector.

Args:
embedding (List[float]): Embedding to look up documents similar to.
k (int): Number of Documents to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: Filter by metadata. Defaults to None.
where_document: dict used to filter by the documents.
E.g. {$contains: {"text": "hello"}}.

Returns:
List of Documents most similar to the query vector.
"""
Expand All @@ -441,13 +500,14 @@ def similarity_search_by_vector_with_relevance_scores(

Args:
embedding (List[float]): Embedding to look up documents similar to.
k (int): Number of Documents to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
k: Number of Documents to return. Defaults to 4.
filter: Filter by metadata. Defaults to None.
where_document: dict used to filter by the documents.
E.g. {$contains: {"text": "hello"}}.

Returns:
List[Tuple[Document, float]]: List of documents most similar to
the query text and cosine distance in float for each.
Lower score represents more similarity.
List of documents most similar to the query text and relevance score
in float for each. Lower score represents more similarity.
"""
results = self.__query_collection(
query_embeddings=embedding,
Expand All @@ -469,14 +529,15 @@ def similarity_search_with_score(
"""Run similarity search with Chroma with distance.

Args:
query (str): Query text to search for.
k (int): Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
query: Query text to search for.
k: Number of results to return. Defaults to 4.
filter: Filter by metadata. Defaults to None.
where_document: dict used to filter by the documents.
E.g. {$contains: {"text": "hello"}}.

Returns:
List[Tuple[Document, float]]: List of documents most similar to
the query text and cosine distance in float for each.
Lower score represents more similarity.
List of documents most similar to the query text and
distance in float for each. Lower score represents more similarity.
"""
if self._embedding_function is None:
results = self.__query_collection(
Expand All @@ -499,14 +560,21 @@ def similarity_search_with_score(
return _results_to_docs_and_scores(results)

def _select_relevance_score_fn(self) -> Callable[[float], float]:
"""Select the relevance score function based on collections distance metric.

The most similar documents will have the lowest relevance score. Default
relevance score function is euclidean distance. Distance metric must be
provided in `collection_metadata` during initizalition of Chroma object.
Example: collection_metadata={"hnsw:space": "cosine"}. Available distance
metrics are: 'cosine', 'l2' and 'ip'.

Returns:
The relevance score function.

Raises:
ValueError: If the distance metric is not supported.
"""
The 'correct' relevance function
may differ depending on a few things, including:
- the distance / similarity metric used by the VectorStore
- the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
- embedding dimensionality
- etc.
"""

if self.override_relevance_score_fn:
return self.override_relevance_score_fn

Expand Down Expand Up @@ -541,18 +609,20 @@ def max_marginal_relevance_search_by_vector(
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.

Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.

Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
fetch_k: Number of Documents to fetch to pass to MMR algorithm. Defaults to
20.
lambda_mult: Number between 0 and 1 that determines the degree
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
filter: Filter by metadata. Defaults to None.

Returns:
List of Documents selected by maximal marginal relevance.
Expand Down Expand Up @@ -600,26 +670,30 @@ def max_marginal_relevance_search(
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
filter: Filter by metadata. Defaults to None.
where_document: dict used to filter by the documents.
E.g. {$contains: {"text": "hello"}}.

Returns:
List of Documents selected by maximal marginal relevance.

Raises:
ValueError: If the embedding function is not provided.
"""
if self._embedding_function is None:
raise ValueError(
"For MMR search, you must specify an embedding function on" "creation."
)

embedding = self._embedding_function.embed_query(query)
docs = self.max_marginal_relevance_search_by_vector(
return self.max_marginal_relevance_search_by_vector(
embedding,
k,
fetch_k,
lambda_mult=lambda_mult,
filter=filter,
where_document=where_document,
)
return docs

def delete_collection(self) -> None:
"""Delete the collection."""
Expand Down Expand Up @@ -656,6 +730,9 @@ def get(
Can contain `"embeddings"`, `"metadatas"`, `"documents"`.
Ids are always included.
Defaults to `["metadatas", "documents"]`. Optional.

Return:
A dict with the keys `"ids"`, `"embeddings"`, `"metadatas"`, `"documents"`.
"""
kwargs = {
"ids": ids,
Expand All @@ -674,8 +751,8 @@ def update_document(self, document_id: str, document: Document) -> None:
"""Update a document in the collection.

Args:
document_id (str): ID of the document to update.
document (Document): Document to update.
document_id: ID of the document to update.
document: Document to update.
"""
return self.update_documents([document_id], [document])

Expand All @@ -684,8 +761,11 @@ def update_documents(self, ids: List[str], documents: List[Document]) -> None:
"""Update a document in the collection.

Args:
ids (List[str]): List of ids of the document to update.
documents (List[Document]): List of documents to update.
ids: List of ids of the document to update.
documents: List of documents to update.

Raises:
ValueError: If the embedding function is not provided.
"""
text = [document.page_content for document in documents]
metadata = [document.metadata for document in documents]
Expand Down Expand Up @@ -741,14 +821,14 @@ def from_texts(
Otherwise, the data will be ephemeral in-memory.

Args:
texts (List[str]): List of texts to add to the collection.
collection_name (str): Name of the collection to create.
persist_directory (Optional[str]): Directory to persist the collection.
embedding (Optional[Embeddings]): Embedding function. Defaults to None.
metadatas (Optional[List[dict]]): List of metadatas. Defaults to None.
ids (Optional[List[str]]): List of document IDs. Defaults to None.
client_settings (Optional[chromadb.config.Settings]): Chroma client settings
collection_metadata (Optional[Dict]): Collection configurations.
texts: List of texts to add to the collection.
collection_name: Name of the collection to create.
persist_directory: Directory to persist the collection.
embedding: Embedding function. Defaults to None.
metadatas: List of metadatas. Defaults to None.
ids: List of document IDs. Defaults to None.
client_settings: Chroma client settings
collection_metadata: Collection configurations.
Defaults to None.

Returns:
Expand Down Expand Up @@ -804,13 +884,13 @@ def from_documents(
Otherwise, the data will be ephemeral in-memory.

Args:
collection_name (str): Name of the collection to create.
persist_directory (Optional[str]): Directory to persist the collection.
ids (Optional[List[str]]): List of document IDs. Defaults to None.
documents (List[Document]): List of documents to add to the vectorstore.
embedding (Optional[Embeddings]): Embedding function. Defaults to None.
client_settings (Optional[chromadb.config.Settings]): Chroma client settings
collection_metadata (Optional[Dict]): Collection configurations.
collection_name: Name of the collection to create.
persist_directory: Directory to persist the collection.
ids : List of document IDs. Defaults to None.
documents: List of documents to add to the vectorstore.
embedding: Embedding function. Defaults to None.
client_settings: Chroma client settings
collection_metadata: Collection configurations.
Defaults to None.

Returns:
Expand Down
Loading