Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: Use similarity scores if available #1602

Merged
merged 2 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 21 additions & 8 deletions mteb/evaluation/evaluators/BitextMiningEvaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def compute_metrics(self, model: Encoder, encode_kwargs: dict[str, Any] = {}):
tqdm.tqdm(self.pairs, desc="Matching sentences")
):
scores[f"{key1}-{key2}"] = self._compute_metrics(
embeddings[key1], embeddings[key2]
embeddings[key1], embeddings[key2], model
)

# in case of default pair unnest the dict
Expand All @@ -76,10 +76,13 @@ def _compute_metrics(
self,
embeddings1,
embeddings2,
model: Encoder,
):
# Find nearest neighbors
logger.info("Finding nearest neighbors...")
nearest_neighbors = self._similarity_search(embeddings1, embeddings2, top_k=1)
nearest_neighbors = self._similarity_search(
embeddings1, embeddings2, model, top_k=1
)

# Compute errors
logger.info("Computing metrics...")
Expand All @@ -106,21 +109,21 @@ def _similarity_search(
self,
query_embeddings,
corpus_embeddings,
model: Encoder,
query_chunk_size: int = 100,
corpus_chunk_size: int = 500000,
top_k: int = 10,
score_function=cos_sim,
):
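"""This function performs a similarity search between a list of query embeddings and a list of corpus embeddings, scoring with the model's own `similarity` method when it defines one and with cosine similarity otherwise.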
It can be used for Information Retrieval / Semantic Search for corpora up to about 1 Million entries.

Args:
query_embeddings: A 2 dimensional tensor with the query embeddings.
corpus_embeddings: A 2 dimensional tensor with the corpus embeddings.
model: The model used to encode the queries and corpus. If it defines a `similarity` method, that method is used to score each query/corpus chunk in place of cosine similarity.
query_chunk_size: Process 100 queries simultaneously. Increasing that value increases the speed, but requires more memory.
corpus_chunk_size: Number of corpus entries scanned at a time (default 500,000). Increasing that value increases the speed, but requires more memory.
top_k: Retrieve top k matching entries.
score_function: Function for computing scores. By default, cosine similarity.

Returns:
Returns a list with one entry for each query. Each entry is a list of dictionaries with the keys 'corpus_id' and 'score', sorted by decreasing cosine similarity scores.
Expand All @@ -142,7 +145,7 @@ def _similarity_search(
# Iterate over chunks of the corpus
for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size):
# Compute cosine similarities
cos_scores = score_function(
similarity_scores = cos_sim(
query_embeddings[
query_start_idx : query_start_idx + query_chunk_size
],
Expand All @@ -151,18 +154,28 @@ def _similarity_search(
],
)

if hasattr(model, "similarity"):
similarity_scores = model.similarity(
query_embeddings[
query_start_idx : query_start_idx + query_chunk_size
],
corpus_embeddings[
corpus_start_idx : corpus_start_idx + corpus_chunk_size
],
)

# Get top-k scores
cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
cos_scores,
min(top_k, len(cos_scores[0])),
similarity_scores,
min(top_k, len(similarity_scores[0])),
dim=1,
largest=True,
sorted=False,
)
cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()

for query_itr in range(len(cos_scores)):
for query_itr in range(len(similarity_scores)):
for sub_corpus_id, score in zip(
cos_scores_top_k_idx[query_itr],
cos_scores_top_k_values[query_itr],
Expand Down
27 changes: 18 additions & 9 deletions mteb/evaluation/evaluators/RerankingEvaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ def __init__(
task_name: str | None = None,
mrr_at_k: int = 10,
name: str = "",
similarity_fct=cos_sim,
encode_kwargs: dict[str, Any] = {},
use_batched_encoding: bool = True,
limit: int | None = None,
Expand All @@ -48,7 +47,6 @@ def __init__(
self.samples = samples
self.name = name
self.mrr_at_k = mrr_at_k
self.similarity_fct = similarity_fct
self.use_batched_encoding = use_batched_encoding
self.task_name = task_name
self.k_values = k_values
Expand Down Expand Up @@ -211,6 +209,7 @@ def _encode_candidates_batched(
all_mrr_scores,
all_ap_scores,
all_conf_scores,
model,
)

def _encode_candidates_individual(
Expand Down Expand Up @@ -257,6 +256,7 @@ def _encode_candidates_individual(
all_mrr_scores,
all_ap_scores,
all_conf_scores,
model,
)

def _collect_results(self, all_mrr_scores, all_ap_scores, all_conf_scores):
Expand Down Expand Up @@ -316,7 +316,7 @@ def _encode_candidates_miracl_batched(self, all_query_embs, model: Encoder):
docs_idx += num_doc

fake_qid = str(query_idx)
results[fake_qid] = self.rerank(query_emb, docs_emb)
results[fake_qid] = self.rerank(query_emb, docs_emb, model)
qrels[fake_qid] = {
str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs)
}
Expand Down Expand Up @@ -351,7 +351,7 @@ def _encode_candidates_miracl_individual(self, model: Encoder):
)

fake_qid = str(i)
results[fake_qid] = self.rerank(query_emb, docs_emb)
results[fake_qid] = self.rerank(query_emb, docs_emb, model)
qrels[fake_qid] = {
str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs)
}
Expand All @@ -371,14 +371,15 @@ def _collect_miracl_results(self, results, qrels):
return scores_miracl

def rerank(
self, query_emb: torch.Tensor, docs_emb: torch.Tensor
self, query_emb: np.ndarray, docs_emb: np.ndarray, model: Encoder
) -> dict[str, float]:
"""Rerank documents (docs_emb) given the query (query_emb)

Args:
query_emb: Query embedding of shape `(num_queries, hidden_size)`.
If `num_queries` > 1, each document is scored by its highest similarity to any of the queries.
docs_emb: Candidate document embeddings of shape `(num_pos+num_neg, hidden_size)`.
model: Model to use for computing similarity scores if model.similarity is available

Returns:
similarity_scores:
Expand All @@ -389,7 +390,10 @@ def rerank(
if not docs_emb.shape[0]:
return {"empty-docid": 0}

pred_scores = self.similarity_fct(query_emb, docs_emb)
if hasattr(model, "similarity"):
pred_scores = model.similarity(query_emb, docs_emb)
else:
pred_scores = cos_sim(query_emb, docs_emb)
if len(pred_scores.shape) > 1:
pred_scores = torch.amax(pred_scores, dim=0)

Expand All @@ -405,8 +409,9 @@ def _apply_sim_scores(
all_mrr_scores,
all_ap_scores,
all_conf_scores,
model: Encoder,
):
sim_scores = self._compute_sim_scores_instance(query_emb, docs_emb)
sim_scores = self._compute_sim_scores_instance(query_emb, docs_emb, model)
scores = self._compute_metrics_instance(sim_scores, is_relevant)
conf_scores = self.conf_scores(sim_scores.tolist())

Expand Down Expand Up @@ -443,19 +448,23 @@ def _encode_unique_texts(
return all_unique_texts_embs[all_texts_indexes]

def _compute_sim_scores_instance(
self, query_emb: torch.Tensor, docs_emb: torch.Tensor
self, query_emb: np.ndarray, docs_emb: np.ndarray, model: Encoder
) -> torch.Tensor:
"""Computes similarity scores for a single instance = (query, positives, negatives)

Args:
query_emb: Query embedding, with shape `(num_queries, hidden_size)`.
If `num_queries` > 1, each document is scored by its highest similarity to any of the queries.
docs_emb: Candidate document embeddings, with shape `(num_pos+num_neg, hidden_size)`.
model: Model to use for computing similarity scores if model.similarity is available

Returns:
sim_scores: Query-documents similarity scores, with shape `(num_pos+num_neg,)`
"""
sim_scores = self.similarity_fct(query_emb, docs_emb)
if hasattr(model, "similarity"):
sim_scores = model.similarity(query_emb, docs_emb)
else:
sim_scores = cos_sim(query_emb, docs_emb)
if len(sim_scores.shape) > 1:
sim_scores = torch.amax(sim_scores, dim=0)

Expand Down
Loading