From e17b6bbe98987ab14cc12e1afa9462483da3a43a Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Mon, 16 Dec 2024 16:15:07 +0300
Subject: [PATCH 1/2] Use similarity scores if available

---
 .../evaluators/BitextMiningEvaluator.py | 27 +++++++++++++------
 .../evaluators/RerankingEvaluator.py    | 26 +++++++++++-------
 2 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/mteb/evaluation/evaluators/BitextMiningEvaluator.py b/mteb/evaluation/evaluators/BitextMiningEvaluator.py
index 4fa7022ed..38668e819 100644
--- a/mteb/evaluation/evaluators/BitextMiningEvaluator.py
+++ b/mteb/evaluation/evaluators/BitextMiningEvaluator.py
@@ -62,7 +62,7 @@ def compute_metrics(self, model: Encoder, encode_kwargs: dict[str, Any] = {}):
             tqdm.tqdm(self.pairs, desc="Matching sentences")
         ):
             scores[f"{key1}-{key2}"] = self._compute_metrics(
-                embeddings[key1], embeddings[key2]
+                embeddings[key1], embeddings[key2], model
             )

         # in case of default pair unnest the dict
@@ -76,10 +76,11 @@ def _compute_metrics(
         self,
         embeddings1,
         embeddings2,
+        model: Encoder,
     ):
         # Find nearest neighbors
         logger.info("Finding nearest neighbors...")
-        nearest_neighbors = self._similarity_search(embeddings1, embeddings2, top_k=1)
+        nearest_neighbors = self._similarity_search(embeddings1, embeddings2, model, top_k=1)

         # Compute errors
         logger.info("Computing metrics...")
@@ -106,10 +107,10 @@ def _similarity_search(
         self,
         query_embeddings,
         corpus_embeddings,
+        model: Encoder,
         query_chunk_size: int = 100,
         corpus_chunk_size: int = 500000,
         top_k: int = 10,
-        score_function=cos_sim,
     ):
         """This function performs a cosine similarity search between a list of query embeddings and a list of corpus embeddings.
         It can be used for Information Retrieval / Semantic Search for corpora up to about 1 Million entries.

         Args:
             query_embeddings: A 2 dimensional tensor with the query embeddings.
             corpus_embeddings: A 2 dimensional tensor with the corpus embeddings.
+            model: The model used to compute similarity scores. If it exposes a `similarity` method, that method is used instead of the default cosine similarity.
             query_chunk_size: Process 100 queries simultaneously. Increasing that value increases the speed, but requires more memory.
             corpus_chunk_size: Scans the corpus 100k entries at a time. Increasing that value increases the speed, but requires more memory.
             top_k: Retrieve top k matching entries.
-            score_function: Function for computing scores. By default, cosine similarity.

         Returns:
             Returns a list with one entry for each query. Each entry is a list of dictionaries with the keys 'corpus_id' and 'score', sorted by decreasing cosine similarity scores.
@@ -142,7 +143,7 @@ def _similarity_search(
             # Iterate over chunks of the corpus
             for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size):
                 # Compute cosine similarities
-                cos_scores = score_function(
+                similarity_scores = cos_sim(
                     query_embeddings[
                         query_start_idx : query_start_idx + query_chunk_size
                     ],
@@ -151,10 +152,20 @@ def _similarity_search(
                     ],
                 )

+                if hasattr(model, "similarity"):
+                    similarity_scores = model.similarity(
+                        query_embeddings[
+                            query_start_idx : query_start_idx + query_chunk_size
+                        ],
+                        corpus_embeddings[
+                            corpus_start_idx : corpus_start_idx + corpus_chunk_size
+                        ],
+                    )
+
                 # Get top-k scores
                 cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
-                    cos_scores,
-                    min(top_k, len(cos_scores[0])),
+                    similarity_scores,
+                    min(top_k, len(similarity_scores[0])),
                     dim=1,
                     largest=True,
                     sorted=False,
@@ -162,7 +173,7 @@ def _similarity_search(
                 cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
                 cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()

-                for query_itr in range(len(cos_scores)):
+                for query_itr in range(len(similarity_scores)):
                     for sub_corpus_id, score in zip(
                         cos_scores_top_k_idx[query_itr],
                         cos_scores_top_k_values[query_itr],
diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py
index 62d741ee0..e12d9eea3 100644
--- a/mteb/evaluation/evaluators/RerankingEvaluator.py
+++ b/mteb/evaluation/evaluators/RerankingEvaluator.py
@@ -34,7 +34,6 @@ def __init__(
         task_name: str | None = None,
         mrr_at_k: int = 10,
         name: str = "",
-        similarity_fct=cos_sim,
         encode_kwargs: dict[str, Any] = {},
         use_batched_encoding: bool = True,
         limit: int | None = None,
@@ -48,7 +47,6 @@ def __init__(
         self.samples = samples
         self.name = name
         self.mrr_at_k = mrr_at_k
-        self.similarity_fct = similarity_fct
         self.use_batched_encoding = use_batched_encoding
         self.task_name = task_name
         self.k_values = k_values
@@ -211,6 +209,7 @@ def _encode_candidates_batched(
             all_mrr_scores,
             all_ap_scores,
             all_conf_scores,
+            model,
         )

     def _encode_candidates_individual(
@@ -257,6 +256,7 @@ def _encode_candidates_individual(
             all_mrr_scores,
             all_ap_scores,
             all_conf_scores,
+            model,
         )

     def _collect_results(self, all_mrr_scores, all_ap_scores, all_conf_scores):
@@ -316,7 +316,7 @@ def _encode_candidates_miracl_batched(self, all_query_embs, model: Encoder):
             docs_idx += num_doc

             fake_qid = str(query_idx)
-            results[fake_qid] = self.rerank(query_emb, docs_emb)
+            results[fake_qid] = self.rerank(query_emb, docs_emb, model)
             qrels[fake_qid] = {
                 str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs)
             }
@@ -351,7 +351,7 @@ def _encode_candidates_miracl_individual(self, model: Encoder):
             )

             fake_qid = str(i)
-            results[fake_qid] = self.rerank(query_emb, docs_emb)
+            results[fake_qid] = self.rerank(query_emb, docs_emb, model)
             qrels[fake_qid] = {
                 str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs)
             }
@@ -371,7 +371,7 @@ def _collect_miracl_results(self, results, qrels):
         return scores_miracl

     def rerank(
-        self, query_emb: torch.Tensor, docs_emb: torch.Tensor
+        self, query_emb: np.ndarray, docs_emb: np.ndarray, model: Encoder
     ) -> dict[str, float]:
         """Rerank documents (docs_emb) given the query (query_emb)

         Args:
             query_emb: Query embedding of shape `(num_queries, hidden_size)`)
                 if `num_queries` > 0: we take the closest document to any of the queries
             docs_emb: Candidates documents embeddings of shape `(num_pos+num_neg, hidden_size)`)
+            model: Model to use for computing similarity scores if model.similarity is available

         Returns:
             similarity_scores:
@@ -389,7 +390,10 @@
         if not docs_emb.shape[0]:
             return {"empty-docid": 0}

-        pred_scores = self.similarity_fct(query_emb, docs_emb)
+        if hasattr(model, "similarity"):
+            pred_scores = model.similarity(query_emb, docs_emb)
+        else:
+            pred_scores = cos_sim(query_emb, docs_emb)
         if len(pred_scores.shape) > 1:
             pred_scores = torch.amax(pred_scores, dim=0)
@@ -405,8 +409,9 @@ def _apply_sim_scores(
         all_mrr_scores,
         all_ap_scores,
         all_conf_scores,
+        model: Encoder,
     ):
-        sim_scores = self._compute_sim_scores_instance(query_emb, docs_emb)
+        sim_scores = self._compute_sim_scores_instance(query_emb, docs_emb, model)
         scores = self._compute_metrics_instance(sim_scores, is_relevant)
         conf_scores = self.conf_scores(sim_scores.tolist())
@@ -443,7 +448,7 @@ def _encode_unique_texts(
         return all_unique_texts_embs[all_texts_indexes]

     def _compute_sim_scores_instance(
-        self, query_emb: torch.Tensor, docs_emb: torch.Tensor
+        self, query_emb: np.ndarray, docs_emb: np.ndarray, model: Encoder
     ) -> torch.Tensor:
         """Computes similarity scores for a single instance = (query, positives, negatives)
@@ -455,7 +460,10 @@
         Returns:
             sim_scores: Query-documents similarity scores, with shape `(num_pos+num_neg,)`
         """
-        sim_scores = self.similarity_fct(query_emb, docs_emb)
+        if hasattr(model, "similarity"):
+            sim_scores = model.similarity(query_emb, docs_emb)
+        else:
+            sim_scores = cos_sim(query_emb, docs_emb)
         if len(sim_scores.shape) > 1:
             sim_scores = torch.amax(sim_scores, dim=0)

From 89eac0786f7ec618e4deedebd1faac855ac764a5 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Mon, 16 Dec 2024 16:52:58 +0300
Subject: [PATCH 2/2] lint

---
 mteb/evaluation/evaluators/BitextMiningEvaluator.py | 4 +++-
 mteb/evaluation/evaluators/RerankingEvaluator.py    | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/mteb/evaluation/evaluators/BitextMiningEvaluator.py b/mteb/evaluation/evaluators/BitextMiningEvaluator.py
index 38668e819..4b068653d 100644
--- a/mteb/evaluation/evaluators/BitextMiningEvaluator.py
+++ b/mteb/evaluation/evaluators/BitextMiningEvaluator.py
@@ -80,7 +80,9 @@ def _compute_metrics(
     ):
         # Find nearest neighbors
         logger.info("Finding nearest neighbors...")
-        nearest_neighbors = self._similarity_search(embeddings1, embeddings2, model, top_k=1)
+        nearest_neighbors = self._similarity_search(
+            embeddings1, embeddings2, model, top_k=1
+        )

         # Compute errors
         logger.info("Computing metrics...")
diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py
index e12d9eea3..3df204f86 100644
--- a/mteb/evaluation/evaluators/RerankingEvaluator.py
+++ b/mteb/evaluation/evaluators/RerankingEvaluator.py
@@ -456,6 +456,7 @@ def _compute_sim_scores_instance(
             query_emb: Query embedding, with shape `(num_queries, hidden_size)`
                 if `num_queries` > 0: we take the closest document to any of the queries
             docs_emb: Candidates documents embeddings, with shape `(num_pos+num_neg, hidden_size)`
+            model: Model to use for computing similarity scores if model.similarity is available

         Returns:
             sim_scores: Query-documents similarity scores, with shape `(num_pos+num_neg,)`
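
Both evaluators now share the same fallback logic: use the encoder's own `similarity` method when it exposes one (recent SentenceTransformer models do), otherwise fall back to cosine similarity. Below is a minimal sketch of that pattern, not part of the patches themselves; the helper names `similarity_scores` and `_cos_sim` are illustrative, with `_cos_sim` standing in for the `cos_sim` utility the evaluators import.

    import torch
    import torch.nn.functional as F

    def _cos_sim(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        # Pairwise cosine similarity between all rows of a and b,
        # equivalent in spirit to the cos_sim helper used by the evaluators.
        return F.normalize(a, dim=-1) @ F.normalize(b, dim=-1).T

    def similarity_scores(query_emb: torch.Tensor, docs_emb: torch.Tensor, model) -> torch.Tensor:
        # Prefer the model's own similarity function when available
        # (e.g. a SentenceTransformer model exposing .similarity()).
        if hasattr(model, "similarity"):
            return model.similarity(query_emb, docs_emb)
        # Otherwise keep the previous default: cosine similarity.
        return _cos_sim(query_emb, docs_emb)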