Add Chinese tasks (C-MTEB) (#134)
* add C_MTEB

* rename MMarcoReranking

* Update mteb/tasks/Retrieval/CMTEBRetrieval.py

* Update README.md

* Allow custom encode functions

---------

Co-authored-by: shitao <stxiao@bupt.edu.cn>
Co-authored-by: Nouamane Tazi <nouamane98@gmail.com>
Co-authored-by: Niklas Muennighoff <n.muennighoff@gmail.com>
4 people authored Aug 26, 2023
1 parent 2779344 commit 071974a
Showing 16 changed files with 652 additions and 12 deletions.
5 changes: 4 additions & 1 deletion README.md
@@ -181,7 +181,8 @@ evaluation.run(model)
 ## Leaderboard
 
 The MTEB Leaderboard is available [here](https://huggingface.co/spaces/mteb/leaderboard). To submit:
-1. Run on MTEB: You can reference [scripts/run_mteb_english.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_english.py) for all MTEB English datasets used in the main ranking. Advanced scripts with different models are available in the [mteb/mtebscripts repo](https://github.com/embeddings-benchmark/mtebscripts).
+1. Run on MTEB: You can reference [scripts/run_mteb_english.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_english.py) for all MTEB English datasets used in the main ranking, or [scripts/run_mteb_chinese.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_chinese.py) for the Chinese ones.
+Advanced scripts with different models are available in the [mteb/mtebscripts repo](https://github.com/embeddings-benchmark/mtebscripts).
 2. Format the json files into metadata using the script at `scripts/mteb_meta.py`. For example
 `python scripts/mteb_meta.py path_to_results_folder`, which will create a `mteb_metadata.md` file. If you ran CQADupstack retrieval, make sure to merge the results first with `python scripts/merge_cqadupstack.py path_to_results_folder`.
 3. Copy the content of the `mteb_metadata.md` file to the top of a `README.md` file of your model on the Hub. See [here](https://huggingface.co/Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit/blob/main/README.md) for an example.
@@ -304,6 +305,8 @@ The MTEB Leaderboard is available [here](https://huggingface.co/spaces/mteb/lead
 | [CDSC-R](https://aclanthology.org/P17-1073.pdf) | [PL-MTEB/cdscr-sts](https://huggingface.co/datasets/PL-MTEB/cdscr-sts) | Compositional Distributional Semantics Corpus for textual relatedness. | STS | s2s | 1 | 16000 | 2000 | 2000 | 72.1 | 73.2 | 75.0 |
 | [SummEval](https://github.com/Yale-LILY/SummEval) | [mteb/summeval](https://huggingface.co/datasets/mteb/summeval) | News Article Summary Semantic Similarity Estimation. | Summarization | s2s | 1 | 0 | 0 | 2800 | 0 | 0 | 359.8 |
 
+For Chinese tasks, you can refer to [C_MTEB](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB).
+
 ## Citation
 
 If you find MTEB useful, feel free to cite our publication [MTEB: Massive Text Embedding Benchmark](https://arxiv.org/abs/2210.07316):
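Besides the new `scripts/run_mteb_chinese.py` entry point, the Chinese tasks can also be selected through the library's language filter, since every task added in this commit declares `'eval_langs': ['zh']`. A minimal sketch following the README's `evaluation.run(model)` pattern; the checkpoint name is only an illustrative choice, not something the commit prescribes:

```python
from mteb import MTEB
from sentence_transformers import SentenceTransformer

# Example checkpoint; any model exposing an `encode` method works.
model = SentenceTransformer("BAAI/bge-small-zh")

# Filter tasks by language to pick up the C-MTEB tasks.
evaluation = MTEB(task_langs=["zh"])
evaluation.run(model, output_folder="results/bge-small-zh")
```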
1 change: 1 addition & 0 deletions mteb/evaluation/MTEB.py
@@ -65,6 +65,7 @@ def __init__(
         else:
             self._task_types = task_types
         self._task_categories = task_categories
+        self._tasks = None
 
         self._task_langs = task_langs if task_langs is not None else []
         if type(self._task_langs) is str:
28 changes: 19 additions & 9 deletions mteb/evaluation/evaluators/RerankingEvaluator.py
@@ -71,17 +71,22 @@ def compute_metrics_batched(self, model):
         all_mrr_scores = []
         all_ap_scores = []
 
+        # Use the model's encode_queries and encode_corpus functions if they exist;
+        # users can define them to conveniently add different instructions for queries and passages.
+        encode_queries_func = model.encode_queries if hasattr(model, 'encode_queries') else model.encode
+        encode_corpus_func = model.encode_corpus if hasattr(model, 'encode_corpus') else model.encode
+
         logger.info("Encoding queries...")
         if isinstance(self.samples[0]["query"], str):
-            all_query_embs = model.encode(
-                [sample["query"] for sample in self.samples],
-                convert_to_tensor=True,
-                batch_size=self.batch_size,
-            )
+            all_query_embs = encode_queries_func(
+                [sample["query"] for sample in self.samples],
+                convert_to_tensor=True,
+                batch_size=self.batch_size,
+            )
         elif isinstance(self.samples[0]["query"], list):
             # In case the query is a list of strings, we get the most similar embedding to any of the queries
             all_query_flattened = [q for sample in self.samples for q in sample["query"]]
-            all_query_embs = model.encode(all_query_flattened, convert_to_tensor=True, batch_size=self.batch_size)
+            all_query_embs = encode_queries_func(all_query_flattened, convert_to_tensor=True, batch_size=self.batch_size)
         else:
             raise ValueError(f"Query must be a string or a list of strings but is {type(self.samples[0]['query'])}")

@@ -91,7 +96,7 @@ def compute_metrics_batched(self, model):
             all_docs.extend(sample["positive"])
             all_docs.extend(sample["negative"])
 
-        all_docs_embs = model.encode(all_docs, convert_to_tensor=True, batch_size=self.batch_size)
+        all_docs_embs = encode_corpus_func(all_docs, convert_to_tensor=True, batch_size=self.batch_size)
 
         # Compute scores
         logger.info("Evaluating...")
@@ -130,6 +135,11 @@ def compute_metrics_individual(self, model):
         all_mrr_scores = []
         all_ap_scores = []
 
+        # Use the model's encode_queries and encode_corpus functions if they exist;
+        # users can define them to conveniently add different instructions for queries and passages.
+        encode_queries_func = model.encode_queries if hasattr(model, 'encode_queries') else model.encode
+        encode_corpus_func = model.encode_corpus if hasattr(model, 'encode_corpus') else model.encode
+
         for instance in tqdm.tqdm(self.samples, desc="Samples"):
             query = instance["query"]
             positive = list(instance["positive"])
@@ -141,8 +151,8 @@ def compute_metrics_individual(self, model):
             docs = positive + negative
             is_relevant = [True] * len(positive) + [False] * len(negative)
 
-            query_emb = model.encode([query], convert_to_tensor=True, batch_size=self.batch_size)
-            docs_emb = model.encode(docs, convert_to_tensor=True, batch_size=self.batch_size)
+            query_emb = encode_queries_func([query], convert_to_tensor=True, batch_size=self.batch_size)
+            docs_emb = encode_corpus_func(docs, convert_to_tensor=True, batch_size=self.batch_size)
 
             scores = self._compute_metrics_instance(query_emb, docs_emb, is_relevant)
             all_mrr_scores.append(scores["mrr"])
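The `hasattr` fallbacks above make the evaluator duck-typed: `encode` is always required, while `encode_queries` and `encode_corpus` are used only when a model defines them. A minimal sketch of a wrapper that exploits this; the checkpoint name and instruction prefix are illustrative assumptions, not part of the commit:

```python
from sentence_transformers import SentenceTransformer


class InstructedEncoder:
    """Wrapper whose query/corpus hooks RerankingEvaluator discovers via hasattr."""

    def __init__(self, name="BAAI/bge-base-zh"):  # example checkpoint
        self.model = SentenceTransformer(name)

    def encode(self, sentences, **kwargs):
        # Fallback for tasks that do not distinguish queries from passages.
        return self.model.encode(sentences, **kwargs)

    def encode_queries(self, queries, **kwargs):
        # Hypothetical retrieval instruction, prepended to queries only.
        return self.model.encode(["Represent this query: " + q for q in queries], **kwargs)

    def encode_corpus(self, corpus, **kwargs):
        return self.model.encode(corpus, **kwargs)
```

`kwargs` here receives the evaluator's `convert_to_tensor=True` and `batch_size=...`, which `SentenceTransformer.encode` accepts unchanged.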
101 changes: 101 additions & 0 deletions mteb/tasks/Classification/CMTEBClassification.py
@@ -0,0 +1,101 @@
from ...abstasks import AbsTaskClassification

class TNews(AbsTaskClassification):
    @property
    def description(self):
        return {
            'name': 'TNews',
            'hf_hub_name': 'C-MTEB/TNews-classification',
            'description': 'Short Text Classification for News',
            "reference": "https://www.cluebenchmarks.com/introduce.html",
            'type': 'Classification',
            'category': 's2s',
            'eval_splits': ['validation'],
            'eval_langs': ['zh'],
            'main_score': 'accuracy',
            'samples_per_label': 32,
        }


class IFlyTek(AbsTaskClassification):
    @property
    def description(self):
        return {
            'name': 'IFlyTek',
            'hf_hub_name': 'C-MTEB/IFlyTek-classification',
            'description': 'Long text classification of app descriptions',
            "reference": "https://www.cluebenchmarks.com/introduce.html",
            'type': 'Classification',
            'category': 's2s',
            'eval_splits': ['validation'],
            'eval_langs': ['zh'],
            'main_score': 'accuracy',
            'samples_per_label': 32,
            'n_experiments': 5
        }


class MultilingualSentiment(AbsTaskClassification):
    @property
    def description(self):
        return {
            'name': 'MultilingualSentiment',
            'hf_hub_name': 'C-MTEB/MultilingualSentiment-classification',
            'description': 'A collection of multilingual sentiment datasets grouped into 3 classes: positive, neutral, negative',
            "reference": "https://github.com/tyqiangz/multilingual-sentiment-datasets",
            'category': 's2s',
            'type': 'Classification',
            'eval_splits': ['validation'],
            'eval_langs': ['zh'],
            'main_score': 'accuracy',
            'samples_per_label': 32,
        }


class JDReview(AbsTaskClassification):
    @property
    def description(self):
        return {
            'name': 'JDReview',
            'hf_hub_name': 'C-MTEB/JDReview-classification',
            'description': 'User reviews of iPhones',
            'category': 's2s',
            'type': 'Classification',
            'eval_splits': ['test'],
            'eval_langs': ['zh'],
            'main_score': 'accuracy',
            'samples_per_label': 32,
        }


class OnlineShopping(AbsTaskClassification):
    @property
    def description(self):
        return {
            'name': 'OnlineShopping',
            'hf_hub_name': 'C-MTEB/OnlineShopping-classification',
            'description': 'Sentiment Analysis of User Reviews on Online Shopping Websites',
            'category': 's2s',
            'type': 'Classification',
            'eval_splits': ['test'],
            'eval_langs': ['zh'],
            'main_score': 'accuracy',
            'samples_per_label': 32,
        }


class Waimai(AbsTaskClassification):
    @property
    def description(self):
        return {
            'name': 'Waimai',
            'hf_hub_name': 'C-MTEB/waimai-classification',
            'description': 'Sentiment Analysis of user reviews on takeaway platforms',
            'category': 's2s',
            'type': 'Classification',
            'eval_splits': ['test'],
            'eval_langs': ['zh'],
            'main_score': 'accuracy',
            'samples_per_label': 32,
        }
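Each class above is registered by its task name, so a single new dataset can be run in isolation, following the README's usage (the checkpoint is again an illustrative choice):

```python
from mteb import MTEB
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-small-zh")  # example checkpoint
evaluation = MTEB(tasks=["TNews"])  # matches the task's 'name' field
evaluation.run(model, output_folder="results/tnews")
```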
1 change: 1 addition & 0 deletions mteb/tasks/Classification/__init__.py
@@ -20,4 +20,5 @@
 from .SweRecClassification import *
 from .ToxicConversationsClassification import *
 from .TweetSentimentExtractionClassification import *
+from .CMTEBClassification import *
 from .PolishClassification import *
71 changes: 71 additions & 0 deletions mteb/tasks/Clustering/CMTEBClustering.py
@@ -0,0 +1,71 @@
from ...abstasks.AbsTaskClustering import AbsTaskClustering


class CLSClusteringS2S(AbsTaskClustering):
    @property
    def description(self):
        return {
            "name": "CLSClusteringS2S",
            "hf_hub_name": "C-MTEB/CLSClusteringS2S",
            "description": (
                "Clustering of titles from the CLS dataset. Clustering of 13 sets, based on the main category."
            ),
            "reference": "https://arxiv.org/abs/2209.05034",
            "type": "Clustering",
            "category": "s2s",
            "eval_splits": ["test"],
            "eval_langs": ["zh"],
            "main_score": "v_measure",
        }


class CLSClusteringP2P(AbsTaskClustering):
    @property
    def description(self):
        return {
            "name": "CLSClusteringP2P",
            "hf_hub_name": "C-MTEB/CLSClusteringP2P",
            "description": (
                "Clustering of titles + abstracts from the CLS dataset. Clustering of 13 sets, based on the main category."
            ),
            "reference": "https://arxiv.org/abs/2209.05034",
            "type": "Clustering",
            "category": "p2p",
            "eval_splits": ["test"],
            "eval_langs": ["zh"],
            "main_score": "v_measure",
        }


class ThuNewsClusteringS2S(AbsTaskClustering):
    @property
    def description(self):
        return {
            'name': 'ThuNewsClusteringS2S',
            'hf_hub_name': 'C-MTEB/ThuNewsClusteringS2S',
            'description': 'Clustering of titles from the THUCNews dataset',
            "reference": "http://thuctc.thunlp.org/",
            "type": "Clustering",
            "category": "s2s",
            "eval_splits": ["test"],
            "eval_langs": ["zh"],
            "main_score": "v_measure",
        }


class ThuNewsClusteringP2P(AbsTaskClustering):
    @property
    def description(self):
        return {
            'name': 'ThuNewsClusteringP2P',
            'hf_hub_name': 'C-MTEB/ThuNewsClusteringP2P',
            'description': 'Clustering of titles + abstracts from the THUCNews dataset',
            "reference": "http://thuctc.thunlp.org/",
            "type": "Clustering",
            "category": "p2p",
            "eval_splits": ["test"],
            "eval_langs": ["zh"],
            "main_score": "v_measure",
        }
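All four clustering tasks report `v_measure` as their main score, which compares two labelings while ignoring the cluster IDs themselves. A quick scikit-learn illustration of that invariance (an aside, not the evaluator's actual code path):

```python
from sklearn.metrics import v_measure_score

gold = [0, 0, 1, 1]  # true categories
pred = [1, 1, 0, 0]  # same partition with permuted cluster IDs
print(v_measure_score(gold, pred))  # 1.0: V-measure is permutation-invariant
```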
1 change: 1 addition & 0 deletions mteb/tasks/Clustering/__init__.py
@@ -13,4 +13,5 @@
 from .TenKGnadClusteringP2P import *
 from .TenKGnadClusteringS2S import *
 from .TwentyNewsgroupsClustering import *
+from .CMTEBClustering import *
 from .PolishClustering import *
33 changes: 33 additions & 0 deletions mteb/tasks/PairClassification/CMTEBPairClassification.py
@@ -0,0 +1,33 @@
from ...abstasks.AbsTaskPairClassification import AbsTaskPairClassification


class Ocnli(AbsTaskPairClassification):
    @property
    def description(self):
        return {
            'name': 'Ocnli',
            "hf_hub_name": "C-MTEB/OCNLI",
            'description': 'Original Chinese Natural Language Inference dataset',
            "reference": "https://arxiv.org/abs/2010.05444",
            'category': 's2s',
            'type': 'PairClassification',
            'eval_splits': ['validation'],
            'eval_langs': ['zh'],
            'main_score': 'ap',
        }


class Cmnli(AbsTaskPairClassification):
    @property
    def description(self):
        return {
            'name': 'Cmnli',
            "hf_hub_name": "C-MTEB/CMNLI",
            'description': 'Chinese Multi-Genre NLI',
            "reference": "https://huggingface.co/datasets/clue/viewer/cmnli",
            'category': 's2s',
            'type': 'PairClassification',
            'eval_splits': ['validation'],
            'eval_langs': ['zh'],
            'main_score': 'ap',
        }
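Both pair-classification tasks use `ap` (average precision) as the main score: a measure of how well a similarity score ranks true pairs above false ones. A small sketch of the metric with made-up cosine similarities (not the AbsTask's actual pipeline):

```python
import numpy as np
from sklearn.metrics import average_precision_score

labels = np.array([1, 0, 1, 1, 0])                   # gold pair labels
cos_sims = np.array([0.82, 0.71, 0.77, 0.44, 0.45])  # made-up model similarities
print(average_precision_score(labels, cos_sims))     # ~0.87
```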
1 change: 1 addition & 0 deletions mteb/tasks/PairClassification/__init__.py
@@ -1,4 +1,5 @@
 from .SprintDuplicateQuestionsPC import *
 from .TwitterSemEval2015PC import *
 from .TwitterURLCorpusPC import *
+from .CMTEBPairClassification import *
 from .PolishPC import *
65 changes: 65 additions & 0 deletions mteb/tasks/Reranking/CMTEBReranking.py
@@ -0,0 +1,65 @@
from ...abstasks.AbsTaskReranking import AbsTaskReranking


class T2Reranking(AbsTaskReranking):
    @property
    def description(self):
        return {
            'name': 'T2Reranking',
            'hf_hub_name': "C-MTEB/T2Reranking",
            'description': 'T2Ranking: A large-scale Chinese Benchmark for Passage Ranking',
            "reference": "https://arxiv.org/abs/2304.03679",
            'type': 'Reranking',
            'category': 's2p',
            'eval_splits': ['dev'],
            'eval_langs': ['zh'],
            'main_score': 'map',
        }


class MMarcoReranking(AbsTaskReranking):
    @property
    def description(self):
        return {
            'name': 'MMarcoReranking',
            'hf_hub_name': "C-MTEB/Mmarco-reranking",
            'description': 'mMARCO is a multilingual version of the MS MARCO passage ranking dataset',
            "reference": "https://github.com/unicamp-dl/mMARCO",
            'type': 'Reranking',
            'category': 's2p',
            'eval_splits': ['dev'],
            'eval_langs': ['zh'],
            'main_score': 'map',
        }


class CMedQAv1(AbsTaskReranking):
    @property
    def description(self):
        return {
            'name': 'CMedQAv1',
            "hf_hub_name": "C-MTEB/CMedQAv1-reranking",
            'description': 'Chinese community medical question answering',
            "reference": "https://github.com/zhangsheng93/cMedQA",
            'type': 'Reranking',
            'category': 's2p',
            'eval_splits': ['test'],
            'eval_langs': ['zh'],
            'main_score': 'map',
        }


class CMedQAv2(AbsTaskReranking):
    @property
    def description(self):
        return {
            'name': 'CMedQAv2',
            "hf_hub_name": "C-MTEB/CMedQAv2-reranking",
            'description': 'Chinese community medical question answering',
            "reference": "https://github.com/zhangsheng93/cMedQA2",
            'type': 'Reranking',
            'category': 's2p',
            'eval_splits': ['test'],
            'eval_langs': ['zh'],
            'main_score': 'map',
        }
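All four reranking datasets use `map` as the main score: average precision computed per query over its positive and negative documents, then averaged across queries. A self-contained sketch of per-query AP (illustrative; RerankingEvaluator's implementation differs in detail):

```python
import numpy as np


def average_precision(is_relevant, scores):
    """AP for one query: mean precision at each rank where a relevant doc appears."""
    order = np.argsort(-np.asarray(scores, dtype=float))
    rel = np.asarray(is_relevant, dtype=bool)[order]
    hits = np.cumsum(rel)
    ranks = np.flatnonzero(rel) + 1  # 1-based ranks of the relevant docs
    return float((hits[rel] / ranks).mean())


# Two positives and two negatives, scored by a hypothetical model:
print(average_precision([True, False, True, False], [0.9, 0.8, 0.3, 0.1]))  # (1/1 + 2/3) / 2 ≈ 0.83
```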
1 change: 1 addition & 0 deletions mteb/tasks/Reranking/__init__.py
@@ -2,3 +2,4 @@
 from .MindSmallReranking import *
 from .SciDocsReranking import *
 from .StackOverflowDupQuestions import *
+from .CMTEBReranking import *