Add Chinese tasks (C-MTEB) (#134)
* add C_MTEB

* rename MMarcoReranking

* Update mteb/tasks/Retrieval/CMTEBRetrieval.py

* Update README.md

* Allow custom encode functions

---------

Co-authored-by: shitao <stxiao@bupt.edu.cn>
Co-authored-by: Nouamane Tazi <nouamane98@gmail.com>
Co-authored-by: Niklas Muennighoff <n.muennighoff@gmail.com>
4 people authored Aug 26, 2023
1 parent 2779344 commit 071974a
Showing 16 changed files with 652 additions and 12 deletions.
5 changes: 4 additions & 1 deletion README.md
@@ -181,7 +181,8 @@ evaluation.run(model)
 ## Leaderboard
 
 The MTEB Leaderboard is available [here](https://huggingface.co/spaces/mteb/leaderboard). To submit:
-1. Run on MTEB: You can reference [scripts/run_mteb_english.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_english.py) for all MTEB English datasets used in the main ranking. Advanced scripts with different models are available in the [mteb/mtebscripts repo](https://github.com/embeddings-benchmark/mtebscripts).
+1. Run on MTEB: You can reference [scripts/run_mteb_english.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_english.py) for all MTEB English datasets used in the main ranking, or [scripts/run_mteb_chinese.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_chinese.py) for the Chinese ones.
+Advanced scripts with different models are available in the [mteb/mtebscripts repo](https://github.com/embeddings-benchmark/mtebscripts).
 2. Format the json files into metadata using the script at `scripts/mteb_meta.py`. For example
 `python scripts/mteb_meta.py path_to_results_folder`, which will create a `mteb_metadata.md` file. If you ran CQADupstack retrieval, make sure to merge the results first with `python scripts/merge_cqadupstack.py path_to_results_folder`.
 3. Copy the content of the `mteb_metadata.md` file to the top of a `README.md` file of your model on the Hub. See [here](https://huggingface.co/Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit/blob/main/README.md) for an example.
@@ -304,6 +305,8 @@ The MTEB Leaderboard is available [here](https://huggingface.co/spaces/mteb/lead
 | [CDSC-R](https://aclanthology.org/P17-1073.pdf) | [PL-MTEB/cdscr-sts](https://huggingface.co/datasets/PL-MTEB/cdscr-sts) | Compositional Distributional Semantics Corpus for textual relatedness. | STS | s2s | 1 | 16000 | 2000 | 2000 | 72.1 | 73.2 | 75.0 |
 | [SummEval](https://github.com/Yale-LILY/SummEval) | [mteb/summeval](https://huggingface.co/datasets/mteb/summeval) | News Article Summary Semantic Similarity Estimation. | Summarization | s2s | 1 | 0 | 0 | 2800 | 0 | 0 | 359.8 |
 
+For Chinese tasks, you can refer to [C_MTEB](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB).
+
 ## Citation
 
 If you find MTEB useful, feel free to cite our publication [MTEB: Massive Text Embedding Benchmark](https://arxiv.org/abs/2210.07316):
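Besides the new `scripts/run_mteb_chinese.py` entry point, the Chinese tasks can also be selected through the library's language filter, since every task added in this commit declares `'eval_langs': ['zh']`. A minimal sketch following the README's `evaluation.run(model)` pattern; the checkpoint name is only an illustrative choice, not something the commit prescribes:

```python
from mteb import MTEB
from sentence_transformers import SentenceTransformer

# Example checkpoint; any model exposing an `encode` method works.
model = SentenceTransformer("BAAI/bge-small-zh")

# Filter tasks by language to pick up the C-MTEB tasks.
evaluation = MTEB(task_langs=["zh"])
evaluation.run(model, output_folder="results/bge-small-zh")
```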
1 change: 1 addition & 0 deletions mteb/evaluation/MTEB.py
@@ -65,6 +65,7 @@ def __init__(
         else:
             self._task_types = task_types
         self._task_categories = task_categories
+        self._tasks = None
 
         self._task_langs = task_langs if task_langs is not None else []
         if type(self._task_langs) is str:
28 changes: 19 additions & 9 deletions mteb/evaluation/evaluators/RerankingEvaluator.py
@@ -71,17 +71,22 @@ def compute_metrics_batched(self, model):
         all_mrr_scores = []
         all_ap_scores = []
 
+        # Use the model's encode_queries and encode_corpus functions if they exist;
+        # users can define them to conveniently add different instructions for queries and passages.
+        encode_queries_func = model.encode_queries if hasattr(model, 'encode_queries') else model.encode
+        encode_corpus_func = model.encode_corpus if hasattr(model, 'encode_corpus') else model.encode
+
         logger.info("Encoding queries...")
         if isinstance(self.samples[0]["query"], str):
-            all_query_embs = model.encode(
-                [sample["query"] for sample in self.samples],
-                convert_to_tensor=True,
-                batch_size=self.batch_size,
-            )
+            all_query_embs = encode_queries_func(
+                [sample["query"] for sample in self.samples],
+                convert_to_tensor=True,
+                batch_size=self.batch_size,
+            )
         elif isinstance(self.samples[0]["query"], list):
             # In case the query is a list of strings, we get the most similar embedding to any of the queries
             all_query_flattened = [q for sample in self.samples for q in sample["query"]]
-            all_query_embs = model.encode(all_query_flattened, convert_to_tensor=True, batch_size=self.batch_size)
+            all_query_embs = encode_queries_func(all_query_flattened, convert_to_tensor=True, batch_size=self.batch_size)
         else:
             raise ValueError(f"Query must be a string or a list of strings but is {type(self.samples[0]['query'])}")

@@ -91,7 +96,7 @@ def compute_metrics_batched(self, model):
             all_docs.extend(sample["positive"])
             all_docs.extend(sample["negative"])
 
-        all_docs_embs = model.encode(all_docs, convert_to_tensor=True, batch_size=self.batch_size)
+        all_docs_embs = encode_corpus_func(all_docs, convert_to_tensor=True, batch_size=self.batch_size)
 
         # Compute scores
         logger.info("Evaluating...")
@@ -130,6 +135,11 @@ def compute_metrics_individual(self, model):
         all_mrr_scores = []
         all_ap_scores = []
 
+        # Use the model's encode_queries and encode_corpus functions if they exist;
+        # users can define them to conveniently add different instructions for queries and passages.
+        encode_queries_func = model.encode_queries if hasattr(model, 'encode_queries') else model.encode
+        encode_corpus_func = model.encode_corpus if hasattr(model, 'encode_corpus') else model.encode
+
         for instance in tqdm.tqdm(self.samples, desc="Samples"):
             query = instance["query"]
             positive = list(instance["positive"])
@@ -141,8 +151,8 @@ def compute_metrics_individual(self, model):
             docs = positive + negative
             is_relevant = [True] * len(positive) + [False] * len(negative)
 
-            query_emb = model.encode([query], convert_to_tensor=True, batch_size=self.batch_size)
-            docs_emb = model.encode(docs, convert_to_tensor=True, batch_size=self.batch_size)
+            query_emb = encode_queries_func([query], convert_to_tensor=True, batch_size=self.batch_size)
+            docs_emb = encode_corpus_func(docs, convert_to_tensor=True, batch_size=self.batch_size)
 
             scores = self._compute_metrics_instance(query_emb, docs_emb, is_relevant)
             all_mrr_scores.append(scores["mrr"])
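The `hasattr` fallbacks above make the evaluator duck-typed: `encode` is always required, while `encode_queries` and `encode_corpus` are used only when a model defines them. A minimal sketch of a wrapper that exploits this; the checkpoint name and instruction prefix are illustrative assumptions, not part of the commit:

```python
from sentence_transformers import SentenceTransformer


class InstructedEncoder:
    """Wrapper whose query/corpus hooks RerankingEvaluator discovers via hasattr."""

    def __init__(self, name="BAAI/bge-base-zh"):  # example checkpoint
        self.model = SentenceTransformer(name)

    def encode(self, sentences, **kwargs):
        # Fallback for tasks that do not distinguish queries from passages.
        return self.model.encode(sentences, **kwargs)

    def encode_queries(self, queries, **kwargs):
        # Hypothetical retrieval instruction, prepended to queries only.
        return self.model.encode(["Represent this query: " + q for q in queries], **kwargs)

    def encode_corpus(self, corpus, **kwargs):
        return self.model.encode(corpus, **kwargs)
```

`kwargs` here receives the evaluator's `convert_to_tensor=True` and `batch_size=...`, which `SentenceTransformer.encode` accepts unchanged.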
101 changes: 101 additions & 0 deletions mteb/tasks/Classification/CMTEBClassification.py
@@ -0,0 +1,101 @@
from ...abstasks import AbsTaskClassification

class TNews(AbsTaskClassification):
    @property
    def description(self):
        return {
            'name': 'TNews',
            'hf_hub_name': 'C-MTEB/TNews-classification',
            'description': 'Short Text Classification for News',
            "reference": "https://www.cluebenchmarks.com/introduce.html",
            'type': 'Classification',
            'category': 's2s',
            'eval_splits': ['validation'],
            'eval_langs': ['zh'],
            'main_score': 'accuracy',
            'samples_per_label': 32,
        }


class IFlyTek(AbsTaskClassification):
    @property
    def description(self):
        return {
            'name': 'IFlyTek',
            'hf_hub_name': 'C-MTEB/IFlyTek-classification',
            'description': 'Long text classification of app descriptions',
            "reference": "https://www.cluebenchmarks.com/introduce.html",
            'type': 'Classification',
            'category': 's2s',
            'eval_splits': ['validation'],
            'eval_langs': ['zh'],
            'main_score': 'accuracy',
            'samples_per_label': 32,
            'n_experiments': 5
        }


class MultilingualSentiment(AbsTaskClassification):
    @property
    def description(self):
        return {
            'name': 'MultilingualSentiment',
            'hf_hub_name': 'C-MTEB/MultilingualSentiment-classification',
            'description': 'A collection of multilingual sentiment datasets grouped into 3 classes: positive, neutral, negative',
            "reference": "https://github.com/tyqiangz/multilingual-sentiment-datasets",
            'category': 's2s',
            'type': 'Classification',
            'eval_splits': ['validation'],
            'eval_langs': ['zh'],
            'main_score': 'accuracy',
            'samples_per_label': 32,
        }


class JDReview(AbsTaskClassification):
    @property
    def description(self):
        return {
            'name': 'JDReview',
            'hf_hub_name': 'C-MTEB/JDReview-classification',
            'description': 'User reviews of iPhones',
            'category': 's2s',
            'type': 'Classification',
            'eval_splits': ['test'],
            'eval_langs': ['zh'],
            'main_score': 'accuracy',
            'samples_per_label': 32,
        }


class OnlineShopping(AbsTaskClassification):
    @property
    def description(self):
        return {
            'name': 'OnlineShopping',
            'hf_hub_name': 'C-MTEB/OnlineShopping-classification',
            'description': 'Sentiment Analysis of User Reviews on Online Shopping Websites',
            'category': 's2s',
            'type': 'Classification',
            'eval_splits': ['test'],
            'eval_langs': ['zh'],
            'main_score': 'accuracy',
            'samples_per_label': 32,
        }


class Waimai(AbsTaskClassification):
    @property
    def description(self):
        return {
            'name': 'Waimai',
            'hf_hub_name': 'C-MTEB/waimai-classification',
            'description': 'Sentiment Analysis of user reviews on takeaway platforms',
            'category': 's2s',
            'type': 'Classification',
            'eval_splits': ['test'],
            'eval_langs': ['zh'],
            'main_score': 'accuracy',
            'samples_per_label': 32,
        }
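Each class above is registered by its task name, so a single new dataset can be run in isolation, following the README's usage (the checkpoint is again an illustrative choice):

```python
from mteb import MTEB
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-small-zh")  # example checkpoint
evaluation = MTEB(tasks=["TNews"])  # matches the task's 'name' field
evaluation.run(model, output_folder="results/tnews")
```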
1 change: 1 addition & 0 deletions mteb/tasks/Classification/__init__.py
@@ -20,4 +20,5 @@
 from .SweRecClassification import *
 from .ToxicConversationsClassification import *
 from .TweetSentimentExtractionClassification import *
+from .CMTEBClassification import *
 from .PolishClassification import *
71 changes: 71 additions & 0 deletions mteb/tasks/Clustering/CMTEBClustering.py
@@ -0,0 +1,71 @@
from ...abstasks.AbsTaskClustering import AbsTaskClustering


class CLSClusteringS2S(AbsTaskClustering):
    @property
    def description(self):
        return {
            "name": "CLSClusteringS2S",
            "hf_hub_name": "C-MTEB/CLSClusteringS2S",
            "description": (
                "Clustering of titles from the CLS dataset. Clustering of 13 sets, based on the main category."
            ),
            "reference": "https://arxiv.org/abs/2209.05034",
            "type": "Clustering",
            "category": "s2s",
            "eval_splits": ["test"],
            "eval_langs": ["zh"],
            "main_score": "v_measure",
        }


class CLSClusteringP2P(AbsTaskClustering):
    @property
    def description(self):
        return {
            "name": "CLSClusteringP2P",
            "hf_hub_name": "C-MTEB/CLSClusteringP2P",
            "description": (
                "Clustering of titles + abstracts from the CLS dataset. Clustering of 13 sets, based on the main category."
            ),
            "reference": "https://arxiv.org/abs/2209.05034",
            "type": "Clustering",
            "category": "p2p",
            "eval_splits": ["test"],
            "eval_langs": ["zh"],
            "main_score": "v_measure",
        }


class ThuNewsClusteringS2S(AbsTaskClustering):
    @property
    def description(self):
        return {
            'name': 'ThuNewsClusteringS2S',
            'hf_hub_name': 'C-MTEB/ThuNewsClusteringS2S',
            'description': 'Clustering of titles from the THUCNews dataset',
            "reference": "http://thuctc.thunlp.org/",
            "type": "Clustering",
            "category": "s2s",
            "eval_splits": ["test"],
            "eval_langs": ["zh"],
            "main_score": "v_measure",
        }


class ThuNewsClusteringP2P(AbsTaskClustering):
    @property
    def description(self):
        return {
            'name': 'ThuNewsClusteringP2P',
            'hf_hub_name': 'C-MTEB/ThuNewsClusteringP2P',
            'description': 'Clustering of titles + abstracts from the THUCNews dataset',
            "reference": "http://thuctc.thunlp.org/",
            "type": "Clustering",
            "category": "p2p",
            "eval_splits": ["test"],
            "eval_langs": ["zh"],
            "main_score": "v_measure",
        }
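All four clustering tasks report `v_measure` as their main score, which compares two labelings while ignoring the cluster IDs themselves. A quick scikit-learn illustration of that invariance (an aside, not the evaluator's actual code path):

```python
from sklearn.metrics import v_measure_score

gold = [0, 0, 1, 1]  # true categories
pred = [1, 1, 0, 0]  # same partition with permuted cluster IDs
print(v_measure_score(gold, pred))  # 1.0: V-measure is permutation-invariant
```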
1 change: 1 addition & 0 deletions mteb/tasks/Clustering/__init__.py
@@ -13,4 +13,5 @@
 from .TenKGnadClusteringP2P import *
 from .TenKGnadClusteringS2S import *
 from .TwentyNewsgroupsClustering import *
+from .CMTEBClustering import *
 from .PolishClustering import *
33 changes: 33 additions & 0 deletions mteb/tasks/PairClassification/CMTEBPairClassification.py
@@ -0,0 +1,33 @@
from ...abstasks.AbsTaskPairClassification import AbsTaskPairClassification


class Ocnli(AbsTaskPairClassification):
    @property
    def description(self):
        return {
            'name': 'Ocnli',
            "hf_hub_name": "C-MTEB/OCNLI",
            'description': 'Original Chinese Natural Language Inference dataset',
            "reference": "https://arxiv.org/abs/2010.05444",
            'category': 's2s',
            'type': 'PairClassification',
            'eval_splits': ['validation'],
            'eval_langs': ['zh'],
            'main_score': 'ap',
        }


class Cmnli(AbsTaskPairClassification):
    @property
    def description(self):
        return {
            'name': 'Cmnli',
            "hf_hub_name": "C-MTEB/CMNLI",
            'description': 'Chinese Multi-Genre NLI',
            "reference": "https://huggingface.co/datasets/clue/viewer/cmnli",
            'category': 's2s',
            'type': 'PairClassification',
            'eval_splits': ['validation'],
            'eval_langs': ['zh'],
            'main_score': 'ap',
        }
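Both pair-classification tasks use `ap` (average precision) as the main score: a measure of how well a similarity score ranks true pairs above false ones. A small sketch of the metric with made-up cosine similarities (not the AbsTask's actual pipeline):

```python
import numpy as np
from sklearn.metrics import average_precision_score

labels = np.array([1, 0, 1, 1, 0])                   # gold pair labels
cos_sims = np.array([0.82, 0.71, 0.77, 0.44, 0.45])  # made-up model similarities
print(average_precision_score(labels, cos_sims))     # ~0.87
```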
1 change: 1 addition & 0 deletions mteb/tasks/PairClassification/__init__.py
@@ -1,4 +1,5 @@
 from .SprintDuplicateQuestionsPC import *
 from .TwitterSemEval2015PC import *
 from .TwitterURLCorpusPC import *
+from .CMTEBPairClassification import *
 from .PolishPC import *
65 changes: 65 additions & 0 deletions mteb/tasks/Reranking/CMTEBReranking.py
@@ -0,0 +1,65 @@
from ...abstasks.AbsTaskReranking import AbsTaskReranking


class T2Reranking(AbsTaskReranking):
    @property
    def description(self):
        return {
            'name': 'T2Reranking',
            'hf_hub_name': "C-MTEB/T2Reranking",
            'description': 'T2Ranking: A large-scale Chinese Benchmark for Passage Ranking',
            "reference": "https://arxiv.org/abs/2304.03679",
            'type': 'Reranking',
            'category': 's2p',
            'eval_splits': ['dev'],
            'eval_langs': ['zh'],
            'main_score': 'map',
        }


class MMarcoReranking(AbsTaskReranking):
    @property
    def description(self):
        return {
            'name': 'MMarcoReranking',
            'hf_hub_name': "C-MTEB/Mmarco-reranking",
            'description': 'mMARCO is a multilingual version of the MS MARCO passage ranking dataset',
            "reference": "https://github.com/unicamp-dl/mMARCO",
            'type': 'Reranking',
            'category': 's2p',
            'eval_splits': ['dev'],
            'eval_langs': ['zh'],
            'main_score': 'map',
        }


class CMedQAv1(AbsTaskReranking):
    @property
    def description(self):
        return {
            'name': 'CMedQAv1',
            "hf_hub_name": "C-MTEB/CMedQAv1-reranking",
            'description': 'Chinese community medical question answering',
            "reference": "https://github.com/zhangsheng93/cMedQA",
            'type': 'Reranking',
            'category': 's2p',
            'eval_splits': ['test'],
            'eval_langs': ['zh'],
            'main_score': 'map',
        }


class CMedQAv2(AbsTaskReranking):
    @property
    def description(self):
        return {
            'name': 'CMedQAv2',
            "hf_hub_name": "C-MTEB/CMedQAv2-reranking",
            'description': 'Chinese community medical question answering',
            "reference": "https://github.com/zhangsheng93/cMedQA2",
            'type': 'Reranking',
            'category': 's2p',
            'eval_splits': ['test'],
            'eval_langs': ['zh'],
            'main_score': 'map',
        }
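All four reranking datasets use `map` as the main score: average precision computed per query over its positive and negative documents, then averaged across queries. A self-contained sketch of per-query AP (illustrative; RerankingEvaluator's implementation differs in detail):

```python
import numpy as np


def average_precision(is_relevant, scores):
    """AP for one query: mean precision at each rank where a relevant doc appears."""
    order = np.argsort(-np.asarray(scores, dtype=float))
    rel = np.asarray(is_relevant, dtype=bool)[order]
    hits = np.cumsum(rel)
    ranks = np.flatnonzero(rel) + 1  # 1-based ranks of the relevant docs
    return float((hits[rel] / ranks).mean())


# Two positives and two negatives, scored by a hypothetical model:
print(average_precision([True, False, True, False], [0.9, 0.8, 0.3, 0.1]))  # (1/1 + 2/3) / 2 ≈ 0.83
```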
1 change: 1 addition & 0 deletions mteb/tasks/Reranking/__init__.py
@@ -2,3 +2,4 @@
 from .MindSmallReranking import *
 from .SciDocsReranking import *
 from .StackOverflowDupQuestions import *
+from .CMTEBReranking import *