From 769157b1e49c97ee6ca334a299392392bc3a6523 Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen
Date: Wed, 20 Mar 2024 09:48:31 +0100
Subject: [PATCH 1/4] Restructuring the readme (#262)

* restructuring the readme

* removed double specification of versions and moved all setup to pyproject.toml

* correctly use flat-layout for the package
---
 .gitignore                                 |   3 +
 README.md                                  | 193 ++----------------
 docs/adding_a_model.md                     |  11 +
 docs/contributing.md                       |  29 +++
 {images => docs/images}/hf_logo.png        | Bin
 .../mteb_logo/mteb_logo_tight_hfhub.drawio |   0
 .../mteb_logo/mteb_logo_tight_hfhub.png    | Bin
 .../mteb_logo/mteb_logo_transparent.png    | Bin
 .../mteb_logo/mteb_logo_wide_github.drawio |   0
 .../mteb_logo/mteb_logo_wide_github.png    | Bin
 docs/tasks.md                              | 136 ++++++++++++
 mteb/__init__.py                           |   6 +-
 mteb/evaluation/MTEB.py                    |  80 +++++---
 pyproject.toml                             | 121 ++++++++++-
 setup.cfg                                  |  54 -----
 setup.py                                   |  94 ---------
 tests/test_all_abstasks.py                 |  52 ++++-
 17 files changed, 424 insertions(+), 355 deletions(-)
 create mode 100644 docs/adding_a_model.md
 create mode 100644 docs/contributing.md
 rename {images => docs/images}/hf_logo.png (100%)
 rename {images => docs/images}/mteb_logo/mteb_logo_tight_hfhub.drawio (100%)
 rename {images => docs/images}/mteb_logo/mteb_logo_tight_hfhub.png (100%)
 rename {images => docs/images}/mteb_logo/mteb_logo_transparent.png (100%)
 rename {images => docs/images}/mteb_logo/mteb_logo_wide_github.drawio (100%)
 rename {images => docs/images}/mteb_logo/mteb_logo_wide_github.png (100%)
 create mode 100644 docs/tasks.md
 delete mode 100644 setup.cfg
 delete mode 100644 setup.py

diff --git a/.gitignore b/.gitignore
index 37b7564c1a..0575d7ecd1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -133,3 +133,6 @@ dmypy.json
 
 # error logs
 error_logs.txt
+
+# tests
+tests/results
diff --git a/README.md b/README.md
index 9de182ea22..fee7ab8da6 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@

- +

@@ -62,13 +62,17 @@ mteb -m sentence-transformers/all-MiniLM-L6-v2 \
     --verbosity 3
 
 # if nothing is specified default to saving the results in the results/{model_name} folder
-
-mteb -m sentence-transformers/all-MiniLM-L6-v2
 ```
 
 * Using multiple GPUs in parallel can be done by just having a custom encode function that distributes the inputs to multiple GPUs like e.g. [here](https://github.com/microsoft/unilm/blob/b60c741f746877293bb85eed6806736fc8fa0ffd/e5/mteb_eval.py#L60) or [here](https://github.com/ContextualAI/gritlm/blob/09d8630f0c95ac6a456354bcb6f964d7b9b6a609/gritlm/gritlm.py#L75); a minimal sketch is also included below.
 
-## Advanced usage
+<details>
+  <summary> Advanced Usage (click to unfold) </summary>
+
+
+## Advanced Usage
 
 ### Dataset selection
 
@@ -206,49 +210,25 @@ evaluation.run(model)
 
 > **Note:** for multilingual tasks, make sure your class also inherits from the `MultilingualTask` class like in [this](https://github.com/embeddings-benchmark/mteb-draft/blob/main/mteb/tasks/Classification/MTOPIntentClassification.py) example.
 
-## Leaderboard
-
-The MTEB Leaderboard is available [here](https://huggingface.co/spaces/mteb/leaderboard). To submit:
-1. Run on MTEB: You can reference [scripts/run_mteb_english.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_english.py) for all MTEB English datasets used in the main ranking, or [scripts/run_mteb_chinese.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_chinese.py) for the Chinese ones.
-Advanced scripts with different models are available in the [mteb/mtebscripts repo](https://github.com/embeddings-benchmark/mtebscripts).
-2. Format the json files into metadata using the script at `scripts/mteb_meta.py`. For example
-`python scripts/mteb_meta.py path_to_results_folder`, which will create a `mteb_metadata.md` file. If you ran CQADupstack retrieval, make sure to merge the results first with `python scripts/merge_cqadupstack.py path_to_results_folder`.
-3. Copy the content of the `mteb_metadata.md` file to the top of a `README.md` file of your model on the Hub. See [here](https://huggingface.co/Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit/blob/main/README.md) for an example.
-4. Hit the Refresh button at the bottom of the leaderboard and you should see your scores 🥇
-5. To have the scores appear without refreshing, you can open an issue on the [Community Tab of the LB](https://huggingface.co/spaces/mteb/leaderboard/discussions) and someone will restart the space to cache your average scores. The cache is updated anyways ~1x/week.
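To make the dataset-selection workflow in this hunk concrete, here is a minimal runnable sketch; the task name `Banking77Classification` and the `output_folder` value are illustrative choices, not part of this patch:

```python
from mteb import MTEB
from sentence_transformers import SentenceTransformer

# any model exposing an `encode(sentences, **kwargs)` method works
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# run a subset of the benchmark instead of every task
evaluation = MTEB(tasks=["Banking77Classification"])
evaluation.run(model, output_folder="results/all-MiniLM-L6-v2")
```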
+
+</details>
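The multi-GPU bullet above leaves the custom `encode` function to the linked examples. Below is a minimal sketch of one possible wrapper built on sentence-transformers' multi-process pool; the `MultiGPUEncoder` class name and the device list are hypothetical, and the approach assumes a SentenceTransformer-compatible model:

```python
from sentence_transformers import SentenceTransformer


class MultiGPUEncoder:
    """Hypothetical wrapper that fans `encode` calls out over several GPUs."""

    def __init__(self, model_name: str, devices: list[str]):
        self.model = SentenceTransformer(model_name)
        # starts one worker process per listed device
        self.pool = self.model.start_multi_process_pool(target_devices=devices)

    def encode(self, sentences: list[str], batch_size: int = 64, **kwargs):
        # chunks the input, sends the chunks to the workers, gathers the embeddings
        return self.model.encode_multi_process(sentences, self.pool, batch_size=batch_size)

    def close(self):
        # shut the worker processes down once evaluation is finished
        self.model.stop_multi_process_pool(self.pool)


model = MultiGPUEncoder("sentence-transformers/all-MiniLM-L6-v2", devices=["cuda:0", "cuda:1"])
```

MTEB only requires that the object passed to `evaluation.run` exposes an `encode(sentences, **kwargs)` method, so any distribution strategy can be hidden behind that interface.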
+## Documentation
 
-## Contributing to MTEB
-We welcome contributions such as new datasets to MTEB! This section describes how to set up the repository for development.
+| Documentation | |
+| ------------------------------ | ---------------------- |
+| 📋 [Tasks] | Overview of available tasks |
+| 📈 [Leaderboard] | The interactive leaderboard of the benchmark |
+| 🤖 [Adding a model] | Information related to how to submit a model to the leaderboard |
+| 🤝 [Contributing] | How to contribute to MTEB and set it up for development |
+
-### Development Installation
-If you want to submit a dataset or on other ways contribute to MTEB, you can install the package in development mode:
-
-```bash
-git clone https://github.com/embeddings-benchmark/mteb
-cd mteb
-
-# create your virtual environment and activate it
-make install
-```
-
-### Running Tests
-To run the tests, you can use the following command:
-
-```bash
-make test
-# or if you want to run on multiple cores
-make test-parallel
-```
-
-### Running linting
-To run the linting before a PR you can use the following command:
-
-```bash
-make lint
-```
+
+[Tasks]: docs/tasks.md
+[Contributing]: docs/contributing.md
+[Adding a model]: docs/adding_a_model.md
+[Leaderboard]: https://huggingface.co/spaces/mteb/leaderboard
 
 ## Citing
 
@@ -272,132 +252,3 @@ You may also want to read and cite the amazing work that has extended MTEB & int
 - Silvan Wehrli, Bert Arnrich, Christopher Irrgang. "[German Text Embedding Clustering Benchmark](https://arxiv.org/abs/2401.02709)" arXiv 2024
 
 For works that have used MTEB for benchmarking, you can find them on the [leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
-
-
-## Available tasks
-
-| Name | Hub URL | Description | Type | Category | #Languages | Train #Samples | Dev #Samples | Test #Samples | Avg. chars / train | Avg. chars / dev | Avg. chars / test |
-| :--- | :--- | :--- | :--- | :--- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
-| [BUCC](https://comparable.limsi.fr/bucc2018/bucc2018-task.html) | [mteb/bucc-bitext-mining](https://huggingface.co/datasets/mteb/bucc-bitext-mining) | BUCC bitext mining dataset | BitextMining | s2s | 4 | 0 | 0 | 641684 | 0 | 0 | 101.3 |
-| [Tatoeba](https://github.com/facebookresearch/LASER/tree/main/data/tatoeba/v1) | [mteb/tatoeba-bitext-mining](https://huggingface.co/datasets/mteb/tatoeba-bitext-mining) | 1,000 English-aligned sentence pairs for each language based on the Tatoeba corpus | BitextMining | s2s | 112 | 0 | 0 | 2000 | 0 | 0 | 39.4 |
-| [Bornholm parallel](https://aclanthology.org/W19-6138/) | [strombergnlp/bornholmsk_parallel](https://huggingface.co/datasets/strombergnlp/bornholmsk_parallel) | Danish Bornholmsk Parallel Corpus. | BitextMining | s2s | 2 | 100 | 100 | 100 | 64.6 | 86.2 | 89.7 |
-| [DiaBLaBitextMining](https://inria.hal.science/hal-03021633) | [rbawden/DiaBLa](https://huggingface.co/datasets/rbawden/DiaBLa) | English-French Parallel Corpus. 
DiaBLa is an English-French dataset for the evaluation of Machine Translation (MT) for informal, written bilingual dialogue. | BitextMining | s2s | 1 | 5748 | 0 | 0 | 0 | 0 | 0 | -| [FloresBitextMining](https://huggingface.co/datasets/facebook/flores) | [facebook/flores](https://huggingface.co/datasets/facebook/flores) | FLORES is a benchmark dataset for machine translation between English and low-resource languages. | BitextMining | s2s | 200 | 0 | 997 | 1012 | 0 | 0 | 0 | -| [AmazonCounterfactualClassification](https://arxiv.org/abs/2104.06893) | [mteb/amazon_counterfactual](https://huggingface.co/datasets/mteb/amazon_counterfactual) | A collection of Amazon customer reviews annotated for counterfactual detection pair classification. | Classification | s2s | 4 | 4018 | 335 | 670 | 107.3 | 109.2 | 106.1 | -| [AmazonPolarityClassification](https://dl.acm.org/doi/10.1145/2507157.2507163) | [mteb/amazon_polarity](https://huggingface.co/datasets/mteb/amazon_polarity) | Amazon Polarity Classification Dataset. | Classification | s2s | 1 | 3600000 | 0 | 400000 | 431.6 | 0 | 431.4 | -| [AmazonReviewsClassification](https://arxiv.org/abs/2010.02573) | [mteb/amazon_reviews_multi](https://huggingface.co/datasets/mteb/amazon_reviews_multi) | A collection of Amazon reviews specifically designed to aid research in multilingual text classification. | Classification | s2s | 6 | 1200000 | 30000 | 30000 | 160.5 | 159.2 | 160.4 | -| [MasakhaNEWSClassification](https://arxiv.org/abs/2304.09972) | [masakhane/masakhanews](https://huggingface.co/datasets/masakhane/masakhanews) | MasakhaNEWS is the largest publicly available dataset for news topic classification in 16 languages widely spoken in Africa. The train/validation/test sets are available for all the 16 languages. | Classification | s2s | 16 | 1476 | 211 | 422 | 5064.8 | 4756.1 | 5116.6 | -| [Banking77Classification](https://arxiv.org/abs/2003.04807) | [mteb/banking77](https://huggingface.co/datasets/mteb/banking77) | Dataset composed of online banking queries annotated with their corresponding intents. | Classification | s2s | 1 | 10003 | 0 | 3080 | 59.5 | 0 | 54.2 | -| [EmotionClassification](https://www.aclweb.org/anthology/D18-1404) | [mteb/emotion](https://huggingface.co/datasets/mteb/emotion) | Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. For more detailed information please refer to the paper. | Classification | s2s | 1 | 16000 | 2000 | 2000 | 96.8 | 95.3 | 96.6 | -| [ImdbClassification](http://www.aclweb.org/anthology/P11-1015) | [mteb/imdb](https://huggingface.co/datasets/mteb/imdb) | Large Movie Review Dataset | Classification | p2p | 1 | 25000 | 0 | 25000 | 1325.1 | 0 | 1293.8 | -| [MassiveIntentClassification](https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.) | [mteb/amazon_massive_intent](https://huggingface.co/datasets/mteb/amazon_massive_intent) | MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages | Classification | s2s | 51 | 11514 | 2033 | 2974 | 35.0 | 34.8 | 34.6 | -| [MassiveScenarioClassification](https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.) 
| [mteb/amazon_massive_scenario](https://huggingface.co/datasets/mteb/amazon_massive_scenario) | MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages | Classification | s2s | 51 | 11514 | 2033 | 2974 | 35.0 | 34.8 | 34.6 | -| [MTOPDomainClassification](https://arxiv.org/pdf/2008.09335.pdf) | [mteb/mtop_domain](https://huggingface.co/datasets/mteb/mtop_domain) | MTOP: Multilingual Task-Oriented Semantic Parsing | Classification | s2s | 6 | 15667 | 2235 | 4386 | 36.6 | 36.5 | 36.8 | -| [MTOPIntentClassification](https://arxiv.org/pdf/2008.09335.pdf) | [mteb/mtop_intent](https://huggingface.co/datasets/mteb/mtop_intent) | MTOP: Multilingual Task-Oriented Semantic Parsing | Classification | s2s | 6 | 15667 | 2235 | 4386 | 36.6 | 36.5 | 36.8 | -| [ToxicConversationsClassification](https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview) | [mteb/toxic_conversations_50k](https://huggingface.co/datasets/mteb/toxic_conversations_50k) | Collection of comments from the Civil Comments platform together with annotations if the comment is toxic or not. | Classification | s2s | 1 | 50000 | 0 | 50000 | 298.8 | 0 | 296.6 | -| [TweetSentimentExtractionClassification](https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview) | [mteb/tweet_sentiment_extraction](https://huggingface.co/datasets/mteb/tweet_sentiment_extraction) | | Classification | s2s | 1 | 27481 | 0 | 3534 | 68.3 | 0 | 67.8 | -| [AngryTweetsClassification](https://aclanthology.org/2021.nodalida-main.53/) | [mteb/DDSC/angry-tweets](https://huggingface.co/datasets/DDSC/angry-tweets) | A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets | Classification | s2s | 1 | 2410 | 0 | 1050 | 153.0 | 0 | 156.1 | -| [DKHateClassification](https://aclanthology.org/2020.lrec-1.430/) | [DDSC/dkhate](https://huggingface.co/datasets/DDSC/dkhate) | Danish Tweets annotated for Hate Speech | Classification | s2s | 1 | 2960 | 0 | 329 | 88.2 | 0 | 104.0 | -| [DalajClassification](https://spraakbanken.gu.se/en/resources/dalaj-1-0) | [AI-Sweden/SuperLim](https://huggingface.co/datasets/AI-Sweden/SuperLim) | A Swedish dataset for linguistic accebtablity. Available as a part of Superlim | Classification | s2s | 1 | 3840 | 445 | 444 | 243.7 | 242.5 | 243.8 | -| [DanishPoliticalCommentsClassification](https://huggingface.co/datasets/danish_political_comments) | [danish_political_comments](https://huggingface.co/datasets/danish_political_comments) | A dataset of Danish political comments rated for sentiment | Classification | s2s | 1 | 9010 | 0 | 0 | 69.9 | 0 | 0 | -| [LccClassification](https://github.com/fnielsen/lcc-sentiment) | [DDSC/lcc](https://huggingface.co/datasets/DDSC/lcc) | The leipzig corpora collection, annotated for sentiment | Classification | s2s | 1 | 349 | 0 | 150 | 113.5 | 0 | 118.7 | -| [NoRecClassification](https://aclanthology.org/L18-1661/) | [ScandEval/norec-mini](https://huggingface.co/datasets/ScandEval/norec-mini) | A Norwegian dataset for sentiment classification on review | Classification | s2s | 1 | 1020 | 256 | 2050 | 86.9 | 89.6 | 82.0 | -| [NordicLangClassification](https://aclanthology.org/2021.vardial-1.8/) | [strombergnlp/nordic_langid](https://huggingface.co/datasets/strombergnlp/nordic_langid) | A dataset for Nordic language identification. 
| Classification | s2s | 6 | 57000 | 0 | 3000 | 78.4 | 0 | 78.2 | -| [NorwegianParliamentClassification](https://huggingface.co/datasets/NbAiLab/norwegian_parliament) | [NbAiLab/norwegian_parliament](https://huggingface.co/datasets/NbAiLab/norwegian_parliament) | Norwegian parliament speeches annotated for sentiment | Classification | s2s | 1 | 3600 | 1200 | 1200 | 1773.6 | 1911.0 | 1884.0 | -| [ScalaDaClassification](https://aclanthology.org/2023.nodalida-1.20/) | [ScandEval/scala-da](https://huggingface.co/datasets/ScandEval/scala-da) | A modified version of DDT modified for linguistic acceptability classification | Classification | s2s | 1 | 1024 | 256 | 2048 | 107.6 | 100.8 | 109.4 | -| [ScalaNbClassification](https://aclanthology.org/2023.nodalida-1.20/) | [ScandEval/scala-nb](https://huggingface.co/datasets/ScandEval/scala-nb) | A Norwegian dataset for linguistic acceptability classification for Bokmål | Classification | s2s | 1 | 1024 | 256 | 2048 | 95.5 | 94.8 | 98.4 | -| [ScalaNnClassification](https://aclanthology.org/2023.nodalida-1.20/) | [ScandEval/scala-nn](https://huggingface.co/datasets/ScandEval/scala-nn) | A Norwegian dataset for linguistic acceptability classification for Nynorsk | Classification | s2s | 1 | 1024 | 256 | 2048 | 105.3 | 103.5 | 104.8 | -| [ScalaSvClassification](https://aclanthology.org/2023.nodalida-1.20/) | [ScandEval/scala-sv](https://huggingface.co/datasets/ScandEval/scala-sv) | A Swedish dataset for linguistic acceptability classification | Classification | s2s | 1 | 1024 | 256 | 2048 | 102.6 | 113.0 | 98.3 | -| [SweRecClassificition](https://aclanthology.org/2023.nodalida-1.20/) | [ScandEval/swerec-mini](https://huggingface.co/datasets/ScandEval/swerec-mini) | A Swedish dataset for sentiment classification on reviews | Classification | s2s | 1 | 1024 | 256 | 2048 | 317.7 | 293.4 | 318.8 | -| [CBD](http://2019.poleval.pl/files/poleval2019.pdf) | [PL-MTEB/cbd](https://huggingface.co/datasets/PL-MTEB/cbd) | Polish Tweets annotated for cyberbullying detection. | Classification | s2s | 1 | 10041 | 0 | 1000 | 93.6 | 0 | 93.2 | -| [PolEmo2.0-IN](https://aclanthology.org/K19-1092.pdf) | [PL-MTEB/polemo2_in](https://huggingface.co/datasets/PL-MTEB/polemo2_in) | A collection of Polish online reviews from four domains: medicine, hotels, products and school. The PolEmo2.0-IN task is to predict the sentiment of in-domain (medicine and hotels) reviews. | Classification | s2s | 1 | 5783 | 723 | 722 | 780.6 | 769.4 | 756.2 | -| [PolEmo2.0-OUT](https://aclanthology.org/K19-1092.pdf) | [PL-MTEB/polemo2_out](https://huggingface.co/datasets/PL-MTEB/polemo2_out) | A collection of Polish online reviews from four domains: medicine, hotels, products and school. The PolEmo2.0-OUT task is to predict the sentiment of out-of-domain (products and school) reviews using models train on reviews from medicine and hotels domains. | Classification | s2s | 1 | 5783 | 494 | 494 | 780.6 | 589.3 | 587.0 | -| [AllegroReviews](https://aclanthology.org/2020.acl-main.111.pdf) | [PL-MTEB/allegro-reviews](https://huggingface.co/datasets/PL-MTEB/allegro-reviews) | A Polish dataset for sentiment classification on reviews from e-commerce marketplace Allegro. 
| Classification | s2s | 1 | 9577 | 1002 | 1006 | 477.9 | 480.9 | 477.2 | -| [PAC](https://arxiv.org/pdf/2211.13112.pdf) | [laugustyniak/abusive-clauses-pl](https://huggingface.co/datasets/laugustyniak/abusive-clauses-pl) | Polish Abusive Clauses Dataset | Classification | s2s | 1 | 4284 | 1519 | 3453 | 185.3 | 256.8 | 185.3 | -| [AlloProfClusteringP2P](https://huggingface.co/datasets/lyon-nlp/alloprof) | [lyon-nlp/alloprof](https://huggingface.co/datasets/lyon-nlp/alloprof) | Clustering of document titles and descriptions from Allo Prof dataset. Clustering of 10 sets on the document topic. | Clustering | p2p | 1 | 2798 | 0 | 0 | 0 | 0 | 0 | -| [AlloProfClusteringS2S](https://huggingface.co/datasets/lyon-nlp/alloprof) | [lyon-nlp/alloprof](https://huggingface.co/datasets/lyon-nlp/alloprof) | Clustering of document titles from Allo Prof dataset. Clustering of 10 sets on the document topic. | Clustering | s2s | 1 | 2798 | 0 | 0 | 0 | 0 | 0 | -| [ArxivClusteringP2P](https://www.kaggle.com/Cornell-University/arxiv) | [mteb/arxiv-clustering-p2p](https://huggingface.co/datasets/mteb/arxiv-clustering-p2p) | Clustering of titles+abstract from arxiv. Clustering of 30 sets, either on the main or secondary category | Clustering | p2p | 1 | 0 | 0 | 732723 | 0 | 0 | 1009.9 | -| [ArxivClusteringS2S](https://www.kaggle.com/Cornell-University/arxiv) | [mteb/arxiv-clustering-s2s](https://huggingface.co/datasets/mteb/arxiv-clustering-s2s) | Clustering of titles from arxiv. Clustering of 30 sets, either on the main or secondary category | Clustering | s2s | 1 | 0 | 0 | 732723 | 0 | 0 | 74.0 | -| [BiorxivClusteringP2P](https://api.biorxiv.org/) | [mteb/biorxiv-clustering-p2p](https://huggingface.co/datasets/mteb/biorxiv-clustering-p2p) | Clustering of titles+abstract from biorxiv. Clustering of 10 sets, based on the main category. | Clustering | p2p | 1 | 0 | 0 | 75000 | 0 | 0 | 1666.2 | -| [BiorxivClusteringS2S](https://api.biorxiv.org/) | [mteb/biorxiv-clustering-s2s](https://huggingface.co/datasets/mteb/biorxiv-clustering-s2s) | Clustering of titles from biorxiv. Clustering of 10 sets, based on the main category. | Clustering | s2s | 1 | 0 | 0 | 75000 | 0 | 0 | 101.6 | -| [BlurbsClusteringP2P](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html) | [slvnwhrl/blurbs-clustering-p2p](https://huggingface.co/datasets/slvnwhrl/blurbs-clustering-p2p) | Clustering of book titles+blurbs. Clustering of 28 sets, either on the main or secondary genre | Clustering | p2p | 1 | 0 | 0 | 174637 | 0 | 0 | 664.09 | -| [BlurbsClusteringS2S](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html) | [slvnwhrl/blurbs-clustering-s2s](https://huggingface.co/datasets/slvnwhrl/blurbs-clustering-s2s) | Clustering of book titles. Clustering of 28 sets, either on the main or secondary genre. | Clustering | s2s | 1 | 0 | 0 | 174637 | 0 | 0 | 23.02 | -| [HALClusteringS2S](https://huggingface.co/datasets/lyon-nlp/clustering-hal-s2s) | [lyon-nlp/clustering-hal-s2s](https://huggingface.co/datasets/lyon-nlp/clustering-hal-s2s) | Clustering of titles from HAL. Clustering of 10 sets on the main category. | Clustering | s2s | 1 | 85375 | 0 | 0 | 0 | 0 | 0 | -| [MedrxivClusteringP2P](https://api.medrxiv.org/) | [mteb/medrxiv-clustering-p2p](https://huggingface.co/datasets/mteb/medrxiv-clustering-p2p) | Clustering of titles+abstract from medrxiv. Clustering of 10 sets, based on the main category. 
| Clustering | p2p | 1 | 0 | 0 | 37500 | 0 | 0 | 1981.2 | -| [MedrxivClusteringS2S](https://api.medrxiv.org/) | [mteb/medrxiv-clustering-s2s](https://huggingface.co/datasets/mteb/medrxiv-clustering-s2s) | Clustering of titles from medrxiv. Clustering of 10 sets, based on the main category. | Clustering | s2s | 1 | 0 | 0 | 37500 | 0 | 0 | 114.7 | -| [RedditClustering](https://arxiv.org/abs/2104.07081) | [mteb/reddit-clustering](https://huggingface.co/datasets/mteb/reddit-clustering) | Clustering of titles from 199 subreddits. Clustering of 25 sets, each with 10-50 classes, and each class with 100 - 1000 sentences. | Clustering | s2s | 1 | 0 | 0 | 420464 | 0 | 0 | 64.7 | -| [RedditClusteringP2P](https://huggingface.co/datasets/sentence-transformers/reddit-title-body) | [mteb/reddit-clustering-p2p](https://huggingface.co/datasets/mteb/reddit-clustering-p2p) | Clustering of title+posts from reddit. Clustering of 10 sets of 50k paragraphs and 40 sets of 10k paragraphs. | Clustering | p2p | 1 | 0 | 0 | 459399 | 0 | 0 | 727.7 | -| [StackExchangeClustering](https://arxiv.org/abs/2104.07081) | [mteb/stackexchange-clustering](https://huggingface.co/datasets/mteb/stackexchange-clustering) | Clustering of titles from 121 stackexchanges. Clustering of 25 sets, each with 10-50 classes, and each class with 100 - 1000 sentences. | Clustering | s2s | 1 | 0 | 417060 | 373850 | 0 | 56.8 | 57.0 | -| [StackExchangeClusteringP2P](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_title_body_jsonl) | [mteb/stackexchange-clustering-p2p](https://huggingface.co/datasets/mteb/stackexchange-clustering-p2p) | Clustering of title+body from stackexchange. Clustering of 5 sets of 10k paragraphs and 5 sets of 5k paragraphs. | Clustering | p2p | 1 | 0 | 0 | 75000 | 0 | 0 | 1090.7 | -| [TenKGnadClusteringP2P](https://tblock.github.io/10kGNAD/) | [slvnwhrl/tenkgnad-clustering-p2p](https://huggingface.co/datasets/slvnwhrl/tenkgnad-clustering-p2p) | Clustering of news article titles+subheadings+texts. Clustering of 10 splits on the news article category. | Clustering | p2p | 1 | 0 | 0 | 45914 | 0 | 0 | 2641.03 | -| [TenKGnadClusteringS2S](https://tblock.github.io/10kGNAD/) | [slvnwhrl/tenkgnad-clustering-s2s](https://huggingface.co/datasets/slvnwhrl/tenkgnad-clustering-s2s) | Clustering of news article titles. Clustering of 10 splits on the news article category. | Clustering | s2s | 1 | 0 | 0 | 45914 | 0 | 0 | 50.96 | -| [TwentyNewsgroupsClustering](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html) | [mteb/twentynewsgroups-clustering](https://huggingface.co/datasets/mteb/twentynewsgroups-clustering) | Clustering of the 20 Newsgroups dataset (subject only). | Clustering | s2s | 1 | 0 | 0 | 59545 | 0 | 0 | 32.0 | -| [8TagsClustering](https://aclanthology.org/2020.lrec-1.207.pdf) | [PL-MTEB/8tags-clustering](https://huggingface.co/datasets/PL-MTEB/8tags-clustering) | Clustering of headlines from social media posts in Polish belonging to 8 categories: film, history, food, medicine, motorization, work, sport and technology. | Clustering | s2s | 1 | 40001 | 5000 | 4372 | 78.2 | 77.6 | 79.2 | -| [OpusparcusPC](https://gem-benchmark.com/data_cards/opusparcus) | [GEM/opusparcus](https://huggingface.co/datasets/GEM/opusparcus) | Opusparcus is a paraphrase corpus for six European language: German, English, Finnish, French, Russian, and Swedish. The paraphrases consist of subtitles from movies and TV shows. 
| PairClassification | s2s | 6 | 1007 | 0 | 0 | 0 | 0 | 0 | -| [SprintDuplicateQuestions](https://www.aclweb.org/anthology/D18-1131/) | [mteb/sprintduplicatequestions-pairclassification](https://huggingface.co/datasets/mteb/sprintduplicatequestions-pairclassification) | Duplicate questions from the Sprint community. | PairClassification | s2s | 1 | 0 | 101000 | 101000 | 0 | 65.2 | 67.9 | -| [TwitterSemEval2015](https://alt.qcri.org/semeval2015/task1/) | [mteb/twittersemeval2015-pairclassification](https://huggingface.co/datasets/mteb/twittersemeval2015-pairclassification) | Paraphrase-Pairs of Tweets from the SemEval 2015 workshop. | PairClassification | s2s | 1 | 0 | 0 | 16777 | 0 | 0 | 38.3 | -| [TwitterURLCorpus](https://languagenet.github.io/) | [mteb/twitterurlcorpus-pairclassification](https://huggingface.co/datasets/mteb/twitterurlcorpus-pairclassification) | Paraphrase-Pairs of Tweets. | PairClassification | s2s | 1 | 0 | 0 | 51534 | 0 | 0 | 79.5 | -| [PPC](https://arxiv.org/pdf/2207.12759.pdf) | [PL-MTEB/ppc-pairclassification](https://huggingface.co/datasets/PL-MTEB/ppc-pairclassification) | Polish Paraphrase Corpus | PairClassification | s2s | 1 | 5000 | 1000 | 1000 | 41.0 | 41.0 | 40.2 | -| [PSC](http://www.lrec-conf.org/proceedings/lrec2014/pdf/1211_Paper.pdf) | [PL-MTEB/psc-pairclassification](https://huggingface.co/datasets/PL-MTEB/psc-pairclassification) | Polish Summaries Corpus | PairClassification | s2s | 1 | 4302 | 0 | 1078 | 537.1 | 0 | 549.3 | -| [SICK-E-PL](https://aclanthology.org/2020.lrec-1.207.pdf) | [PL-MTEB/sicke-pl-pairclassification](https://huggingface.co/datasets/PL-MTEB/sicke-pl-pairclassification) | Polish version of SICK dataset for textual entailment. | PairClassification | s2s | 1 | 4439 | 495 | 4906 | 43.4 | 44.7 | 43.2 | -| [CDSC-E](https://aclanthology.org/P17-1073.pdf) | [PL-MTEB/cdsce-pairclassification](https://huggingface.co/datasets/PL-MTEB/cdsce-pairclassification) | Compositional Distributional Semantics Corpus for textual entailment. | PairClassification | s2s | 1 | 8000 | 1000 | 1000 | 71.9 | 73.5 | 75.2 | -| [AskUbuntuDupQuestions](https://github.com/taolei87/askubuntu) | [mteb/askubuntudupquestions-reranking](https://huggingface.co/datasets/mteb/askubuntudupquestions-reranking) | AskUbuntu Question Dataset - Questions from AskUbuntu with manual annotations marking pairs of questions as similar or non-similar | Reranking | s2s | 1 | 0 | 0 | 2255 | 0 | 0 | 52.5 | -| [MindSmallReranking](https://msnews.github.io/assets/doc/ACL2020_MIND.pdf) | [mteb/mind_small](https://huggingface.co/datasets/mteb/mind_small) | Microsoft News Dataset: A Large-Scale English Dataset for News Recommendation Research | Reranking | s2s | 1 | 231530 | 0 | 107968 | 69.0 | 0 | 70.9 | -| [SciDocsRR](https://allenai.org/data/scidocs) | [mteb/scidocs-reranking](https://huggingface.co/datasets/mteb/scidocs-reranking) | Ranking of related scientific papers based on their title. 
| Reranking | s2s | 1 | 0 | 19594 | 19599 | 0 | 69.4 | 69.0 | -| [StackOverflowDupQuestions](https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf) | [mteb/stackoverflowdupquestions-reranking](https://huggingface.co/datasets/mteb/stackoverflowdupquestions-reranking) | Stack Overflow Duplicate Questions Task for questions with the tags Java, JavaScript and Python | Reranking | s2s | 1 | 23018 | 0 | 3467 | 49.6 | 0 | 49.8 | -| [AlloprofRetrieval](https://huggingface.co/datasets/antoinelb7/alloprof) | [lyon-nlp/alloprof](https://huggingface.co/datasets/lyon-nlp/alloprof) | This dataset was provided by AlloProf, an organisation in Quebec, Canada offering resources and a help forum curated by a large number of teachers to students on all subjects taught from in primary and secondary school | Retrieval | s2p | 1 | 2798 | 0 | 0 | 0 | 0 | 0 | -| [ArguAna](http://argumentation.bplaced.net/arguana/data) | [mteb/arguana](https://huggingface.co/datasets/mteb/arguana) | NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval | Retrieval | p2p | 1 | 0 | 0 | 10080 | 0 | 0 | 1052.9 | -| [BSARDRetrieval](https://huggingface.co/datasets/maastrichtlawtech/bsard) | [maastrichtlawtech/bsard](https://huggingface.co/datasets/maastrichtlawtech/bsard) | The Belgian Statutory Article Retrieval Dataset (BSARD) is a French native dataset for studying legal information retrieval. BSARD consists of more than 22,600 statutory articles from Belgian law and about 1,100 legal questions posed by Belgian citizens and labeled by experienced jurists with relevant articles from the corpus. | Retrieval | s2p | 1 | 222 | 0 | 0 | 0 | 0 | 0 | -| [ClimateFEVER](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) | [mteb/climate-fever](https://huggingface.co/datasets/mteb/climate-fever) | CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate-change. 
| Retrieval | s2p | 1 | 0 | 0 | 5418128 | 0 | 0 | 539.1 | -| [CQADupstackAndroidRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-android](https://huggingface.co/datasets/mteb/cqadupstack-android) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 23697 | 0 | 0 | 578.7 | -| [CQADupstackEnglishRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-english](https://huggingface.co/datasets/mteb/cqadupstack-english) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 41791 | 0 | 0 | 467.1 | -| [CQADupstackGamingRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-gaming](https://huggingface.co/datasets/mteb/cqadupstack-gaming) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 46896 | 0 | 0 | 474.7 | -| [CQADupstackGisRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-gis](https://huggingface.co/datasets/mteb/cqadupstack-gis) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 38522 | 0 | 0 | 991.1 | -| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-mathematica](https://huggingface.co/datasets/mteb/cqadupstack-mathematica) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 17509 | 0 | 0 | 1103.7 | -| [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-physics](https://huggingface.co/datasets/mteb/cqadupstack-physics) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 39355 | 0 | 0 | 799.4 | -| [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-programmers](https://huggingface.co/datasets/mteb/cqadupstack-programmers) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 33052 | 0 | 0 | 1030.2 | -| [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-stats](https://huggingface.co/datasets/mteb/cqadupstack-stats) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 42921 | 0 | 0 | 1041.0 | -| [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-tex](https://huggingface.co/datasets/mteb/cqadupstack-tex) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 71090 | 0 | 0 | 1246.9 | -| [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-unix](https://huggingface.co/datasets/mteb/cqadupstack-unix) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 48454 | 0 | 0 | 984.7 | -| [CQADupstackWebmastersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-webmasters](https://huggingface.co/datasets/mteb/cqadupstack-webmasters) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 17911 | 0 | 0 | 689.8 | -| [CQADupstackWordpressRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | 
[mteb/cqadupstack-wordpress](https://huggingface.co/datasets/mteb/cqadupstack-wordpress) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 49146 | 0 | 0 | 1111.9 | -| [DBPedia](https://github.com/iai-group/DBpedia-Entity/) | [mteb/dbpedia](https://huggingface.co/datasets/mteb/dbpedia) | DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base | Retrieval | s2p | 1 | 0 | 4635989 | 4636322 | 0 | 310.2 | 310.1 | -| [FEVER](https://fever.ai/) | [mteb/fever](https://huggingface.co/datasets/mteb/fever) | FEVER (Fact Extraction and VERification) consists of 185,445 claims generated by altering sentences extracted from Wikipedia and subsequently verified without knowledge of the sentence they were derived from. | Retrieval | s2p | 1 | 0 | 0 | 5423234 | 0 | 0 | 538.6 | -| [FiQA2018](https://sites.google.com/view/fiqa/) | [mteb/fiqa](https://huggingface.co/datasets/mteb/fiqa) | Financial Opinion Mining and Question Answering | Retrieval | s2p | 1 | 0 | 0 | 58286 | 0 | 0 | 760.4 | -| [HagridRetrieval](https://github.com/project-miracl/hagrid) | [miracl/hagrid](https://huggingface.co/datasets/miracl/hagrid) | HAGRID (Human-in-the-loop Attributable Generative Retrieval for Information-seeking Dataset) is a dataset for generative information-seeking scenarios. It consists of queries along with a set of manually labelled relevant passages | Retrieval | s2p | 1 | 716 | 0 | 0 | 0 | 0 | 0 | -| [HotpotQA](https://hotpotqa.github.io/) | [mteb/hotpotqa](https://huggingface.co/datasets/mteb/hotpotqa) | HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong supervision for supporting facts to enable more explainable question answering systems. | Retrieval | s2p | 1 | 0 | 0 | 5240734 | 0 | 0 | 288.6 | -| [MSMARCO](https://microsoft.github.io/msmarco/) | [mteb/msmarco](https://huggingface.co/datasets/mteb/msmarco) | MS MARCO is a collection of datasets focused on deep learning in search. Note that the dev set is used for the leaderboard. | Retrieval | s2p | 1 | 0 | 8848803 | 8841866 | 0 | 336.6 | 336.8 | -| [MSMARCOv2](https://microsoft.github.io/msmarco/TREC-Deep-Learning.html) | [mteb/msmarco-v2](https://huggingface.co/datasets/mteb/msmarco-v2) | MS MARCO is a collection of datasets focused on deep learning in search | Retrieval | s2p | 1 | 138641342 | 138368101 | 0 | 341.4 | 342.0 | 0 | -| [NFCorpus](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) | [mteb/nfcorpus](https://huggingface.co/datasets/mteb/nfcorpus) | NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval | Retrieval | s2p | 1 | 0 | 0 | 3956 | 0 | 0 | 1462.7 | -| [NQ](https://ai.google.com/research/NaturalQuestions/) | [mteb/nq](https://huggingface.co/datasets/mteb/nq) | NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval | Retrieval | s2p | 1 | 0 | 0 | 2684920 | 0 | 0 | 492.7 | -| [QuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | [mteb/quora](https://huggingface.co/datasets/mteb/quora) | QuoraRetrieval is based on questions that are marked as duplicates on the Quora platform. Given a question, find other (duplicate) questions. 
| Retrieval | s2s | 1 | 0 | 0 | 532931 | 0 | 0 | 62.9 | -| [SCIDOCS](https://allenai.org/data/scidocs) | [mteb/scidocs](https://huggingface.co/datasets/mteb/scidocs) | SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation prediction, to document classification and recommendation. | Retrieval | s2p | 1 | 0 | 0 | 26657 | 0 | 0 | 1161.9 | -| [SciFact](https://github.com/allenai/scifact) | [mteb/scifact](https://huggingface.co/datasets/mteb/scifact) | SciFact verifies scientific claims using evidence from the research literature containing scientific paper abstracts. | Retrieval | s2p | 1 | 0 | 0 | 5483 | 0 | 0 | 1422.3 | -| [Touche2020](https://webis.de/events/touche-20/shared-task-1.html) | [mteb/touche2020](https://huggingface.co/datasets/mteb/touche2020) | Touché Task 1: Argument Retrieval for Controversial Questions | Retrieval | s2p | 1 | 0 | 0 | 382594 | 0 | 0 | 1720.1 | -| [TRECCOVID](https://ir.nist.gov/covidSubmit/index.html) | [mteb/trec-covid](https://huggingface.co/datasets/mteb/trec-covid) | TRECCOVID is an ad-hoc search challenge based on the CORD-19 dataset containing scientific articles related to the COVID-19 pandemic | Retrieval | s2p | 1 | 0 | 0 | 171382 | 0 | 0 | 1117.4 | -| [ArguAna-PL](http://argumentation.bplaced.net/arguana/data) | [BeIR-PL/arguana-pl](https://huggingface.co/datasets/clarin-knext/arguana-pl) | NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval | Retrieval | p2p | 1 | 0 | 0 | 10080 | 0 | 0 | 1052.9 | -| [DBPedia-PL](https://github.com/iai-group/DBpedia-Entity/) | [BeIR-PL/dbpedia-pl](https://huggingface.co/datasets/clarin-knext/dbpedia-pl) | DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base | Retrieval | s2p | 1 | 0 | 4635989 | 4636322 | 0 | 310.2 | 310.1 | -| [FiQA-PL](https://sites.google.com/view/fiqa/) | [BeIR-PL/fiqa-pl](https://huggingface.co/datasets/clarin-knext/fiqa-pl) | Financial Opinion Mining and Question Answering | Retrieval | s2p | 1 | 0 | 0 | 58286 | 0 | 0 | 760.4 | -| [HotpotQA-PL](https://hotpotqa.github.io/) | [BeIR-PL/hotpotqa-pl](https://huggingface.co/datasets/clarin-knext/hotpotqa-pl) | HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong supervision for supporting facts to enable more explainable question answering systems. | Retrieval | s2p | 1 | 0 | 0 | 5240734 | 0 | 0 | 288.6 | -| [MSMARCO-PL](https://microsoft.github.io/msmarco/) | [BeIR-PL/msmarco-pl](https://huggingface.co/datasets/clarin-knext/msmarco-pl) | MS MARCO is a collection of datasets focused on deep learning in search. Note that the dev set is used for the leaderboard. 
| Retrieval | s2p | 1 | 0 | 8848803 | 8841866 | 0 | 336.6 | 336.8 | -| [NFCorpus-PL](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) | [BeIR-PL/nfcorpus-pl](https://huggingface.co/datasets/clarin-knext/nfcorpus-pl) | NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval | Retrieval | s2p | 1 | 0 | 0 | 3956 | 0 | 0 | 1462.7 | -| [NQ-PL](https://ai.google.com/research/NaturalQuestions/) | [BeIR-PL/nq-pl](https://huggingface.co/datasets/clarin-knext/nq-pl) | Natural Questions: A Benchmark for Question Answering Research | Retrieval | s2p | 1 | 0 | 0 | 2684920 | 0 | 0 | 492.7 | -| [Quora-PL](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | [BeIR-PL/quora-pl](https://huggingface.co/datasets/clarin-knext/quora-pl) | QuoraRetrieval is based on questions that are marked as duplicates on the Quora platform. Given a question, find other (duplicate) questions. | Retrieval | s2s | 1 | 0 | 0 | 532931 | 0 | 0 | 62.9 | -| [SCIDOCS-PL](https://allenai.org/data/scidocs) | [BeIR-PL/scidocs-pl](https://huggingface.co/datasets/clarin-knext/scidocs-pl) | SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation prediction, to document classification and recommendation. | Retrieval | s2p | 1 | 0 | 0 | 26657 | 0 | 0 | 1161.9 | -| [SciFact-PL](https://github.com/allenai/scifact) | [BeIR-PL/scifact-pl](https://huggingface.co/datasets/clarin-knext/scifact-pl) | SciFact verifies scientific claims using evidence from the research literature containing scientific paper abstracts. | Retrieval | s2p | 1 | 0 | 0 | 5483 | 0 | 0 | 1422.3 | -| [SweFAQ](https://spraakbanken.gu.se/en/resources/swefaq) | [AI-Sweden/SuperLim](https://huggingface.co/datasets/AI-Sweden/SuperLim) | Frequently asked questions from Swedish authorities' websites | Retrieval | s2p | 1 | 0 | 0 | 513 | 0 | 0 | 390.57 | -| [BIOSSES](https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html) | [mteb/biosses-sts](https://huggingface.co/datasets/mteb/biosses-sts) | Biomedical Semantic Similarity Estimation. | STS | s2s | 1 | 0 | 0 | 200 | 0 | 0 | 156.6 | -| [SICK-R](https://www.aclweb.org/anthology/S14-2001.pdf) | [mteb/sickr-sts](https://huggingface.co/datasets/mteb/sickr-sts) | Semantic Textual Similarity SICK-R dataset as described here: | STS | s2s | 1 | 0 | 0 | 19854 | 0 | 0 | 46.1 | -| [STS12](https://www.aclweb.org/anthology/S12-1051.pdf) | [mteb/sts12-sts](https://huggingface.co/datasets/mteb/sts12-sts) | SemEval STS 2012 dataset. | STS | s2s | 1 | 4468 | 0 | 6216 | 100.7 | 0 | 64.7 | -| [STS13](https://www.aclweb.org/anthology/S13-1004/) | [mteb/sts13-sts](https://huggingface.co/datasets/mteb/sts13-sts) | SemEval STS 2013 dataset. | STS | s2s | 1 | 0 | 0 | 3000 | 0 | 0 | 54.0 | -| [STS14](http://alt.qcri.org/semeval2014/task10/) | [mteb/sts14-sts](https://huggingface.co/datasets/mteb/sts14-sts) | SemEval STS 2014 dataset. 
Currently only the English dataset | STS | s2s | 1 | 0 | 0 | 7500 | 0 | 0 | 54.3 | -| [STS15](http://alt.qcri.org/semeval2015/task2/) | [mteb/sts15-sts](https://huggingface.co/datasets/mteb/sts15-sts) | SemEval STS 2015 dataset | STS | s2s | 1 | 0 | 0 | 6000 | 0 | 0 | 57.7 | -| [STS16](http://alt.qcri.org/semeval2016/task1/) | [mteb/sts16-sts](https://huggingface.co/datasets/mteb/sts16-sts) | SemEval STS 2016 dataset | STS | s2s | 1 | 0 | 0 | 2372 | 0 | 0 | 65.3 | -| [STS17](http://alt.qcri.org/semeval2016/task1/) | [mteb/sts17-crosslingual-sts](https://huggingface.co/datasets/mteb/sts17-crosslingual-sts) | STS 2017 dataset | STS | s2s | 11 | 0 | 0 | 500 | 0 | 0 | 43.3 | -| [STS22](https://competitions.codalab.org/competitions/33835) | [mteb/sts22-crosslingual-sts](https://huggingface.co/datasets/mteb/sts22-crosslingual-sts) | SemEval 2022 Task 8: Multilingual News Article Similarity | STS | s2s | 18 | 0 | 0 | 8060 | 0 | 0 | 1992.8 | -| [STSBenchmark](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark) | [mteb/stsbenchmark-sts](https://huggingface.co/datasets/mteb/stsbenchmark-sts) | Semantic Textual Similarity Benchmark (STSbenchmark) dataset. | STS | s2s | 1 | 11498 | 3000 | 2758 | 57.6 | 64.0 | 53.6 | -| [SICK-R-PL](https://aclanthology.org/2020.lrec-1.207.pdf) | [PL-MTEB/sickr-pl-sts](https://huggingface.co/datasets/PL-MTEB/sickr-pl-sts) | Polish version of SICK dataset for textual relatedness. | STS | s2s | 1 | 8878 | 990 | 9812 | 42.9 | 44.0 | 42.8 | -| [CDSC-R](https://aclanthology.org/P17-1073.pdf) | [PL-MTEB/cdscr-sts](https://huggingface.co/datasets/PL-MTEB/cdscr-sts) | Compositional Distributional Semantics Corpus for textual relatedness. | STS | s2s | 1 | 16000 | 2000 | 2000 | 72.1 | 73.2 | 75.0 | -| [SummEval](https://github.com/Yale-LILY/SummEval) | [mteb/summeval](https://huggingface.co/datasets/mteb/summeval) | News Article Summary Semantic Similarity Estimation. | Summarization | s2s | 1 | 0 | 0 | 2800 | 0 | 0 | 359.8 | - -For Chinese tasks, you can refer to [C_MTEB](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB). diff --git a/docs/adding_a_model.md b/docs/adding_a_model.md new file mode 100644 index 0000000000..0cbbd2af45 --- /dev/null +++ b/docs/adding_a_model.md @@ -0,0 +1,11 @@ +## Adding a Model to the MTEB Leaderboard + +The MTEB Leaderboard is available [here](https://huggingface.co/spaces/mteb/leaderboard). To submit: + +1. Run on MTEB: You can reference [scripts/run_mteb_english.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_english.py) for all MTEB English datasets used in the main ranking, or [scripts/run_mteb_chinese.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_chinese.py) for the Chinese ones. +Advanced scripts with different models are available in the [mteb/mtebscripts repo](https://github.com/embeddings-benchmark/mtebscripts). +2. Format the json files into metadata using the script at `scripts/mteb_meta.py`. For example +`python scripts/mteb_meta.py path_to_results_folder`, which will create a `mteb_metadata.md` file. If you ran CQADupstack retrieval, make sure to merge the results first with `python scripts/merge_cqadupstack.py path_to_results_folder`. +3. Copy the content of the `mteb_metadata.md` file to the top of a `README.md` file of your model on the Hub. See [here](https://huggingface.co/Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit/blob/main/README.md) for an example. +4. 
Hit the Refresh button at the bottom of the leaderboard and you should see your scores 🥇 +5. To have the scores appear without refreshing, you can open an issue on the [Community Tab of the LB](https://huggingface.co/spaces/mteb/leaderboard/discussions) and someone will restart the space to cache your average scores. The cache is updated anyways ~1x/week. \ No newline at end of file diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 0000000000..cb0b0c7802 --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,29 @@ +## Contributing to MTEB +We welcome contributions such as new datasets to MTEB! This section describes how to set up the repository for development. + +### Development Installation +If you want to submit a dataset or on other ways contribute to MTEB, you can install the package in development mode: + +```bash +git clone https://github.com/embeddings-benchmark/mteb +cd mteb + +# create your virtual environment and activate it +make install +``` + +### Running Tests +To run the tests, you can use the following command: + +```bash +make test +# or if you want to run on multiple cores +make test-parallel +``` + +### Running linting +To run the linting before a PR you can use the following command: + +```bash +make lint +``` diff --git a/images/hf_logo.png b/docs/images/hf_logo.png similarity index 100% rename from images/hf_logo.png rename to docs/images/hf_logo.png diff --git a/images/mteb_logo/mteb_logo_tight_hfhub.drawio b/docs/images/mteb_logo/mteb_logo_tight_hfhub.drawio similarity index 100% rename from images/mteb_logo/mteb_logo_tight_hfhub.drawio rename to docs/images/mteb_logo/mteb_logo_tight_hfhub.drawio diff --git a/images/mteb_logo/mteb_logo_tight_hfhub.png b/docs/images/mteb_logo/mteb_logo_tight_hfhub.png similarity index 100% rename from images/mteb_logo/mteb_logo_tight_hfhub.png rename to docs/images/mteb_logo/mteb_logo_tight_hfhub.png diff --git a/images/mteb_logo/mteb_logo_transparent.png b/docs/images/mteb_logo/mteb_logo_transparent.png similarity index 100% rename from images/mteb_logo/mteb_logo_transparent.png rename to docs/images/mteb_logo/mteb_logo_transparent.png diff --git a/images/mteb_logo/mteb_logo_wide_github.drawio b/docs/images/mteb_logo/mteb_logo_wide_github.drawio similarity index 100% rename from images/mteb_logo/mteb_logo_wide_github.drawio rename to docs/images/mteb_logo/mteb_logo_wide_github.drawio diff --git a/images/mteb_logo/mteb_logo_wide_github.png b/docs/images/mteb_logo/mteb_logo_wide_github.png similarity index 100% rename from images/mteb_logo/mteb_logo_wide_github.png rename to docs/images/mteb_logo/mteb_logo_wide_github.png diff --git a/docs/tasks.md b/docs/tasks.md new file mode 100644 index 0000000000..ab98823f5a --- /dev/null +++ b/docs/tasks.md @@ -0,0 +1,136 @@ +## Available tasks +The following tables gives you an overview of the tasks in MTEB. + + + + +## Available tasks + +| Name | Hub URL | Description | Type | Category | #Languages | Train #Samples | Dev #Samples | Test #Samples | Avg. chars / train | Avg. chars / dev | Avg. 
chars / test | +| :-------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------------------------------------------------------------------------------------------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------------- | :------- | ---------: | -------------: | -----------: | ------------: | -----------------: | ---------------: | ----------------: | +| [BUCC](https://comparable.limsi.fr/bucc2018/bucc2018-task.html) | [mteb/bucc-bitext-mining](https://huggingface.co/datasets/mteb/bucc-bitext-mining) | BUCC bitext mining dataset | BitextMining | s2s | 4 | 0 | 0 | 641684 | 0 | 0 | 101.3 | +| [Tatoeba](https://github.com/facebookresearch/LASER/tree/main/data/tatoeba/v1) | [mteb/tatoeba-bitext-mining](https://huggingface.co/datasets/mteb/tatoeba-bitext-mining) | 1,000 English-aligned sentence pairs for each language based on the Tatoeba corpus | BitextMining | s2s | 112 | 0 | 0 | 2000 | 0 | 0 | 39.4 | +| [Bornholm parallel](https://aclanthology.org/W19-6138/) | [strombergnlp/bornholmsk_parallel](https://huggingface.co/datasets/strombergnlp/bornholmsk_parallel) | Danish Bornholmsk Parallel Corpus. | BitextMining | s2s | 2 | 100 | 100 | 100 | 64.6 | 86.2 | 89.7 | +| [DiaBLaBitextMining](https://inria.hal.science/hal-03021633) | [rbawden/DiaBLa](https://huggingface.co/datasets/rbawden/DiaBLa) | English-French Parallel Corpus. DiaBLa is an English-French dataset for the evaluation of Machine Translation (MT) for informal, written bilingual dialogue. | BitextMining | s2s | 1 | 5748 | 0 | 0 | 0 | 0 | 0 | +| [FloresBitextMining](https://huggingface.co/datasets/facebook/flores) | [facebook/flores](https://huggingface.co/datasets/facebook/flores) | FLORES is a benchmark dataset for machine translation between English and low-resource languages. | BitextMining | s2s | 200 | 0 | 997 | 1012 | 0 | 0 | 0 | +| [AmazonCounterfactualClassification](https://arxiv.org/abs/2104.06893) | [mteb/amazon_counterfactual](https://huggingface.co/datasets/mteb/amazon_counterfactual) | A collection of Amazon customer reviews annotated for counterfactual detection pair classification. | Classification | s2s | 4 | 4018 | 335 | 670 | 107.3 | 109.2 | 106.1 | +| [AmazonPolarityClassification](https://dl.acm.org/doi/10.1145/2507157.2507163) | [mteb/amazon_polarity](https://huggingface.co/datasets/mteb/amazon_polarity) | Amazon Polarity Classification Dataset. | Classification | s2s | 1 | 3600000 | 0 | 400000 | 431.6 | 0 | 431.4 | +| [AmazonReviewsClassification](https://arxiv.org/abs/2010.02573) | [mteb/amazon_reviews_multi](https://huggingface.co/datasets/mteb/amazon_reviews_multi) | A collection of Amazon reviews specifically designed to aid research in multilingual text classification. | Classification | s2s | 6 | 1200000 | 30000 | 30000 | 160.5 | 159.2 | 160.4 | +| [MasakhaNEWSClassification](https://arxiv.org/abs/2304.09972) | [masakhane/masakhanews](https://huggingface.co/datasets/masakhane/masakhanews) | MasakhaNEWS is the largest publicly available dataset for news topic classification in 16 languages widely spoken in Africa. The train/validation/test sets are available for all the 16 languages. 
| Classification | s2s | 16 | 1476 | 211 | 422 | 5064.8 | 4756.1 | 5116.6 | +| [Banking77Classification](https://arxiv.org/abs/2003.04807) | [mteb/banking77](https://huggingface.co/datasets/mteb/banking77) | Dataset composed of online banking queries annotated with their corresponding intents. | Classification | s2s | 1 | 10003 | 0 | 3080 | 59.5 | 0 | 54.2 | +| [EmotionClassification](https://www.aclweb.org/anthology/D18-1404) | [mteb/emotion](https://huggingface.co/datasets/mteb/emotion) | Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. For more detailed information please refer to the paper. | Classification | s2s | 1 | 16000 | 2000 | 2000 | 96.8 | 95.3 | 96.6 | +| [ImdbClassification](http://www.aclweb.org/anthology/P11-1015) | [mteb/imdb](https://huggingface.co/datasets/mteb/imdb) | Large Movie Review Dataset | Classification | p2p | 1 | 25000 | 0 | 25000 | 1325.1 | 0 | 1293.8 | +| [MassiveIntentClassification](https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.) | [mteb/amazon_massive_intent](https://huggingface.co/datasets/mteb/amazon_massive_intent) | MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages | Classification | s2s | 51 | 11514 | 2033 | 2974 | 35.0 | 34.8 | 34.6 | +| [MassiveScenarioClassification](https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.) | [mteb/amazon_massive_scenario](https://huggingface.co/datasets/mteb/amazon_massive_scenario) | MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages | Classification | s2s | 51 | 11514 | 2033 | 2974 | 35.0 | 34.8 | 34.6 | +| [MTOPDomainClassification](https://arxiv.org/pdf/2008.09335.pdf) | [mteb/mtop_domain](https://huggingface.co/datasets/mteb/mtop_domain) | MTOP: Multilingual Task-Oriented Semantic Parsing | Classification | s2s | 6 | 15667 | 2235 | 4386 | 36.6 | 36.5 | 36.8 | +| [MTOPIntentClassification](https://arxiv.org/pdf/2008.09335.pdf) | [mteb/mtop_intent](https://huggingface.co/datasets/mteb/mtop_intent) | MTOP: Multilingual Task-Oriented Semantic Parsing | Classification | s2s | 6 | 15667 | 2235 | 4386 | 36.6 | 36.5 | 36.8 | +| [ToxicConversationsClassification](https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview) | [mteb/toxic_conversations_50k](https://huggingface.co/datasets/mteb/toxic_conversations_50k) | Collection of comments from the Civil Comments platform together with annotations if the comment is toxic or not. 
| Classification | s2s | 1 | 50000 | 0 | 50000 | 298.8 | 0 | 296.6 | +| [TweetSentimentExtractionClassification](https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview) | [mteb/tweet_sentiment_extraction](https://huggingface.co/datasets/mteb/tweet_sentiment_extraction) | | Classification | s2s | 1 | 27481 | 0 | 3534 | 68.3 | 0 | 67.8 | +| [AngryTweetsClassification](https://aclanthology.org/2021.nodalida-main.53/) | [mteb/DDSC/angry-tweets](https://huggingface.co/datasets/DDSC/angry-tweets) | A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets | Classification | s2s | 1 | 2410 | 0 | 1050 | 153.0 | 0 | 156.1 | +| [DKHateClassification](https://aclanthology.org/2020.lrec-1.430/) | [DDSC/dkhate](https://huggingface.co/datasets/DDSC/dkhate) | Danish Tweets annotated for Hate Speech | Classification | s2s | 1 | 2960 | 0 | 329 | 88.2 | 0 | 104.0 | +| [DalajClassification](https://spraakbanken.gu.se/en/resources/dalaj-1-0) | [AI-Sweden/SuperLim](https://huggingface.co/datasets/AI-Sweden/SuperLim) | A Swedish dataset for linguistic accebtablity. Available as a part of Superlim | Classification | s2s | 1 | 3840 | 445 | 444 | 243.7 | 242.5 | 243.8 | +| [DanishPoliticalCommentsClassification](https://huggingface.co/datasets/danish_political_comments) | [danish_political_comments](https://huggingface.co/datasets/danish_political_comments) | A dataset of Danish political comments rated for sentiment | Classification | s2s | 1 | 9010 | 0 | 0 | 69.9 | 0 | 0 | +| [LccClassification](https://github.com/fnielsen/lcc-sentiment) | [DDSC/lcc](https://huggingface.co/datasets/DDSC/lcc) | The leipzig corpora collection, annotated for sentiment | Classification | s2s | 1 | 349 | 0 | 150 | 113.5 | 0 | 118.7 | +| [NoRecClassification](https://aclanthology.org/L18-1661/) | [ScandEval/norec-mini](https://huggingface.co/datasets/ScandEval/norec-mini) | A Norwegian dataset for sentiment classification on review | Classification | s2s | 1 | 1020 | 256 | 2050 | 86.9 | 89.6 | 82.0 | +| [NordicLangClassification](https://aclanthology.org/2021.vardial-1.8/) | [strombergnlp/nordic_langid](https://huggingface.co/datasets/strombergnlp/nordic_langid) | A dataset for Nordic language identification. 
| Classification | s2s | 6 | 57000 | 0 | 3000 | 78.4 | 0 | 78.2 | +| [NorwegianParliamentClassification](https://huggingface.co/datasets/NbAiLab/norwegian_parliament) | [NbAiLab/norwegian_parliament](https://huggingface.co/datasets/NbAiLab/norwegian_parliament) | Norwegian parliament speeches annotated for sentiment | Classification | s2s | 1 | 3600 | 1200 | 1200 | 1773.6 | 1911.0 | 1884.0 | +| [ScalaDaClassification](https://aclanthology.org/2023.nodalida-1.20/) | [ScandEval/scala-da](https://huggingface.co/datasets/ScandEval/scala-da) | A modified version of DDT for linguistic acceptability classification | Classification | s2s | 1 | 1024 | 256 | 2048 | 107.6 | 100.8 | 109.4 | +| [ScalaNbClassification](https://aclanthology.org/2023.nodalida-1.20/) | [ScandEval/scala-nb](https://huggingface.co/datasets/ScandEval/scala-nb) | A Norwegian dataset for linguistic acceptability classification for Bokmål | Classification | s2s | 1 | 1024 | 256 | 2048 | 95.5 | 94.8 | 98.4 | +| [ScalaNnClassification](https://aclanthology.org/2023.nodalida-1.20/) | [ScandEval/scala-nn](https://huggingface.co/datasets/ScandEval/scala-nn) | A Norwegian dataset for linguistic acceptability classification for Nynorsk | Classification | s2s | 1 | 1024 | 256 | 2048 | 105.3 | 103.5 | 104.8 | +| [ScalaSvClassification](https://aclanthology.org/2023.nodalida-1.20/) | [ScandEval/scala-sv](https://huggingface.co/datasets/ScandEval/scala-sv) | A Swedish dataset for linguistic acceptability classification | Classification | s2s | 1 | 1024 | 256 | 2048 | 102.6 | 113.0 | 98.3 | +| [SweRecClassification](https://aclanthology.org/2023.nodalida-1.20/) | [ScandEval/swerec-mini](https://huggingface.co/datasets/ScandEval/swerec-mini) | A Swedish dataset for sentiment classification on reviews | Classification | s2s | 1 | 1024 | 256 | 2048 | 317.7 | 293.4 | 318.8 | +| [CBD](http://2019.poleval.pl/files/poleval2019.pdf) | [PL-MTEB/cbd](https://huggingface.co/datasets/PL-MTEB/cbd) | Polish Tweets annotated for cyberbullying detection. | Classification | s2s | 1 | 10041 | 0 | 1000 | 93.6 | 0 | 93.2 | +| [PolEmo2.0-IN](https://aclanthology.org/K19-1092.pdf) | [PL-MTEB/polemo2_in](https://huggingface.co/datasets/PL-MTEB/polemo2_in) | A collection of Polish online reviews from four domains: medicine, hotels, products and school. The PolEmo2.0-IN task is to predict the sentiment of in-domain (medicine and hotels) reviews. | Classification | s2s | 1 | 5783 | 723 | 722 | 780.6 | 769.4 | 756.2 | +| [PolEmo2.0-OUT](https://aclanthology.org/K19-1092.pdf) | [PL-MTEB/polemo2_out](https://huggingface.co/datasets/PL-MTEB/polemo2_out) | A collection of Polish online reviews from four domains: medicine, hotels, products and school. The PolEmo2.0-OUT task is to predict the sentiment of out-of-domain (products and school) reviews using models trained on reviews from the medicine and hotels domains. | Classification | s2s | 1 | 5783 | 494 | 494 | 780.6 | 589.3 | 587.0 | +| [AllegroReviews](https://aclanthology.org/2020.acl-main.111.pdf) | [PL-MTEB/allegro-reviews](https://huggingface.co/datasets/PL-MTEB/allegro-reviews) | A Polish dataset for sentiment classification on reviews from e-commerce marketplace Allegro. 
| Classification | s2s | 1 | 9577 | 1002 | 1006 | 477.9 | 480.9 | 477.2 | +| [PAC](https://arxiv.org/pdf/2211.13112.pdf) | [laugustyniak/abusive-clauses-pl](https://huggingface.co/datasets/laugustyniak/abusive-clauses-pl) | Polish Abusive Clauses Dataset | Classification | s2s | 1 | 4284 | 1519 | 3453 | 185.3 | 256.8 | 185.3 | +| [AlloProfClusteringP2P](https://huggingface.co/datasets/lyon-nlp/alloprof) | [lyon-nlp/alloprof](https://huggingface.co/datasets/lyon-nlp/alloprof) | Clustering of document titles and descriptions from Allo Prof dataset. Clustering of 10 sets on the document topic. | Clustering | p2p | 1 | 2798 | 0 | 0 | 0 | 0 | 0 | +| [AlloProfClusteringS2S](https://huggingface.co/datasets/lyon-nlp/alloprof) | [lyon-nlp/alloprof](https://huggingface.co/datasets/lyon-nlp/alloprof) | Clustering of document titles from Allo Prof dataset. Clustering of 10 sets on the document topic. | Clustering | s2s | 1 | 2798 | 0 | 0 | 0 | 0 | 0 | +| [ArxivClusteringP2P](https://www.kaggle.com/Cornell-University/arxiv) | [mteb/arxiv-clustering-p2p](https://huggingface.co/datasets/mteb/arxiv-clustering-p2p) | Clustering of titles+abstract from arxiv. Clustering of 30 sets, either on the main or secondary category | Clustering | p2p | 1 | 0 | 0 | 732723 | 0 | 0 | 1009.9 | +| [ArxivClusteringS2S](https://www.kaggle.com/Cornell-University/arxiv) | [mteb/arxiv-clustering-s2s](https://huggingface.co/datasets/mteb/arxiv-clustering-s2s) | Clustering of titles from arxiv. Clustering of 30 sets, either on the main or secondary category | Clustering | s2s | 1 | 0 | 0 | 732723 | 0 | 0 | 74.0 | +| [BiorxivClusteringP2P](https://api.biorxiv.org/) | [mteb/biorxiv-clustering-p2p](https://huggingface.co/datasets/mteb/biorxiv-clustering-p2p) | Clustering of titles+abstract from biorxiv. Clustering of 10 sets, based on the main category. | Clustering | p2p | 1 | 0 | 0 | 75000 | 0 | 0 | 1666.2 | +| [BiorxivClusteringS2S](https://api.biorxiv.org/) | [mteb/biorxiv-clustering-s2s](https://huggingface.co/datasets/mteb/biorxiv-clustering-s2s) | Clustering of titles from biorxiv. Clustering of 10 sets, based on the main category. | Clustering | s2s | 1 | 0 | 0 | 75000 | 0 | 0 | 101.6 | +| [BlurbsClusteringP2P](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html) | [slvnwhrl/blurbs-clustering-p2p](https://huggingface.co/datasets/slvnwhrl/blurbs-clustering-p2p) | Clustering of book titles+blurbs. Clustering of 28 sets, either on the main or secondary genre | Clustering | p2p | 1 | 0 | 0 | 174637 | 0 | 0 | 664.09 | +| [BlurbsClusteringS2S](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html) | [slvnwhrl/blurbs-clustering-s2s](https://huggingface.co/datasets/slvnwhrl/blurbs-clustering-s2s) | Clustering of book titles. Clustering of 28 sets, either on the main or secondary genre. | Clustering | s2s | 1 | 0 | 0 | 174637 | 0 | 0 | 23.02 | +| [HALClusteringS2S](https://huggingface.co/datasets/lyon-nlp/clustering-hal-s2s) | [lyon-nlp/clustering-hal-s2s](https://huggingface.co/datasets/lyon-nlp/clustering-hal-s2s) | Clustering of titles from HAL. Clustering of 10 sets on the main category. | Clustering | s2s | 1 | 85375 | 0 | 0 | 0 | 0 | 0 | +| [MedrxivClusteringP2P](https://api.medrxiv.org/) | [mteb/medrxiv-clustering-p2p](https://huggingface.co/datasets/mteb/medrxiv-clustering-p2p) | Clustering of titles+abstract from medrxiv. Clustering of 10 sets, based on the main category. 
| Clustering | p2p | 1 | 0 | 0 | 37500 | 0 | 0 | 1981.2 | +| [MedrxivClusteringS2S](https://api.medrxiv.org/) | [mteb/medrxiv-clustering-s2s](https://huggingface.co/datasets/mteb/medrxiv-clustering-s2s) | Clustering of titles from medrxiv. Clustering of 10 sets, based on the main category. | Clustering | s2s | 1 | 0 | 0 | 37500 | 0 | 0 | 114.7 | +| [RedditClustering](https://arxiv.org/abs/2104.07081) | [mteb/reddit-clustering](https://huggingface.co/datasets/mteb/reddit-clustering) | Clustering of titles from 199 subreddits. Clustering of 25 sets, each with 10-50 classes, and each class with 100-1000 sentences. | Clustering | s2s | 1 | 0 | 0 | 420464 | 0 | 0 | 64.7 | +| [RedditClusteringP2P](https://huggingface.co/datasets/sentence-transformers/reddit-title-body) | [mteb/reddit-clustering-p2p](https://huggingface.co/datasets/mteb/reddit-clustering-p2p) | Clustering of titles+posts from reddit. Clustering of 10 sets of 50k paragraphs and 40 sets of 10k paragraphs. | Clustering | p2p | 1 | 0 | 0 | 459399 | 0 | 0 | 727.7 | +| [StackExchangeClustering](https://arxiv.org/abs/2104.07081) | [mteb/stackexchange-clustering](https://huggingface.co/datasets/mteb/stackexchange-clustering) | Clustering of titles from 121 stackexchanges. Clustering of 25 sets, each with 10-50 classes, and each class with 100-1000 sentences. | Clustering | s2s | 1 | 0 | 417060 | 373850 | 0 | 56.8 | 57.0 | +| [StackExchangeClusteringP2P](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_title_body_jsonl) | [mteb/stackexchange-clustering-p2p](https://huggingface.co/datasets/mteb/stackexchange-clustering-p2p) | Clustering of title+body from stackexchange. Clustering of 5 sets of 10k paragraphs and 5 sets of 5k paragraphs. | Clustering | p2p | 1 | 0 | 0 | 75000 | 0 | 0 | 1090.7 | +| [TenKGnadClusteringP2P](https://tblock.github.io/10kGNAD/) | [slvnwhrl/tenkgnad-clustering-p2p](https://huggingface.co/datasets/slvnwhrl/tenkgnad-clustering-p2p) | Clustering of news article titles+subheadings+texts. Clustering of 10 splits on the news article category. | Clustering | p2p | 1 | 0 | 0 | 45914 | 0 | 0 | 2641.03 | +| [TenKGnadClusteringS2S](https://tblock.github.io/10kGNAD/) | [slvnwhrl/tenkgnad-clustering-s2s](https://huggingface.co/datasets/slvnwhrl/tenkgnad-clustering-s2s) | Clustering of news article titles. Clustering of 10 splits on the news article category. | Clustering | s2s | 1 | 0 | 0 | 45914 | 0 | 0 | 50.96 | +| [TwentyNewsgroupsClustering](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html) | [mteb/twentynewsgroups-clustering](https://huggingface.co/datasets/mteb/twentynewsgroups-clustering) | Clustering of the 20 Newsgroups dataset (subject only). | Clustering | s2s | 1 | 0 | 0 | 59545 | 0 | 0 | 32.0 | +| [8TagsClustering](https://aclanthology.org/2020.lrec-1.207.pdf) | [PL-MTEB/8tags-clustering](https://huggingface.co/datasets/PL-MTEB/8tags-clustering) | Clustering of headlines from social media posts in Polish belonging to 8 categories: film, history, food, medicine, motorization, work, sport and technology. | Clustering | s2s | 1 | 40001 | 5000 | 4372 | 78.2 | 77.6 | 79.2 | +| [OpusparcusPC](https://gem-benchmark.com/data_cards/opusparcus) | [GEM/opusparcus](https://huggingface.co/datasets/GEM/opusparcus) | Opusparcus is a paraphrase corpus for six European languages: German, English, Finnish, French, Russian, and Swedish. The paraphrases consist of subtitles from movies and TV shows. 
| PairClassification | s2s | 6 | 1007 | 0 | 0 | 0 | 0 | 0 | +| [SprintDuplicateQuestions](https://www.aclweb.org/anthology/D18-1131/) | [mteb/sprintduplicatequestions-pairclassification](https://huggingface.co/datasets/mteb/sprintduplicatequestions-pairclassification) | Duplicate questions from the Sprint community. | PairClassification | s2s | 1 | 0 | 101000 | 101000 | 0 | 65.2 | 67.9 | +| [TwitterSemEval2015](https://alt.qcri.org/semeval2015/task1/) | [mteb/twittersemeval2015-pairclassification](https://huggingface.co/datasets/mteb/twittersemeval2015-pairclassification) | Paraphrase-Pairs of Tweets from the SemEval 2015 workshop. | PairClassification | s2s | 1 | 0 | 0 | 16777 | 0 | 0 | 38.3 | +| [TwitterURLCorpus](https://languagenet.github.io/) | [mteb/twitterurlcorpus-pairclassification](https://huggingface.co/datasets/mteb/twitterurlcorpus-pairclassification) | Paraphrase-Pairs of Tweets. | PairClassification | s2s | 1 | 0 | 0 | 51534 | 0 | 0 | 79.5 | +| [PPC](https://arxiv.org/pdf/2207.12759.pdf) | [PL-MTEB/ppc-pairclassification](https://huggingface.co/datasets/PL-MTEB/ppc-pairclassification) | Polish Paraphrase Corpus | PairClassification | s2s | 1 | 5000 | 1000 | 1000 | 41.0 | 41.0 | 40.2 | +| [PSC](http://www.lrec-conf.org/proceedings/lrec2014/pdf/1211_Paper.pdf) | [PL-MTEB/psc-pairclassification](https://huggingface.co/datasets/PL-MTEB/psc-pairclassification) | Polish Summaries Corpus | PairClassification | s2s | 1 | 4302 | 0 | 1078 | 537.1 | 0 | 549.3 | +| [SICK-E-PL](https://aclanthology.org/2020.lrec-1.207.pdf) | [PL-MTEB/sicke-pl-pairclassification](https://huggingface.co/datasets/PL-MTEB/sicke-pl-pairclassification) | Polish version of SICK dataset for textual entailment. | PairClassification | s2s | 1 | 4439 | 495 | 4906 | 43.4 | 44.7 | 43.2 | +| [CDSC-E](https://aclanthology.org/P17-1073.pdf) | [PL-MTEB/cdsce-pairclassification](https://huggingface.co/datasets/PL-MTEB/cdsce-pairclassification) | Compositional Distributional Semantics Corpus for textual entailment. | PairClassification | s2s | 1 | 8000 | 1000 | 1000 | 71.9 | 73.5 | 75.2 | +| [AskUbuntuDupQuestions](https://github.com/taolei87/askubuntu) | [mteb/askubuntudupquestions-reranking](https://huggingface.co/datasets/mteb/askubuntudupquestions-reranking) | AskUbuntu Question Dataset - Questions from AskUbuntu with manual annotations marking pairs of questions as similar or non-similar | Reranking | s2s | 1 | 0 | 0 | 2255 | 0 | 0 | 52.5 | +| [MindSmallReranking](https://msnews.github.io/assets/doc/ACL2020_MIND.pdf) | [mteb/mind_small](https://huggingface.co/datasets/mteb/mind_small) | Microsoft News Dataset: A Large-Scale English Dataset for News Recommendation Research | Reranking | s2s | 1 | 231530 | 0 | 107968 | 69.0 | 0 | 70.9 | +| [SciDocsRR](https://allenai.org/data/scidocs) | [mteb/scidocs-reranking](https://huggingface.co/datasets/mteb/scidocs-reranking) | Ranking of related scientific papers based on their title. 
| Reranking | s2s | 1 | 0 | 19594 | 19599 | 0 | 69.4 | 69.0 | +| [StackOverflowDupQuestions](https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf) | [mteb/stackoverflowdupquestions-reranking](https://huggingface.co/datasets/mteb/stackoverflowdupquestions-reranking) | Stack Overflow Duplicate Questions Task for questions with the tags Java, JavaScript and Python | Reranking | s2s | 1 | 23018 | 0 | 3467 | 49.6 | 0 | 49.8 | +| [AlloprofRetrieval](https://huggingface.co/datasets/antoinelb7/alloprof) | [lyon-nlp/alloprof](https://huggingface.co/datasets/lyon-nlp/alloprof) | This dataset was provided by AlloProf, an organisation in Quebec, Canada offering resources and a help forum curated by a large number of teachers to students on all subjects taught in primary and secondary school | Retrieval | s2p | 1 | 2798 | 0 | 0 | 0 | 0 | 0 | +| [ArguAna](http://argumentation.bplaced.net/arguana/data) | [mteb/arguana](https://huggingface.co/datasets/mteb/arguana) | ArguAna Counterargs Corpus: given an argument, retrieve the best counterargument from the corpus | Retrieval | p2p | 1 | 0 | 0 | 10080 | 0 | 0 | 1052.9 | +| [BSARDRetrieval](https://huggingface.co/datasets/maastrichtlawtech/bsard) | [maastrichtlawtech/bsard](https://huggingface.co/datasets/maastrichtlawtech/bsard) | The Belgian Statutory Article Retrieval Dataset (BSARD) is a French native dataset for studying legal information retrieval. BSARD consists of more than 22,600 statutory articles from Belgian law and about 1,100 legal questions posed by Belgian citizens and labeled by experienced jurists with relevant articles from the corpus. | Retrieval | s2p | 1 | 222 | 0 | 0 | 0 | 0 | 0 | +| [ClimateFEVER](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) | [mteb/climate-fever](https://huggingface.co/datasets/mteb/climate-fever) | CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate change. 
| Retrieval | s2p | 1 | 0 | 0 | 5418128 | 0 | 0 | 539.1 | +| [CQADupstackAndroidRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-android](https://huggingface.co/datasets/mteb/cqadupstack-android) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 23697 | 0 | 0 | 578.7 | +| [CQADupstackEnglishRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-english](https://huggingface.co/datasets/mteb/cqadupstack-english) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 41791 | 0 | 0 | 467.1 | +| [CQADupstackGamingRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-gaming](https://huggingface.co/datasets/mteb/cqadupstack-gaming) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 46896 | 0 | 0 | 474.7 | +| [CQADupstackGisRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-gis](https://huggingface.co/datasets/mteb/cqadupstack-gis) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 38522 | 0 | 0 | 991.1 | +| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-mathematica](https://huggingface.co/datasets/mteb/cqadupstack-mathematica) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 17509 | 0 | 0 | 1103.7 | +| [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-physics](https://huggingface.co/datasets/mteb/cqadupstack-physics) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 39355 | 0 | 0 | 799.4 | +| [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-programmers](https://huggingface.co/datasets/mteb/cqadupstack-programmers) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 33052 | 0 | 0 | 1030.2 | +| [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-stats](https://huggingface.co/datasets/mteb/cqadupstack-stats) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 42921 | 0 | 0 | 1041.0 | +| [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-tex](https://huggingface.co/datasets/mteb/cqadupstack-tex) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 71090 | 0 | 0 | 1246.9 | +| [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-unix](https://huggingface.co/datasets/mteb/cqadupstack-unix) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 48454 | 0 | 0 | 984.7 | +| [CQADupstackWebmastersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | [mteb/cqadupstack-webmasters](https://huggingface.co/datasets/mteb/cqadupstack-webmasters) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 17911 | 0 | 0 | 689.8 | +| [CQADupstackWordpressRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | 
[mteb/cqadupstack-wordpress](https://huggingface.co/datasets/mteb/cqadupstack-wordpress) | CQADupStack: A Benchmark Data Set for Community Question-Answering Research | Retrieval | s2p | 1 | 0 | 0 | 49146 | 0 | 0 | 1111.9 | +| [DBPedia](https://github.com/iai-group/DBpedia-Entity/) | [mteb/dbpedia](https://huggingface.co/datasets/mteb/dbpedia) | DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base | Retrieval | s2p | 1 | 0 | 4635989 | 4636322 | 0 | 310.2 | 310.1 | +| [FEVER](https://fever.ai/) | [mteb/fever](https://huggingface.co/datasets/mteb/fever) | FEVER (Fact Extraction and VERification) consists of 185,445 claims generated by altering sentences extracted from Wikipedia and subsequently verified without knowledge of the sentence they were derived from. | Retrieval | s2p | 1 | 0 | 0 | 5423234 | 0 | 0 | 538.6 | +| [FiQA2018](https://sites.google.com/view/fiqa/) | [mteb/fiqa](https://huggingface.co/datasets/mteb/fiqa) | Financial Opinion Mining and Question Answering | Retrieval | s2p | 1 | 0 | 0 | 58286 | 0 | 0 | 760.4 | +| [HagridRetrieval](https://github.com/project-miracl/hagrid) | [miracl/hagrid](https://huggingface.co/datasets/miracl/hagrid) | HAGRID (Human-in-the-loop Attributable Generative Retrieval for Information-seeking Dataset) is a dataset for generative information-seeking scenarios. It consists of queries along with a set of manually labelled relevant passages | Retrieval | s2p | 1 | 716 | 0 | 0 | 0 | 0 | 0 | +| [HotpotQA](https://hotpotqa.github.io/) | [mteb/hotpotqa](https://huggingface.co/datasets/mteb/hotpotqa) | HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong supervision for supporting facts to enable more explainable question answering systems. | Retrieval | s2p | 1 | 0 | 0 | 5240734 | 0 | 0 | 288.6 | +| [MSMARCO](https://microsoft.github.io/msmarco/) | [mteb/msmarco](https://huggingface.co/datasets/mteb/msmarco) | MS MARCO is a collection of datasets focused on deep learning in search. Note that the dev set is used for the leaderboard. | Retrieval | s2p | 1 | 0 | 8848803 | 8841866 | 0 | 336.6 | 336.8 | +| [MSMARCOv2](https://microsoft.github.io/msmarco/TREC-Deep-Learning.html) | [mteb/msmarco-v2](https://huggingface.co/datasets/mteb/msmarco-v2) | MS MARCO is a collection of datasets focused on deep learning in search | Retrieval | s2p | 1 | 138641342 | 138368101 | 0 | 341.4 | 342.0 | 0 | +| [NFCorpus](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) | [mteb/nfcorpus](https://huggingface.co/datasets/mteb/nfcorpus) | NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval | Retrieval | s2p | 1 | 0 | 0 | 3956 | 0 | 0 | 1462.7 | +| [NQ](https://ai.google.com/research/NaturalQuestions/) | [mteb/nq](https://huggingface.co/datasets/mteb/nq) | Natural Questions: A Benchmark for Question Answering Research | Retrieval | s2p | 1 | 0 | 0 | 2684920 | 0 | 0 | 492.7 | +| [QuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | [mteb/quora](https://huggingface.co/datasets/mteb/quora) | QuoraRetrieval is based on questions that are marked as duplicates on the Quora platform. Given a question, find other (duplicate) questions. 
| Retrieval | s2s | 1 | 0 | 0 | 532931 | 0 | 0 | 62.9 | +| [SCIDOCS](https://allenai.org/data/scidocs) | [mteb/scidocs](https://huggingface.co/datasets/mteb/scidocs) | SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation prediction, to document classification and recommendation. | Retrieval | s2p | 1 | 0 | 0 | 26657 | 0 | 0 | 1161.9 | +| [SciFact](https://github.com/allenai/scifact) | [mteb/scifact](https://huggingface.co/datasets/mteb/scifact) | SciFact verifies scientific claims using evidence from the research literature containing scientific paper abstracts. | Retrieval | s2p | 1 | 0 | 0 | 5483 | 0 | 0 | 1422.3 | +| [Touche2020](https://webis.de/events/touche-20/shared-task-1.html) | [mteb/touche2020](https://huggingface.co/datasets/mteb/touche2020) | Touché Task 1: Argument Retrieval for Controversial Questions | Retrieval | s2p | 1 | 0 | 0 | 382594 | 0 | 0 | 1720.1 | +| [TRECCOVID](https://ir.nist.gov/covidSubmit/index.html) | [mteb/trec-covid](https://huggingface.co/datasets/mteb/trec-covid) | TRECCOVID is an ad-hoc search challenge based on the CORD-19 dataset containing scientific articles related to the COVID-19 pandemic | Retrieval | s2p | 1 | 0 | 0 | 171382 | 0 | 0 | 1117.4 | +| [ArguAna-PL](http://argumentation.bplaced.net/arguana/data) | [BeIR-PL/arguana-pl](https://huggingface.co/datasets/clarin-knext/arguana-pl) | ArguAna Counterargs Corpus: given an argument, retrieve the best counterargument from the corpus | Retrieval | p2p | 1 | 0 | 0 | 10080 | 0 | 0 | 1052.9 | +| [DBPedia-PL](https://github.com/iai-group/DBpedia-Entity/) | [BeIR-PL/dbpedia-pl](https://huggingface.co/datasets/clarin-knext/dbpedia-pl) | DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base | Retrieval | s2p | 1 | 0 | 4635989 | 4636322 | 0 | 310.2 | 310.1 | +| [FiQA-PL](https://sites.google.com/view/fiqa/) | [BeIR-PL/fiqa-pl](https://huggingface.co/datasets/clarin-knext/fiqa-pl) | Financial Opinion Mining and Question Answering | Retrieval | s2p | 1 | 0 | 0 | 58286 | 0 | 0 | 760.4 | +| [HotpotQA-PL](https://hotpotqa.github.io/) | [BeIR-PL/hotpotqa-pl](https://huggingface.co/datasets/clarin-knext/hotpotqa-pl) | HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong supervision for supporting facts to enable more explainable question answering systems. | Retrieval | s2p | 1 | 0 | 0 | 5240734 | 0 | 0 | 288.6 | +| [MSMARCO-PL](https://microsoft.github.io/msmarco/) | [BeIR-PL/msmarco-pl](https://huggingface.co/datasets/clarin-knext/msmarco-pl) | MS MARCO is a collection of datasets focused on deep learning in search. Note that the dev set is used for the leaderboard. 
| Retrieval | s2p | 1 | 0 | 8848803 | 8841866 | 0 | 336.6 | 336.8 | +| [NFCorpus-PL](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) | [BeIR-PL/nfcorpus-pl](https://huggingface.co/datasets/clarin-knext/nfcorpus-pl) | NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval | Retrieval | s2p | 1 | 0 | 0 | 3956 | 0 | 0 | 1462.7 | +| [NQ-PL](https://ai.google.com/research/NaturalQuestions/) | [BeIR-PL/nq-pl](https://huggingface.co/datasets/clarin-knext/nq-pl) | Natural Questions: A Benchmark for Question Answering Research | Retrieval | s2p | 1 | 0 | 0 | 2684920 | 0 | 0 | 492.7 | +| [Quora-PL](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | [BeIR-PL/quora-pl](https://huggingface.co/datasets/clarin-knext/quora-pl) | QuoraRetrieval is based on questions that are marked as duplicates on the Quora platform. Given a question, find other (duplicate) questions. | Retrieval | s2s | 1 | 0 | 0 | 532931 | 0 | 0 | 62.9 | +| [SCIDOCS-PL](https://allenai.org/data/scidocs) | [BeIR-PL/scidocs-pl](https://huggingface.co/datasets/clarin-knext/scidocs-pl) | SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation prediction, to document classification and recommendation. | Retrieval | s2p | 1 | 0 | 0 | 26657 | 0 | 0 | 1161.9 | +| [SciFact-PL](https://github.com/allenai/scifact) | [BeIR-PL/scifact-pl](https://huggingface.co/datasets/clarin-knext/scifact-pl) | SciFact verifies scientific claims using evidence from the research literature containing scientific paper abstracts. | Retrieval | s2p | 1 | 0 | 0 | 5483 | 0 | 0 | 1422.3 | +| [SweFAQ](https://spraakbanken.gu.se/en/resources/swefaq) | [AI-Sweden/SuperLim](https://huggingface.co/datasets/AI-Sweden/SuperLim) | Frequently asked questions from Swedish authorities' websites | Retrieval | s2p | 1 | 0 | 0 | 513 | 0 | 0 | 390.57 | +| [BIOSSES](https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html) | [mteb/biosses-sts](https://huggingface.co/datasets/mteb/biosses-sts) | Biomedical Semantic Similarity Estimation. | STS | s2s | 1 | 0 | 0 | 200 | 0 | 0 | 156.6 | +| [SICK-R](https://www.aclweb.org/anthology/S14-2001.pdf) | [mteb/sickr-sts](https://huggingface.co/datasets/mteb/sickr-sts) | Semantic Textual Similarity SICK-R dataset. | STS | s2s | 1 | 0 | 0 | 19854 | 0 | 0 | 46.1 | +| [STS12](https://www.aclweb.org/anthology/S12-1051.pdf) | [mteb/sts12-sts](https://huggingface.co/datasets/mteb/sts12-sts) | SemEval STS 2012 dataset. | STS | s2s | 1 | 4468 | 0 | 6216 | 100.7 | 0 | 64.7 | +| [STS13](https://www.aclweb.org/anthology/S13-1004/) | [mteb/sts13-sts](https://huggingface.co/datasets/mteb/sts13-sts) | SemEval STS 2013 dataset. | STS | s2s | 1 | 0 | 0 | 3000 | 0 | 0 | 54.0 | +| [STS14](http://alt.qcri.org/semeval2014/task10/) | [mteb/sts14-sts](https://huggingface.co/datasets/mteb/sts14-sts) | SemEval STS 2014 dataset. 
Currently only the English dataset is included. | STS | s2s | 1 | 0 | 0 | 7500 | 0 | 0 | 54.3 | +| [STS15](http://alt.qcri.org/semeval2015/task2/) | [mteb/sts15-sts](https://huggingface.co/datasets/mteb/sts15-sts) | SemEval STS 2015 dataset. | STS | s2s | 1 | 0 | 0 | 6000 | 0 | 0 | 57.7 | +| [STS16](http://alt.qcri.org/semeval2016/task1/) | [mteb/sts16-sts](https://huggingface.co/datasets/mteb/sts16-sts) | SemEval STS 2016 dataset. | STS | s2s | 1 | 0 | 0 | 2372 | 0 | 0 | 65.3 | +| [STS17](http://alt.qcri.org/semeval2017/task1/) | [mteb/sts17-crosslingual-sts](https://huggingface.co/datasets/mteb/sts17-crosslingual-sts) | SemEval STS 2017 dataset. | STS | s2s | 11 | 0 | 0 | 500 | 0 | 0 | 43.3 | +| [STS22](https://competitions.codalab.org/competitions/33835) | [mteb/sts22-crosslingual-sts](https://huggingface.co/datasets/mteb/sts22-crosslingual-sts) | SemEval 2022 Task 8: Multilingual News Article Similarity | STS | s2s | 18 | 0 | 0 | 8060 | 0 | 0 | 1992.8 | +| [STSBenchmark](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark) | [mteb/stsbenchmark-sts](https://huggingface.co/datasets/mteb/stsbenchmark-sts) | Semantic Textual Similarity Benchmark (STSbenchmark) dataset. | STS | s2s | 1 | 11498 | 3000 | 2758 | 57.6 | 64.0 | 53.6 | +| [SICK-R-PL](https://aclanthology.org/2020.lrec-1.207.pdf) | [PL-MTEB/sickr-pl-sts](https://huggingface.co/datasets/PL-MTEB/sickr-pl-sts) | Polish version of SICK dataset for textual relatedness. | STS | s2s | 1 | 8878 | 990 | 9812 | 42.9 | 44.0 | 42.8 | +| [CDSC-R](https://aclanthology.org/P17-1073.pdf) | [PL-MTEB/cdscr-sts](https://huggingface.co/datasets/PL-MTEB/cdscr-sts) | Compositional Distributional Semantics Corpus for textual relatedness. | STS | s2s | 1 | 16000 | 2000 | 2000 | 72.1 | 73.2 | 75.0 | +| [SummEval](https://github.com/Yale-LILY/SummEval) | [mteb/summeval](https://huggingface.co/datasets/mteb/summeval) | News Article Summary Semantic Similarity Estimation. | Summarization | s2s | 1 | 0 | 0 | 2800 | 0 | 0 | 359.8 | + +For Chinese tasks, you can refer to [C_MTEB](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB). + + + \ No newline at end of file diff --git a/mteb/__init__.py b/mteb/__init__.py index 955c818dd5..16c164d865 100644 --- a/mteb/__init__.py +++ b/mteb/__init__.py @@ -1,7 +1,9 @@ -__version__ = "1.2.1.dev0" +from importlib.metadata import version from mteb.evaluation import * +__version__ = version("mteb") # fetch version from install metadata + MTEB_MAIN_EN = [ "AmazonCounterfactualClassification", @@ -70,5 +72,5 @@ "TweetSentimentExtractionClassification", "TwentyNewsgroupsClustering", "TwitterSemEval2015", - "TwitterURLCorpus" + "TwitterURLCorpus", ]
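The `__init__.py` change above makes the installed package metadata the single source of truth for the version number. In isolation the pattern looks like the sketch below; the `PackageNotFoundError` fallback is an illustrative assumption for running from an uninstalled source tree, not something this patch adds:

from importlib.metadata import PackageNotFoundError, version

try:
    # Resolve the version declared under [project] in pyproject.toml, so the
    # number is maintained in exactly one place.
    __version__ = version("mteb")
except PackageNotFoundError:
    # Assumed fallback for a source checkout that was never pip-installed.
    __version__ = "0.0.0"
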
diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index e8d02d3226..f6868c6395 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -4,19 +4,18 @@ import pathlib import traceback from datetime import datetime +from importlib.metadata import version from time import time +from typing import List, Union import datasets -from .. import __version__ from ..abstasks import * from ..abstasks import AbsTask, LangMapping from ..tasks import * logger = logging.getLogger(__name__) -from typing import List, Union - class MTEB: def __init__( @@ -27,7 +26,7 @@ def __init__( tasks: List[Union[str, AbsTask]] = None, version=None, err_logs_path="error_logs.txt", - **kwargs + **kwargs, ): """ Create an Evaluation pipeline. The tasks selected @@ -119,7 +118,9 @@ def _display_tasks(self, task_list, name=None): if name: console.rule(f"[bold]{name}\n", style="grey15") for task_type in self.available_task_types: - current_type_tasks = list(filter(lambda x: x.description["type"] == task_type, task_list)) + current_type_tasks = list( + filter(lambda x: x.description["type"] == task_type, task_list) + ) if len(current_type_tasks) == 0: continue else: @@ -138,7 +139,9 @@ def _display_tasks(self, task_list, name=None): if task.is_crosslingual else "" ) - console.print(f"{prefix}{name}{category}{multilingual}{crosslingual}") + console.print( + f"{prefix}{name}{category}{multilingual}{crosslingual}" + ) console.print("\n") @classmethod @@ -168,31 +171,46 @@ def select_tasks(self, **kwargs): # If `task_list` is specified, select list of tasks if self._tasks is not None: - self.tasks = list(filter(lambda x: (x.description["name"] in self._tasks), self.tasks_cls)) + self.tasks = list( + filter(lambda x: (x.description["name"] in self._tasks), self.tasks_cls) + ) if len(self.tasks) != len(self._tasks): tasks_known = set([x.description["name"] for x in self.tasks_cls]) - tasks_unknown = set(x for x in self._tasks if isinstance(x, str)) - tasks_known + tasks_unknown = ( + set(x for x in self._tasks if isinstance(x, str)) - tasks_known + ) if tasks_unknown: - unknown_str, known_str = ",".join(sorted(list(tasks_unknown))), ",".join(sorted(list(tasks_known))) - logger.warning(f"WARNING: Unknown tasks: {unknown_str}. Known tasks: {known_str}.") + unknown_str, known_str = ( + ",".join(sorted(list(tasks_unknown))), + ",".join(sorted(list(tasks_known))), + ) + logger.warning( + f"WARNING: Unknown tasks: {unknown_str}. Known tasks: {known_str}." + ) # add task if subclass of mteb.tasks self.tasks.extend([x for x in self._tasks if isinstance(x, AbsTask)]) return # Otherwise use filters to select tasks filtered_tasks = filter( - lambda x: (self._task_types is None) or (x.description["type"] in self._task_types), self.tasks_cls + lambda x: (self._task_types is None) + or (x.description["type"] in self._task_types), + self.tasks_cls, ) filtered_tasks = filter( - lambda x: (self._task_categories is None) or (x.description["category"] in self._task_categories), + lambda x: (self._task_categories is None) + or (x.description["category"] in self._task_categories), filtered_tasks, ) filtered_tasks = filter( - lambda x: (self._version is None) or (x.description["version"] >= self._version), filtered_tasks + lambda x: (self._version is None) + or (x.description["version"] >= self._version), + filtered_tasks, ) # keep only tasks with at least one language in the filter filtered_tasks = filter( - lambda x: (not (self._task_langs)) or (len(set(x.description["eval_langs"]) & set(self._task_langs)) > 0), + lambda x: (not (self._task_langs)) + or (len(set(x.description["eval_langs"]) & set(self._task_langs)) > 0), + filtered_tasks, ) @@ -216,7 +234,7 @@ def run( eval_splits=None, overwrite_results=False, raise_error: bool = True, - **kwargs + **kwargs, ): """ Run the evaluation pipeline on the selected tasks. 
@@ -251,18 +269,28 @@ def run( evaluation_results = {} while len(self.tasks) > 0: task = self.tasks[0] - logger.info(f"\n\n********************** Evaluating {task.description['name']} **********************") + logger.info( + f"\n\n********************** Evaluating {task.description['name']} **********************" + ) # skip evaluation if results folder exists and overwrite_results is False if output_folder is not None: - save_path = os.path.join(output_folder, f"{task.description['name']}{task.save_suffix}.json") + save_path = os.path.join( + output_folder, f"{task.description['name']}{task.save_suffix}.json" + ) if os.path.exists(save_path) and overwrite_results is False: - logger.warning(f"WARNING: {task.description['name']} results already exists. Skipping.") + logger.warning( + f"WARNING: {task.description['name']} results already exists. Skipping." + ) del self.tasks[0] continue try: - task_eval_splits = eval_splits if eval_splits is not None else task.description.get("eval_splits", []) + task_eval_splits = ( + eval_splits + if eval_splits is not None + else task.description.get("eval_splits", []) + ) # load data logger.info(f"Loading dataset for {task.description['name']}") @@ -270,15 +298,19 @@ def run( # run evaluation task_results = { - "mteb_version": __version__, + "mteb_version": version("mteb"), "dataset_revision": task.description.get("revision", None), "mteb_dataset_name": task.description["name"], } for split in task_eval_splits: tick = time() - results = task.evaluate(model, split, output_folder=output_folder, **kwargs) + results = task.evaluate( + model, split, output_folder=output_folder, **kwargs + ) tock = time() - logger.info(f"Evaluation for {task.description['name']} on {split} took {tock - tick:.2f} seconds") + logger.info( + f"Evaluation for {task.description['name']} on {split} took {tock - tick:.2f} seconds" + ) results["evaluation_time"] = round(tock - tick, 2) task_results[split] = results if verbosity >= 1: @@ -295,7 +327,9 @@ def run( logger.error(f"Error while evaluating {task.description['name']}: {e}") if raise_error: raise e - logger.error(f"Please check all the error logs at: {self.err_logs_path}") + logger.error( + f"Please check all the error logs at: {self.err_logs_path}" + ) with open(self.err_logs_path, "a") as f_out: f_out.write(f"{datetime.now()} >>> {task.description['name']}\n") f_out.write(traceback.format_exc()) diff --git a/pyproject.toml b/pyproject.toml index 9c886255ee..4c139a91eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,125 @@ [build-system] -requires = ["pbr>=5.7.0", "setuptools>=36.6.0"] -build-backend = "pbr.build" +requires = ["setuptools>=42", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "mteb" +version = "1.2.1.dev0" +description = "Massive Text Embedding Benchmark" +readme = "README.md" +authors = [ + { name = "MTEB Contributors", email = "niklas@huggingface.co" }, + { email = "nouamane@huggingface.co" }, + { email = "info@nils-reimers.de" } +] +license = { file = "LICENSE" } +keywords = ["deep learning", "text embeddings", "benchmark"] +classifiers = [ + "Development Status :: 4 - Beta", + "Environment :: Console", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python" +] +requires-python = ">=3.8" +dependencies = [ + "datasets>=2.2.0", + "jsonlines", + "numpy", + "requests>=2.26.0", + "scikit_learn>=1.0.2", + "scipy", + 
"sentence_transformers>=2.2.0", + "torch", + "tqdm", + "rich", + "pytrec_eval" +] + + +[project.urls] +homepage = "https://github.com/embeddings-benchmark/mteb" +"Huggingface Organization" = "https://huggingface.co/mteb" +"Source Code" = "https://github.com/embeddings-benchmark/mteb" + +[project.scripts] +mteb = "mteb.cmd:main" + +[project.optional-dependencies] +dev = [ + "flake8", + "Flake8-pyproject>=1.2.3", + "isort", + "black==24.2.0", + "pytest", + "pytest-xdist" +] + + +[tool.setuptools.packages.find] +exclude = ["tests", "results"] [tool.black] line-length = 119 target-version = ['py35'] [tool.isort] -profile = "black" \ No newline at end of file +profile = "black" +default_section = "FIRSTPARTY" +ensure_newline_before_comments = true +force_grid_wrap = 0 +include_trailing_comma = true +known_first_party = "transformers" +known_third_party = [ + "absl", + "conllu", + "datasets", + "elasticsearch", + "fairseq", + "faiss-cpu", + "fastprogress", + "fire", + "fugashi", + "git", + "h5py", + "matplotlib", + "nltk", + "numpy", + "packaging", + "pandas", + "PIL", + "psutil", + "pytest", + "pytorch_lightning", + "rouge_score", + "sacrebleu", + "seqeval", + "sklearn", + "streamlit", + "tensorboardX", + "tensorflow", + "tensorflow_datasets", + "timeout_decorator", + "torch", + "torchaudio", + "torchtext", + "torchvision", + "torch_xla", + "tqdm", +] +line_length = 119 +lines_after_imports = 2 +multi_line_output = 3 +use_parentheses = true + +[tool.flake8] +ignore = [ + "E203", + "E501", + "E741", + "W503", + "W605", +] +max-line-length = 119 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index bfc0891a62..0000000000 --- a/setup.cfg +++ /dev/null @@ -1,54 +0,0 @@ -[metadata] -description-file = README.md - -[isort] -default_section = FIRSTPARTY -ensure_newline_before_comments = True -force_grid_wrap = 0 -include_trailing_comma = True -known_first_party = transformers -known_third_party = - absl - conllu - datasets - elasticsearch - fairseq - faiss-cpu - fastprogress - fire - fugashi - git - h5py - matplotlib - nltk - numpy - packaging - pandas - PIL - psutil - pytest - pytorch_lightning - rouge_score - sacrebleu - seqeval - sklearn - streamlit - tensorboardX - tensorflow - tensorflow_datasets - timeout_decorator - torch - torchaudio - torchtext - torchvision - torch_xla - tqdm - -line_length = 119 -lines_after_imports = 2 -multi_line_output = 3 -use_parentheses = True - -[flake8] -ignore = E203, E501, E741, W503, W605 -max-line-length = 119 \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index e2e83b8cfe..0000000000 --- a/setup.py +++ /dev/null @@ -1,94 +0,0 @@ -"""MTEB is an open library for benchmarking embeddings. -Note: - VERSION needs to be formatted following the MAJOR.MINOR.PATCH convention - (we need to follow this convention to be able to retrieve versioned scripts) -Inspired by: https://github.com/huggingface/datasets/blob/main/setup.py -To create the package for pypi. -0. Prerequisites: - - Dependencies: - - twine: "pip install twine" - - wheel: "pip install wheel" - - Create an account in (and join the 'datasets' project): - - PyPI: https://pypi.org/ - - Test PyPI: https://test.pypi.org/ -1. Change the version in: - - mteb/__init__.py - - setup.py -2. Commit these changes: "git commit -m 'Release: VERSION'" -3. Add a tag in git to mark the release: "git tag VERSION -m 'Add tag VERSION for pypi'" - Push the tag to remote: git push --tags origin main -4. Build both the sources and the wheel. 
Do not change anything in setup.py between - creating the wheel and the source distribution (obviously). - First, delete any "build" directory that may exist from previous builds. - For the wheel, run: "python setup.py bdist_wheel" in the top level directory. - (this will build a wheel for the python version you use to build it). - For the sources, run: "python setup.py sdist" - You should now have a /dist directory with both .whl and .tar.gz source versions. -5. OPTIONAL: Check that everything looks correct by uploading the package to the pypi test server: - twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/ - Check that you can install it in a virtualenv/notebook by running: - pip install huggingface_hub fsspec aiohttp - pip install -U tqdm - pip install -i https://testpypi.python.org/pypi datasets -6. Upload the final version to actual pypi: - twine upload dist/* -r pypi -7. Fill release notes in the tag in github once everything is looking hunky-dory. -8. Change the version in __init__.py and setup.py to X.X.X+1.dev0 (e.g. VERSION=1.18.3 -> 1.18.4.dev0). - Then push the change with a message 'set dev version' -""" - - -from setuptools import find_packages, setup - -with open("README.md", mode="r", encoding="utf-8") as readme_file: - readme = readme_file.read() - -setup( - name="mteb", - version="1.2.1.dev0", - description="Massive Text Embedding Benchmark", - long_description=readme, - long_description_content_type="text/markdown", - keywords="deep learning, text embeddings, benchmark", - license="Apache", - author="MTEB Contributors (https://github.com/embeddings-benchmark/mteb/graphs/contributors)", - author_email="niklas@huggingface.co, nouamane@huggingface.co, info@nils-reimers.de", - url="https://github.com/embeddings-benchmark/mteb", - project_urls={ - "Huggingface Organization": "https://huggingface.co/mteb", - "Source Code": "https://github.com/embeddings-benchmark/mteb", - }, - packages=find_packages(), - entry_points={ - "console_scripts": [ - "mteb=mteb.cmd:main", - ] - }, - python_requires=">=3.7.0", - install_requires=[ - "datasets>=2.2.0", - "jsonlines", - "numpy", - "requests>=2.26.0", - "scikit_learn>=1.0.2", - "scipy", - "sentence_transformers>=2.2.0", - "torch", - "tqdm", - "rich", - "pytrec_eval", - ], - # optional dependencies - extras_require={ - "dev": ["flake8", "isort", "black==24.2.0", "pytest", "pytest-xdist"] - }, - classifiers=[ - "Development Status :: 4 - Beta", - "Environment :: Console", - "Intended Audience :: Developers", - "Intended Audience :: Information Technology", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - "Programming Language :: Python", - ], -) diff --git a/tests/test_all_abstasks.py b/tests/test_all_abstasks.py index 9b3373e183..ff578b54f4 100644 --- a/tests/test_all_abstasks.py +++ b/tests/test_all_abstasks.py @@ -1,25 +1,61 @@ import logging +from typing import Union +import pytest from sentence_transformers import SentenceTransformer from mteb import MTEB +from mteb.abstasks import AbsTask from mteb.tasks.BitextMining import BUCCBitextMining logging.basicConfig(level=logging.INFO) +from mteb import MTEB -def test_mteb_tasks(): +def test_two_mteb_tasks(): + """ + Test that two tasks can be fetched and run + """ model = SentenceTransformer("average_word_embeddings_komninos") eval = MTEB( tasks=[ - BUCCBitextMining(), - "Banking77Classification", - "TwentyNewsgroupsClustering", - "SciDocsRR", - "SprintDuplicateQuestions", - "NFCorpus", "STS12", "SummEval", ] ) 
- eval.run(model) + eval.run(model, output_folder="tests/results", overwrite_results=True) + + +@pytest.mark.parametrize( + "task", + [ + BUCCBitextMining(), + "TwentyNewsgroupsClustering", + "Banking77Classification", + "SciDocsRR", + "SprintDuplicateQuestions", + "NFCorpus", + "STS12", + "SummEval", + ], +) +@pytest.mark.parametrize( + "model_name", + [ + "average_word_embeddings_komninos", + ], +) +def test_mteb_task(task: Union[str, AbsTask], model_name: str): + """ + Test that a task can be fetched and run + """ + model = SentenceTransformer(model_name) + eval = MTEB(tasks=[task]) + eval.run(model, output_folder="tests/results", overwrite_results=True) + + +def test_all_tasks_fetch(): + """ + Test that all tasks can be fetched + """ + MTEB.mteb_tasks()
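Taken together, the refactored MTEB class and the tests above suggest the following end-to-end usage; this is an illustrative sketch rather than part of the patch, and the task and model names are simply the ones the tests happen to use:

from sentence_transformers import SentenceTransformer

from mteb import MTEB

# List every registered task, grouped by type, without downloading any
# datasets (this is exactly what test_all_tasks_fetch exercises).
MTEB.mteb_tasks()

# Run a single task the way the parametrized test does. Existing result files
# are skipped unless overwrite_results=True, and with raise_error=False a
# failing task is logged to error_logs.txt instead of aborting the run.
model = SentenceTransformer("average_word_embeddings_komninos")
evaluation = MTEB(tasks=["Banking77Classification"])
evaluation.run(
    model,
    output_folder="tests/results",
    overwrite_results=True,
    raise_error=False,
)
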
From aa9234cc24f6dd3408961895d092ee019551fab2 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Wed, 20 Mar 2024 18:00:40 +0100 Subject: [PATCH 2/4] docs: typos in readme (#268) --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index fee7ab8da6..553794358c 100644 --- a/README.md +++ b/README.md @@ -222,8 +222,6 @@ evaluation.run(model) | 📈 [Leaderboard] | The interactive leaderboard of the benchmark | | 🤖 [Adding a model] | Information related to how to submit a model to the leaderboard | | 🤝 [Contributing] | How to contribute to MTEB and set it up for development | -| - [Tasks]: docs/tasks.md [Contributing]: docs/contributing.md From 023e8817f108a76718fc37f7c8937e000de56786 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Wed, 20 Mar 2024 18:02:27 +0100 Subject: [PATCH 3/4] replaced linter with ruff (#265) * restructing the readme * removed double specification of versions and moved all setup to pyproject.toml * correctly use flat-layout for the package * replaced linter with ruff * rerun tests * ci: Added in newer workflows; some of them are disabled as they require other issues to be solved * Update Makefile Co-authored-by: Niklas Muennighoff --------- Co-authored-by: Niklas Muennighoff --- .github/disabled_workflows/lint.yml | 28 +++++++++ .github/disabled_workflows/release.yml | 50 ++++++++++++++++ .github/workflows/python-package.yml | 36 ----------- .github/workflows/tests.yml | 43 ++++++++++++++ .vscode/settings.json | 2 +- Makefile | 48 +++------------ pyproject.toml | 82 +++++++------------------- 7 files changed, 152 insertions(+), 137 deletions(-) create mode 100644 .github/disabled_workflows/lint.yml create mode 100644 .github/disabled_workflows/release.yml delete mode 100644 .github/workflows/python-package.yml create mode 100644 .github/workflows/tests.yml diff --git a/.github/disabled_workflows/lint.yml b/.github/disabled_workflows/lint.yml new file mode 100644 index 0000000000..bb38cb8fd0 --- /dev/null +++ b/.github/disabled_workflows/lint.yml @@ -0,0 +1,28 @@ +# GitHub action to run linting + +name: run-linting + +on: + pull_request: + branches: [main] + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: "3.8" + cache: "pip" + + - name: Install dependencies + run: make install + + - name: Lint + id: lint + run: | + make lint diff --git a/.github/disabled_workflows/release.yml b/.github/disabled_workflows/release.yml new file mode 100644 index 0000000000..c875231e9f --- /dev/null +++ b/.github/disabled_workflows/release.yml @@ -0,0 +1,50 @@ +# This workflow will +# - Find the latest version tag based on the commit history +# - Create a git tag for the new version +# - Update the version number in pyproject.toml based on the commit history +# - Upload the package to PyPI +# - Create a release on GitHub + +# This workflow requires the following secrets to be set: +# - a GitHub personal access token with the `repo` scope called `RELEASE` +# - and that you setup trusted publishing using PyPI as described here: https://blog.pypi.org/posts/2023-04-20-introducing-trusted-publishers/ + +name: Release +on: + workflow_run: + workflows: ["tests"] + types: + - completed +jobs: + release: + runs-on: ubuntu-latest + concurrency: release + permissions: + id-token: write # IMPORTANT: this permission is mandatory for trusted publishing using PyPI + + + if: ${{ github.ref == 'refs/heads/main' && github.event.workflow_run.conclusion == 'success'}} + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + token: ${{ secrets.RELEASE }} + + - name: Python Semantic Release + id: release + uses: python-semantic-release/python-semantic-release@v8.0.4 + with: + github_token: ${{ secrets.RELEASE }} + + - name: Publish package distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + if: steps.release.outputs.released == 'true' + # This action supports PyPI's trusted publishing implementation, which allows authentication to PyPI without a manually + # configured API token or username/password combination. To perform trusted publishing with this action, your project's + # publisher must already be configured on PyPI. + + - name: Publish package distributions to GitHub Releases + uses: python-semantic-release/upload-to-gh-release@main + if: steps.release.outputs.released == 'true' + with: + github_token: ${{ secrets.RELEASE }} diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml deleted file mode 100644 index 712d950a0f..0000000000 --- a/.github/workflows/python-package.yml +++ /dev/null @@ -1,36 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - -name: Python package - -on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] - -jobs: - build: - - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: ["3.8", "3.9", "3.10"] - - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python-version }} - name: Install dependencies - run: | - python -m pip install --upgrade pip - make install - name: Lint with flake8 - run: | - make lint - name: Test with pytest - run: | - make test diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000000..67c71e53a3 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,43 @@ +# This workflow will: + +# 1) install Python dependencies + +# 2) run make test + + +name: Tests +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + pytest: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest] #, macos-latest, windows-latest] + python-version: ["3.8", "3.9", "3.10"] + + # This allows a subsequently queued workflow run to interrupt previous runs + concurrency: + group: "${{ github.workflow }}-${{ matrix.python-version}}-${{ matrix.os }} @ ${{ github.ref }}" + cancel-in-progress: true + + steps: + - uses: actions/checkout@v3 + + - name: Setup Python ${{ 
matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + + - name: Install dependencies + shell: bash + run: | + make install + + - name: Run tests + shell: bash + run: | + make test diff --git a/.vscode/settings.json b/.vscode/settings.json index 2a79dd386f..b52f600915 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -4,5 +4,5 @@ ], "python.testing.unittestEnabled": false, "python.testing.pytestEnabled": true, - "editor.defaultFormatter": "ms-python.black-formatter" + "editor.defaultFormatter": "charliermarsh.ruff", } diff --git a/Makefile b/Makefile index b8ac81aca0..134210f7ea 100644 --- a/Makefile +++ b/Makefile @@ -1,49 +1,17 @@ -.PHONY: modified_only_fixup quality style fixup tests - -check_dirs := tests mteb scripts - -modified_only_fixup: - $(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs))) - @if test -n "$(modified_py_files)"; then \ - echo "Checking/fixing $(modified_py_files)"; \ - black --preview $(modified_py_files); \ - isort $(modified_py_files); \ - flake8 $(modified_py_files); \ - else \ - echo "No library .py files were modified"; \ - fi - -# Super fast fix and check target that only works on relevant modified files since the branch was made -fixup: modified_only_fixup - - -# This installs all the required dependencies install: + @echo "--- 🚀 Installing project dependencies ---" pip install -e ".[dev]" -# this target runs checks on all files -quality: - black --check --preview $(check_dirs) - isort --check-only $(check_dirs) - flake8 $(check_dirs) - - -# this target runs checks on all files and potentially modifies some of them -style: - black --preview $(check_dirs) - isort $(check_dirs) - -# runs the same lints as the github actions lint: - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + @echo "--- 🧹 Running linters ---" + ruff format . # running ruff formatting + ruff check . --fix # running ruff linting -# Run tests for the library test: + @echo "--- 🧪 Running tests ---" pytest -# add parllel test for faster execution (can sometimes cause issues with some tests) test-parallel: - pytest -n auto --dist=loadfile -s -v \ No newline at end of file + @echo "--- 🧪 Running tests ---" + @echo "Note that parallel tests can sometimes cause issues with some tests." 
pytest -n auto --dist=loadfile -s -v diff --git a/pyproject.toml b/pyproject.toml index 4c139a91eb..d14c10a029 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,10 +49,7 @@ mteb = "mteb.cmd:main" [project.optional-dependencies] dev = [ - "flake8", - "Flake8-pyproject>=1.2.3", - "isort", - "black==24.2.0", + "ruff>=0.0.254", "pytest", "pytest-xdist" ] @@ -61,65 +58,30 @@ dev = [ [tool.setuptools.packages.find] exclude = ["tests", "results"] -[tool.black] -line-length = 119 -target-version = ['py35'] +[tool.ruff] +target-version = "py38" -[tool.isort] -profile = "black" -default_section = "FIRSTPARTY" -ensure_newline_before_comments = true -force_grid_wrap = 0 -include_trailing_comma = true -known_first_party = "transformers" -known_third_party = [ - "absl", - "conllu", - "datasets", - "elasticsearch", - "fairseq", - "faiss-cpu", - "fastprogress", - "fire", - "fugashi", - "git", - "h5py", - "matplotlib", - "nltk", - "numpy", - "packaging", - "pandas", - "PIL", - "psutil", - "pytest", - "pytorch_lightning", - "rouge_score", - "sacrebleu", - "seqeval", - "sklearn", - "streamlit", - "tensorboardX", - "tensorflow", - "tensorflow_datasets", - "timeout_decorator", - "torch", - "torchaudio", - "torchtext", - "torchvision", - "torch_xla", - "tqdm", -] -line_length = 119 -lines_after_imports = 2 -multi_line_output = 3 -use_parentheses = true - -[tool.flake8] +[tool.ruff.lint] ignore = [ "E203", "E501", "E741", - "W503", - "W605", + "F403" ] -max-line-length = 119 \ No newline at end of file +ignore-init-module-imports = true + +[tool.ruff.lint.pydocstyle] +convention = "google" + +[tool.ruff.lint.flake8-annotations] +mypy-init-return = true +suppress-none-returning = true + +[tool.semantic_release] +branch = "main" +version_toml = ["pyproject.toml:project.version"] +build_command = "python -m pip install build; python -m build" + +[tool.semantic_release.commit_parser_options] +minor_types = ["feat"] +patch_types = ["*", "fix", "perf"] # "*" means always patch From dd5d61724e71b2cdba9f9cf7e01fbed1b81cb423 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 21 Mar 2024 10:38:20 +0100 Subject: [PATCH 4/4] refactor: add metadata basemodel (#260) * refactor: rename description to metadata dict * refactor: add TaskMetadata and first example * update 9 files * update TaskMetadata.py * update TaskMetadata.py * update TaskMetadata.py * update LICENSE, TaskMetadata.py and requirements.dev.txt * update 151 files * update 150 files * update 43 files and delete 1 file * update 106 files * update 45 files * update 6 files * update 14 files * Added model results to repo and updated CLI to create consistent folder structure. (#254) * Added model results to repo and updated CLI to create consistent folder structure. 
* ci: updated ci to use make install * Added missing pytest dependencies * Update README.md Co-authored-by: Niklas Muennighoff --------- Co-authored-by: Niklas Muennighoff * Restructing the readme (#262) * restructing the readme * removed double specification of versions and moved all setup to pyproject.toml * correctly use flat-layout for the package * build(deps): update TaskMetadata.py and pyproject.toml * update 221 files * build(deps): update pyproject.toml * build(deps): update pyproject.toml * build(deps): update pyproject.toml --------- Co-authored-by: Kenneth Enevoldsen Co-authored-by: Niklas Muennighoff --- mteb/__init__.py | 2 + mteb/abstasks/AbsTask.py | 10 +- mteb/abstasks/AbsTaskBitextMining.py | 14 +- mteb/abstasks/AbsTaskClassification.py | 25 +- mteb/abstasks/AbsTaskClustering.py | 14 +- mteb/abstasks/AbsTaskPairClassification.py | 8 +- mteb/abstasks/AbsTaskReranking.py | 4 +- mteb/abstasks/AbsTaskRetrieval.py | 18 +- mteb/abstasks/AbsTaskSTS.py | 12 +- mteb/abstasks/AbsTaskSummarization.py | 12 +- mteb/abstasks/CrosslingualTask.py | 12 +- mteb/abstasks/LangMapping.py | 3 + mteb/abstasks/MultilingualTask.py | 16 +- mteb/abstasks/TaskMetadata.py | 116 ++++++ mteb/abstasks/__init__.py | 2 + mteb/cmd.py | 1 + mteb/evaluation/MTEB.py | 63 +-- mteb/evaluation/__init__.py | 2 + .../evaluators/BitextMiningEvaluator.py | 31 +- .../evaluators/ClassificationEvaluator.py | 68 +++- .../evaluators/ClusteringEvaluator.py | 28 +- mteb/evaluation/evaluators/Evaluator.py | 2 + .../evaluators/PairClassificationEvaluator.py | 23 +- .../evaluators/RerankingEvaluator.py | 58 ++- .../evaluators/RetrievalEvaluator.py | 234 +++++++---- mteb/evaluation/evaluators/STSEvaluator.py | 18 +- .../evaluators/SummarizationEvaluator.py | 31 +- mteb/evaluation/evaluators/__init__.py | 2 + mteb/evaluation/evaluators/utils.py | 123 +++--- mteb/logging.py | 6 +- mteb/tasks/BitextMining/__init__.py | 2 + .../BitextMining/da/BornholmskBitextMining.py | 48 ++- .../multilingual/BUCCBitextMining.py | 42 +- .../multilingual/DiaBLaBitextMining.py | 61 +-- .../multilingual/FloresBitextMining.py | 56 ++- .../NorwegianCourtsBitextMining.py | 48 ++- .../multilingual/TatoebaBitextMining.py | 42 +- mteb/tasks/Classification/__init__.py | 20 +- .../da/AngryTweetsClassification.py | 46 ++- .../Classification/da/DKHateClassification.py | 53 ++- .../Classification/da/DalajClassification.py | 62 ++- .../DanishPoliticalCommentsClassification.py | 49 ++- .../{ => da}/DdiscoCohesionClassification.py | 95 +++-- .../da/LccSentimentClassification.py | 46 ++- .../en/AmazonPolarityClassification.py | 42 +- .../en/Banking77Classification.py | 42 +- .../en/EmotionClassification.py | 50 ++- .../Classification/en/ImdbClassification.py | 42 +- .../en/ToxicConversationsClassification.py | 52 ++- .../TweetSentimentExtractionClassification.py | 47 ++- .../AmazonCounterfactualClassification.py | 51 ++- .../AmazonReviewsClassification.py | 45 ++- .../multilingual/MTOPDomainClassification.py | 42 +- .../multilingual/MTOPIntentClassification.py | 42 +- .../multilingual/MasakhaNEWSClassification.py | 46 ++- .../MassiveIntentClassification.py | 45 ++- .../MassiveScenarioClassification.py | 45 ++- .../multilingual/NordicLangClassification.py | 50 ++- .../multilingual/ScalaClassification.py | 203 ++++++---- .../Classification/nb/NoRecClassification.py | 43 +- .../nb/NorwegianParliamentClassification.py | 43 +- .../Classification/pl/PolishClassification.py | 196 +++++++--- .../Classification/sv/SweRecClassification.py | 43 +- 
.../Classification/zh/CMTEBClassification.py | 252 ++++++++---- mteb/tasks/Clustering/__init__.py | 28 +- .../Clustering/de/BlurbsClusteringP2P.py | 41 +- .../Clustering/de/BlurbsClusteringS2S.py | 42 +- .../Clustering/de/TenKGnadClusteringP2P.py | 42 +- .../Clustering/de/TenKGnadClusteringS2S.py | 42 +- .../tasks/Clustering/en/ArxivClusteringP2P.py | 45 ++- .../tasks/Clustering/en/ArxivClusteringS2S.py | 44 ++- .../Clustering/en/BigPatentClustering.py | 45 ++- .../Clustering/en/BiorxivClusteringP2P.py | 43 +- .../Clustering/en/BiorxivClusteringS2S.py | 41 +- .../Clustering/en/MedrxivClusteringP2P.py | 44 ++- .../Clustering/en/MedrxivClusteringS2S.py | 42 +- mteb/tasks/Clustering/en/RedditClustering.py | 45 ++- .../Clustering/en/RedditClusteringP2P.py | 44 ++- .../Clustering/en/StackExchangeClustering.py | 30 +- .../en/StackExchangeClusteringP2P.py | 45 ++- .../en/TwentyNewsgroupsClustering.py | 42 +- .../Clustering/en/WikiCitiesClustering.py | 45 ++- .../Clustering/es/FloresClusteringS2S.py | 44 ++- .../Clustering/es/SpanishNewsClusteringP2P.py | 34 +- .../Clustering/fr/AlloProfClusteringP2P.py | 48 ++- .../Clustering/fr/AlloProfClusteringS2S.py | 50 ++- mteb/tasks/Clustering/fr/HALClusteringS2S.py | 48 ++- .../tasks/Clustering/fr/MLSUMClusteringP2P.py | 54 ++- .../tasks/Clustering/fr/MLSUMClusteringS2S.py | 62 ++- .../multilingual/MasakhaNEWSClusteringP2P.py | 54 ++- .../multilingual/MasakhaNEWSClusteringS2S.py | 56 ++- mteb/tasks/Clustering/pl/PolishClustering.py | 43 +- mteb/tasks/Clustering/zh/CMTEBClustering.py | 162 +++++--- mteb/tasks/PairClassification/__init__.py | 10 +- .../en/SprintDuplicateQuestionsPC.py | 42 +- .../en/TwitterSemEval2015PC.py | 42 +- .../en/TwitterURLCorpusPC.py | 42 +- .../multilingual/OpusparcusPC.py | 53 ++- .../PairClassification/multilingual/PawsX.py | 46 ++- mteb/tasks/PairClassification/pl/PolishPC.py | 152 +++++--- .../zh/CMTEBPairClassification.py | 79 ++-- mteb/tasks/Reranking/__init__.py | 9 +- .../Reranking/en/AskUbuntuDupQuestions.py | 45 ++- mteb/tasks/Reranking/en/MindSmallReranking.py | 42 +- mteb/tasks/Reranking/en/SciDocsReranking.py | 42 +- .../Reranking/en/StackOverflowDupQuestions.py | 44 ++- mteb/tasks/Reranking/fr/AlloprofReranking.py | 45 ++- mteb/tasks/Reranking/fr/SyntecReranking.py | 42 +- .../Reranking/multilingual/MIRACLReranking.py | 46 ++- mteb/tasks/Reranking/zh/CMTEBReranking.py | 143 +++++-- mteb/tasks/Retrieval/__init__.py | 53 ++- mteb/tasks/Retrieval/de/GerDaLIRRetrieval.py | 55 ++- mteb/tasks/Retrieval/de/GermanDPRRetrieval.py | 63 ++- .../tasks/Retrieval/de/GermanQuADRetrieval.py | 68 ++-- mteb/tasks/Retrieval/en/ArguAnaRetrieval.py | 42 +- .../en/CQADupstackAndroidRetrieval.py | 32 +- .../en/CQADupstackEnglishRetrieval.py | 42 +- .../en/CQADupstackGamingRetrieval.py | 42 +- .../Retrieval/en/CQADupstackGisRetrieval.py | 42 +- .../en/CQADupstackMathematicaRetrieval.py | 42 +- .../en/CQADupstackPhysicsRetrieval.py | 42 +- .../en/CQADupstackProgrammersRetrieval.py | 42 +- .../Retrieval/en/CQADupstackStatsRetrieval.py | 42 +- .../Retrieval/en/CQADupstackTexRetrieval.py | 42 +- .../Retrieval/en/CQADupstackUnixRetrieval.py | 42 +- .../en/CQADupstackWebmastersRetrieval.py | 42 +- .../en/CQADupstackWordpressRetrieval.py | 42 +- .../Retrieval/en/ClimateFEVERRetrieval.py | 45 ++- mteb/tasks/Retrieval/en/DBPediaRetrieval.py | 44 ++- mteb/tasks/Retrieval/en/FEVERRetrieval.py | 50 ++- mteb/tasks/Retrieval/en/FiQA2018Retrieval.py | 42 +- mteb/tasks/Retrieval/en/HagridRetrieval.py | 80 ++-- mteb/tasks/Retrieval/en/HotpotQARetrieval.py | 
48 ++- mteb/tasks/Retrieval/en/MSMARCORetrieval.py | 42 +- mteb/tasks/Retrieval/en/MSMARCOv2Retrieval.py | 42 +- mteb/tasks/Retrieval/en/NFCorpusRetrieval.py | 42 +- mteb/tasks/Retrieval/en/NQRetrieval.py | 42 +- .../Retrieval/en/NarrativeQARetrieval.py | 73 +++- mteb/tasks/Retrieval/en/QuoraRetrieval.py | 47 ++- mteb/tasks/Retrieval/en/SCIDOCSRetrieval.py | 47 ++- mteb/tasks/Retrieval/en/SciFactRetrieval.py | 42 +- mteb/tasks/Retrieval/en/TRECCOVIDRetrieval.py | 42 +- .../tasks/Retrieval/en/Touche2020Retrieval.py | 42 +- .../es/SpanishPassageRetrievalS2P.py | 73 +++- .../es/SpanishPassageRetrievalS2S.py | 73 +++- mteb/tasks/Retrieval/fr/AlloprofRetrieval.py | 64 ++- mteb/tasks/Retrieval/fr/BSARDRetrieval.py | 78 ++-- mteb/tasks/Retrieval/fr/SyntecRetrieval.py | 66 ++-- mteb/tasks/Retrieval/ko/KoMiracl.py | 42 +- mteb/tasks/Retrieval/ko/KoMrtydi.py | 42 +- mteb/tasks/Retrieval/ko/KoStrategyQA.py | 42 +- .../Retrieval/multilingual/MIRACLRetrieval.py | 56 ++- .../multilingual/MintakaRetrieval.py | 61 ++- .../multilingual/MultiLongDocRetrieval.py | 100 +++-- .../multilingual/XMarketRetrieval.py | 63 +-- .../Retrieval/multilingual/XPQARetrieval.py | 56 ++- mteb/tasks/Retrieval/pl/ArguAnaPLRetrieval.py | 43 +- mteb/tasks/Retrieval/pl/DBPediaPLRetrieval.py | 45 ++- mteb/tasks/Retrieval/pl/FiQAPLRetrieval.py | 43 +- .../tasks/Retrieval/pl/HotpotQAPLRetrieval.py | 46 ++- mteb/tasks/Retrieval/pl/MSMARCOPLRetrieval.py | 43 +- .../tasks/Retrieval/pl/NFCorpusPLRetrieval.py | 43 +- mteb/tasks/Retrieval/pl/NQPLRetrieval.py | 43 +- mteb/tasks/Retrieval/pl/QuoraPLRetrieval.py | 46 ++- mteb/tasks/Retrieval/pl/SCIDOCSPLRetrieval.py | 46 ++- mteb/tasks/Retrieval/pl/SciFactPLRetrieval.py | 43 +- .../Retrieval/pl/TRECCOVIDPLRetrieval.py | 43 +- mteb/tasks/Retrieval/zh/CMTEBRetrieval.py | 367 +++++++++++------- mteb/tasks/STS/__init__.py | 14 +- mteb/tasks/STS/de/GermanSTSBenchmarkSTS.py | 49 ++- mteb/tasks/STS/en/BiossesSTS.py | 47 ++- mteb/tasks/STS/en/STS12STS.py | 47 ++- mteb/tasks/STS/en/STS13STS.py | 47 ++- mteb/tasks/STS/en/STS14STS.py | 47 ++- mteb/tasks/STS/en/STS15STS.py | 47 ++- mteb/tasks/STS/en/STS16STS.py | 47 ++- mteb/tasks/STS/en/STSBenchmarkSTS.py | 47 ++- mteb/tasks/STS/en/SickrSTS.py | 47 ++- mteb/tasks/STS/es/STSES.py | 56 ++- mteb/tasks/STS/fr/SickFrSTS.py | 67 ++-- .../STS/multilingual/STS17CrosslingualSTS.py | 61 ++- .../STS/multilingual/STS22CrosslingualSTS.py | 87 +++-- .../STSBenchmarkMultilingualSTS.py | 89 +++-- mteb/tasks/STS/pl/PolishSTS.py | 87 +++-- mteb/tasks/STS/zh/CMTEBSTS.py | 292 +++++++++----- mteb/tasks/Summarization/__init__.py | 4 +- .../Summarization/en/SummEvalSummarization.py | 47 ++- .../fr/SummEvalFrSummarization.py | 47 ++- mteb/tasks/__init__.py | 2 + pyproject.toml | 23 +- scripts/data/amazon_polarity/create_data.py | 15 +- .../data/amazon_reviews_multi/create_data.py | 23 +- scripts/data/arxiv/script_clustering.py | 6 +- scripts/data/arxiv/script_raw.py | 6 +- scripts/data/biorxiv/script_clustering.py | 6 +- scripts/data/biorxiv/script_raw.py | 6 +- scripts/data/bucc/create_data.py | 3 +- scripts/data/create_task_table.py | 31 +- scripts/data/germanquad/process_data.py | 34 +- scripts/data/hal/create_data.py | 25 +- scripts/data/imdb/create_data.py | 11 +- scripts/data/medrxiv/script_clustering.py | 6 +- scripts/data/medrxiv/script_raw.py | 6 +- scripts/data/mind/prepare_data.py | 11 +- scripts/data/redditp2p/script_clustering.py | 6 +- .../stackexchangep2p/script_clustering.py | 10 +- .../sts22-crosslingual-sts/create_data.py | 11 +- 
scripts/data/summeval_fr/create_data.py | 40 +- .../toxic_conversations_50k/create_data.py | 3 +- scripts/merge_cqadupstack.py | 11 +- scripts/mteb_meta.py | 40 +- scripts/run_mteb_chinese.py | 7 +- scripts/run_mteb_english.py | 13 +- scripts/run_mteb_french.py | 24 +- scripts/run_mteb_german.py | 18 +- scripts/run_mteb_korean.py | 30 +- scripts/run_mteb_polish.py | 29 +- tests/test_ClusteringEvaluator.py | 2 + tests/test_PairClassificationEvaluator.py | 12 +- tests/test_RerankingEvaluator.py | 18 +- tests/test_RetrievalEvaluator.py | 11 +- tests/test_all_abstasks.py | 3 +- 222 files changed, 6943 insertions(+), 3425 deletions(-) create mode 100644 mteb/abstasks/TaskMetadata.py rename mteb/tasks/Classification/{ => da}/DdiscoCohesionClassification.py (65%) diff --git a/mteb/__init__.py b/mteb/__init__.py index 16c164d865..304121b506 100644 --- a/mteb/__init__.py +++ b/mteb/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from importlib.metadata import version from mteb.evaluation import * diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py index 4a41b3e728..75785652f8 100644 --- a/mteb/abstasks/AbsTask.py +++ b/mteb/abstasks/AbsTask.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import random from abc import ABC, abstractmethod @@ -24,17 +26,19 @@ def load_data(self, **kwargs): """ Load dataset from HuggingFace hub """ - if self.data_loaded: return + if self.data_loaded: + return # TODO: add split argument self.dataset = datasets.load_dataset( - self.description["hf_hub_name"], revision=self.description.get("revision", None) + self.metadata_dict["hf_hub_name"], + revision=self.metadata_dict.get("revision", None), ) self.data_loaded = True @property @abstractmethod - def description(self): + def metadata_dict(self) -> dict[str, str]: """ Returns a description of the task. Should contain the following fields: name: Name of the task (usually equal to the class name. Should be a valid name for a path on disc) diff --git a/mteb/abstasks/AbsTaskBitextMining.py b/mteb/abstasks/AbsTaskBitextMining.py index 960d88f751..ced42a7307 100644 --- a/mteb/abstasks/AbsTaskBitextMining.py +++ b/mteb/abstasks/AbsTaskBitextMining.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging from ..evaluation.evaluators import BitextMiningEvaluator @@ -11,7 +13,7 @@ class AbsTaskBitextMining(AbsTask): Abstract class for BitextMining tasks The similarity is computed between pairs and the results are ranked. - self.load_data() must generate a huggingface dataset with a split matching self.description["eval_splits"], and assign it to self.dataset. It must contain the following columns: + self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns: id: str sentence1: str sentence2: str @@ -28,13 +30,13 @@ def evaluate(self, model, split, **kwargs): scores = {} for lang in self.dataset: logger.info( - f"\nTask: {self.description['name']}, split: {split}, language: {lang}. Running..." + f"\nTask: {self.metadata_dict['name']}, split: {split}, language: {lang}. Running..." ) data_split = self.dataset[lang][split] scores[lang] = self._evaluate_split(model, data_split, **kwargs) else: logger.info( - f"\nTask: {self.description['name']}, split: {split}. Running..." + f"\nTask: {self.metadata_dict['name']}, split: {split}. Running..." 
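+                # Monolingual case: a single dataset split is scored and a flat
+                # score dict is returned, rather than the per-language mapping
+                # built in the crosslingual branch above.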
             )
             data_split = self.dataset[split]
             scores = self._evaluate_split(model, data_split, **kwargs)
@@ -72,9 +74,9 @@ def _evaluate_split(self, model, data_split, **kwargs):
         return metrics
 
     def _add_main_score(self, scores):
-        if self.description["main_score"] in scores:
-            scores["main_score"] = scores[self.description["main_score"]]
+        if self.metadata_dict["main_score"] in scores:
+            scores["main_score"] = scores[self.metadata_dict["main_score"]]
         else:
             logger.warn(
-                f"main score {self.description['main_score']} not found in scores {scores.keys()}"
+                f"main score {self.metadata_dict['main_score']} not found in scores {scores.keys()}"
             )
diff --git a/mteb/abstasks/AbsTaskClassification.py b/mteb/abstasks/AbsTaskClassification.py
index fb05063398..92bcfac7f8 100644
--- a/mteb/abstasks/AbsTaskClassification.py
+++ b/mteb/abstasks/AbsTaskClassification.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import logging
 from collections import defaultdict
 
@@ -18,7 +20,7 @@ class AbsTaskClassification(AbsTask):
     Abstract class for kNN classification tasks
     The similarity is computed between pairs and the results are ranked.
 
-    self.load_data() must generate a huggingface dataset with a split matching self.description["eval_splits"], and assign it to self.dataset. It must contain the following columns:
+    self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:
         text: str
         label: int
     """
@@ -40,23 +42,30 @@ def __init__(
         self.n_experiments = (
             n_experiments
             if n_experiments is not None
-            else self.description.get("n_experiments", 10)
+            else self.metadata_dict.get("n_experiments", 10)
         )
         self.samples_per_label = (
             samples_per_label
             if samples_per_label is not None
-            else self.description.get("samples_per_label", 8)
+            else self.metadata_dict.get("samples_per_label", 8)
         )
 
         # kNN parameters
         self.k = k
 
+        # Run metadata validation by accessing the attribute
+        # This is quite hacky. Ideally, this would be done in the constructor of
+        # each concrete task, but then we have to duplicate the __init__ method's
+        # interface.
+        if hasattr(self, "metadata"):
+            self.metadata
+
     def _add_main_score(self, scores):
-        if self.description["main_score"] in scores:
-            scores["main_score"] = scores[self.description["main_score"]]
+        if self.metadata_dict["main_score"] in scores:
+            scores["main_score"] = scores[self.metadata_dict["main_score"]]
         else:
             logger.warn(
-                f"main score {self.description['main_score']} not found in scores {scores.keys()}"
+                f"main score {self.metadata_dict['main_score']} not found in scores {scores.keys()}"
             )
 
     def evaluate(self, model, eval_split="test", train_split="train", **kwargs):
@@ -67,7 +76,7 @@ def evaluate(self, model, eval_split="test", train_split="train", **kwargs):
             scores = {}
             for lang in self.dataset:
                 logger.info(
-                    f"\nTask: {self.description['name']}, split: {eval_split}, language: {lang}. Running..."
+                    f"\nTask: {self.metadata_dict['name']}, split: {eval_split}, language: {lang}. Running..."
                 )
                 scores[lang] = self._evaluate_monolingual(
                     model, self.dataset[lang], eval_split, train_split, **kwargs
                 )
                 self._add_main_score(scores[lang])
         else:
             logger.info(
-                f"\nTask: {self.description['name']}, split: {eval_split}. Running..."
+                f"\nTask: {self.metadata_dict['name']}, split: {eval_split}. Running..."
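+                # The classifier fit in _evaluate_monolingual is repeated
+                # self.n_experiments times with self.samples_per_label training
+                # examples drawn per label (defaults 10 and 8, set above).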
) scores = self._evaluate_monolingual( model, self.dataset, eval_split, train_split, **kwargs diff --git a/mteb/abstasks/AbsTaskClustering.py b/mteb/abstasks/AbsTaskClustering.py index 14489933ab..6c6652bff5 100644 --- a/mteb/abstasks/AbsTaskClustering.py +++ b/mteb/abstasks/AbsTaskClustering.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import numpy as np @@ -14,7 +16,7 @@ class AbsTaskClustering(AbsTask): Abstract class for Clustering tasks The similarity is computed between pairs and the results are ranked. - self.load_data() must generate a huggingface dataset with a split matching self.description["eval_splits"], and assign it to self.dataset. It must contain the following columns: + self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns: sentences: list of str labels: list of str """ @@ -23,11 +25,11 @@ def __init__(self, **kwargs): super().__init__(**kwargs) def _add_main_score(self, scores): - if self.description["main_score"] in scores: - scores["main_score"] = scores[self.description["main_score"]] + if self.metadata_dict["main_score"] in scores: + scores["main_score"] = scores[self.metadata_dict["main_score"]] else: logger.warn( - f"main score {self.description['main_score']} not found in scores {scores.keys()}" + f"main score {self.metadata_dict['main_score']} not found in scores {scores.keys()}" ) def evaluate(self, model, split="test", **kwargs): @@ -38,7 +40,7 @@ def evaluate(self, model, split="test", **kwargs): scores = {} for lang in self.dataset: logger.info( - f"\nTask: {self.description['name']}, split: {split}, language: {lang}. Running..." + f"\nTask: {self.metadata_dict['name']}, split: {split}, language: {lang}. Running..." ) scores[lang] = self._evaluate_monolingual( model, self.dataset[lang], split, **kwargs @@ -46,7 +48,7 @@ def evaluate(self, model, split="test", **kwargs): self._add_main_score(scores[lang]) else: logger.info( - f"\nTask: {self.description['name']}, split: {split}. Running..." + f"\nTask: {self.metadata_dict['name']}, split: {split}. Running..." ) scores = self._evaluate_monolingual(model, self.dataset, split, **kwargs) self._add_main_score(scores) diff --git a/mteb/abstasks/AbsTaskPairClassification.py b/mteb/abstasks/AbsTaskPairClassification.py index 6b3070ef40..df700b76a2 100644 --- a/mteb/abstasks/AbsTaskPairClassification.py +++ b/mteb/abstasks/AbsTaskPairClassification.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging from collections import defaultdict @@ -13,7 +15,7 @@ class AbsTaskPairClassification(AbsTask): The similarity is computed between pairs and the results are ranked. Average precision is computed to measure how well the methods can be used for pairwise pair classification. - self.load_data() must generate a huggingface dataset with a split matching self.description["eval_splits"], and assign it to self.dataset. It must contain the following columns: + self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns: sent1: list[str] sent2: list[str] labels: list[int] @@ -53,7 +55,7 @@ def evaluate(self, model, split="test", **kwargs): print("loaded langs:", self.dataset.keys()) for lang, monolingual_dataset in self.dataset.items(): logger.info( - f"\nTask: {self.description['name']}, split: {split}, language: {lang}. Running..." 
+ f"\nTask: {self.metadata_dict['name']}, split: {split}, language: {lang}. Running..." ) scores[lang] = self._evaluate_monolingual( model, monolingual_dataset, split=split, **kwargs @@ -61,7 +63,7 @@ def evaluate(self, model, split="test", **kwargs): return scores else: logger.info( - f"\nTask: {self.description['name']}, split: {split}. Running..." + f"\nTask: {self.metadata_dict['name']}, split: {split}. Running..." ) return self._evaluate_monolingual( model, self.dataset, split=split, **kwargs diff --git a/mteb/abstasks/AbsTaskReranking.py b/mteb/abstasks/AbsTaskReranking.py index faefc055ad..9f8ed11049 100644 --- a/mteb/abstasks/AbsTaskReranking.py +++ b/mteb/abstasks/AbsTaskReranking.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from ..evaluation.evaluators import RerankingEvaluator from .AbsTask import AbsTask @@ -6,7 +8,7 @@ class AbsTaskReranking(AbsTask): """ Abstract class for re-ranking experiments. - self.load_data() must generate a huggingface dataset with a split matching self.description["eval_splits"], and assign it to self.dataset. It must contain the following columns: + self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns: query: str positive: list[str] negative: list[str] diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index 482d9be2c4..3ba2e37840 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import json import logging import os @@ -71,7 +73,7 @@ def check(fIn: str, ext: str): def load( self, split="test" - ) -> Tuple[Dict[str, Dict[str, str]], Dict[str, str], Dict[str, Dict[str, int]]]: + ) -> Tuple[Dict[str, dict[str, str]], dict[str, str], dict[str, dict[str, int]]]: if not self.hf_repo: self.qrels_file = os.path.join(self.qrels_folder, split + ".tsv") self.check(fIn=self.corpus_file, ext="jsonl") @@ -103,7 +105,7 @@ def qrels_dict_init(row): return self.corpus, self.queries, self.qrels - def load_corpus(self) -> Dict[str, Dict[str, str]]: + def load_corpus(self) -> dict[str, dict[str, str]]: if not self.hf_repo: self.check(fIn=self.corpus_file, ext="jsonl") @@ -217,13 +219,13 @@ def load_data(self, **kwargs): return self.corpus, self.queries, self.relevant_docs = {}, {}, {} hf_repo_qrels = ( - self.description["hf_hub_name"] + "-qrels" - if "clarin-knext" in self.description["hf_hub_name"] + self.metadata_dict["hf_hub_name"] + "-qrels" + if "clarin-knext" in self.metadata_dict["hf_hub_name"] else None ) - for split in kwargs.get("eval_splits", self.description["eval_splits"]): + for split in kwargs.get("eval_splits", self.metadata_dict["eval_splits"]): corpus, queries, qrels = HFDataLoader( - hf_repo=self.description["hf_hub_name"], + hf_repo=self.metadata_dict["hf_hub_name"], hf_repo_qrels=hf_repo_qrels, streaming=False, keep_in_memory=False, @@ -295,11 +297,11 @@ def _evaluate_monolingual( } if lang is None: qrels_save_path = ( - f"{output_folder}/{self.description['name']}_qrels.json" + f"{output_folder}/{self.metadata_dict['name']}_qrels.json" ) else: qrels_save_path = ( - f"{output_folder}/{self.description['name']}_{lang}_qrels.json" + f"{output_folder}/{self.metadata_dict['name']}_{lang}_qrels.json" ) with open(qrels_save_path, "w") as f: diff --git a/mteb/abstasks/AbsTaskSTS.py b/mteb/abstasks/AbsTaskSTS.py index 559a8720bb..264db3122d 100644 --- a/mteb/abstasks/AbsTaskSTS.py +++ 
b/mteb/abstasks/AbsTaskSTS.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging from ..evaluation.evaluators import STSEvaluator @@ -10,7 +12,7 @@ class AbsTaskSTS(AbsTask): """ Abstract class for STS experiments. - self.load_data() must generate a huggingface dataset with a split matching self.description["eval_splits"], and assign it to self.dataset. It must contain the following columns:: + self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:: sentence1: str sentence2: str score: float @@ -21,11 +23,11 @@ def __init__(self, **kwargs): @property def min_score(self): - return self.description["min_score"] + return self.metadata_dict["min_score"] @property def max_score(self): - return self.description["max_score"] + return self.metadata_dict["max_score"] def evaluate(self, model, split, **kwargs): if not self.data_loaded: @@ -35,13 +37,13 @@ def evaluate(self, model, split, **kwargs): scores = {} for lang in self.dataset: logger.info( - f"Task: {self.description['name']}, split: {split}, language: {lang}. Running..." + f"Task: {self.metadata_dict['name']}, split: {split}, language: {lang}. Running..." ) data_split = self.dataset[lang][split] scores[lang] = self._evaluate_split(model, data_split, **kwargs) else: logger.info( - f"\nTask: {self.description['name']}, split: {split}. Running..." + f"\nTask: {self.metadata_dict['name']}, split: {split}. Running..." ) data_split = self.dataset[split] scores = self._evaluate_split(model, data_split, **kwargs) diff --git a/mteb/abstasks/AbsTaskSummarization.py b/mteb/abstasks/AbsTaskSummarization.py index 2d9a49cb76..f27c24138d 100644 --- a/mteb/abstasks/AbsTaskSummarization.py +++ b/mteb/abstasks/AbsTaskSummarization.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import numpy as np @@ -12,7 +14,7 @@ class AbsTaskSummarization(AbsTask): """ Abstract class for summarization experiments. - self.load_data() must generate a huggingface dataset with a split matching self.description["eval_splits"], and assign it to self.dataset. It must contain the following columns: + self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns: text: str human_summaries: list[str] machine_summaries: list[str] @@ -24,11 +26,11 @@ def __init__(self, **kwargs): @property def min_score(self): - return self.description["min_score"] + return self.metadata_dict["min_score"] @property def max_score(self): - return self.description["max_score"] + return self.metadata_dict["max_score"] def evaluate(self, model, split, **kwargs): if not self.data_loaded: @@ -38,13 +40,13 @@ def evaluate(self, model, split, **kwargs): scores = {} for lang in self.dataset: logger.info( - f"\nTask: {self.description['name']}, split: {split}, language: {lang}. Running..." + f"\nTask: {self.metadata_dict['name']}, split: {split}, language: {lang}. Running..." ) data_split = self.dataset[lang][split] scores[lang] = self._evaluate_split(model, data_split, **kwargs) else: logger.info( - f"\nTask: {self.description['name']}, split: {split}. Running..." + f"\nTask: {self.metadata_dict['name']}, split: {split}. Running..." 
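+                # The min_score/max_score properties above expose the bounds of
+                # the gold rating scale, so evaluators can normalise human
+                # judgements before comparing them with model scores.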
) data_split = self.dataset[split] scores = self._evaluate_split(model, data_split, **kwargs) diff --git a/mteb/abstasks/CrosslingualTask.py b/mteb/abstasks/CrosslingualTask.py index 4e3af2a535..bcec04d821 100644 --- a/mteb/abstasks/CrosslingualTask.py +++ b/mteb/abstasks/CrosslingualTask.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import datasets from .AbsTask import AbsTask @@ -6,12 +8,12 @@ class CrosslingualTask(AbsTask): def __init__(self, langs=None, **kwargs): super().__init__(**kwargs) - if type(langs) is list: - langs = [lang for lang in langs if lang in self.description["eval_langs"]] + if isinstance(langs, list): + langs = [lang for lang in langs if lang in self.metadata_dict["eval_langs"]] if langs is not None and len(langs) > 0: self.langs = langs else: - self.langs = self.description["eval_langs"] + self.langs = self.metadata_dict["eval_langs"] self.is_crosslingual = True def load_data(self, **kwargs): @@ -23,8 +25,8 @@ def load_data(self, **kwargs): self.dataset = {} for lang in self.langs: self.dataset[lang] = datasets.load_dataset( - self.description["hf_hub_name"], + self.metadata_dict["hf_hub_name"], lang, - revision=self.description.get("revision", None), + revision=self.metadata_dict.get("revision", None), ) self.data_loaded = True diff --git a/mteb/abstasks/LangMapping.py b/mteb/abstasks/LangMapping.py index 1cd7d95111..48a4edbd4c 100644 --- a/mteb/abstasks/LangMapping.py +++ b/mteb/abstasks/LangMapping.py @@ -1,4 +1,7 @@ """The `LANG_MAPPING` dictionary is mapping bigram, trigram language codes to each others, to include various datasets that use one or the other""" + +from __future__ import annotations + LANG_MAPPING = { "fr": ["fra", "fra_Latn"], "en": ["eng", "eng_Latn"], diff --git a/mteb/abstasks/MultilingualTask.py b/mteb/abstasks/MultilingualTask.py index 48fc8d473f..ca512f19c4 100644 --- a/mteb/abstasks/MultilingualTask.py +++ b/mteb/abstasks/MultilingualTask.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import datasets from .AbsTask import AbsTask @@ -6,12 +8,14 @@ class MultilingualTask(AbsTask): def __init__(self, langs=None, **kwargs): super().__init__(**kwargs) - if type(langs) is list: - langs = [lang for lang in langs if lang in self.description["eval_langs"]] + if isinstance(langs, list): + langs = [lang for lang in langs if lang in self.metadata_dict["eval_langs"]] if langs is not None and len(langs) > 0: - self.langs = langs # TODO: case where user provides langs not in the dataset + self.langs = ( + langs # TODO: case where user provides langs not in the dataset + ) else: - self.langs = self.description["eval_langs"] + self.langs = self.metadata_dict["eval_langs"] self.is_multilingual = True def load_data(self, **kwargs): @@ -23,8 +27,8 @@ def load_data(self, **kwargs): self.dataset = {} for lang in self.langs: self.dataset[lang] = datasets.load_dataset( - self.description["hf_hub_name"], + self.metadata_dict["hf_hub_name"], lang, - revision=self.description.get("revision", None), + revision=self.metadata_dict.get("revision", None), ) self.data_loaded = True diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py new file mode 100644 index 0000000000..499151533a --- /dev/null +++ b/mteb/abstasks/TaskMetadata.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +from datetime import date + +from pydantic import ( + AnyUrl, + BaseModel, + BeforeValidator, + TypeAdapter, +) +from typing_extensions import Annotated, Literal + +TASK_SUBTYPE = Literal[ + "Article retrieval", + "Dialect pairing", + "Dialog 
Systems", + "Discourse coherence", + "Language identification", + "Linguistic acceptability", + "Political", + "Question answering", + "Sentiment/Hate speech", + "Thematic clustering", +] + +TASK_DOMAIN = Literal[ + "Academic", + "Blog", + "Encyclopaedic", + "Fiction", + "Government", + "Legal", + "Medical", + "News", + "Non-fiction", + "Poetry", + "Religious", + "Reviews", + "Social", + "Spoken", + "Web", +] + +TEXT_CREATION_METHOD = Literal[ + "found", + "created", + "machine-translated", + "human-translated and localized", + "machine-translated and verified", + "machine-translated and localized", +] + +SOCIOECONOMIC_STATUS = Literal[ + "high", + "medium", + "low", +] + +TASK_TYPE = Literal[ + "BitextMining", + "Classification", + "Clustering", + "PairClassification", + "Reranking", + "Retrieval", + "STS", + "Summarization", +] + +TASK_CATEGORY = Literal[ + "s2s", # Sentence-to-sentence + "s2p", # Sentence-to-paragraph + "p2p", # Paragraph-to-paragraph +] + +ANNOTATOR_TYPE = Literal["expert-annotated", "human-annotated", "derived"] + +http_url_adapter = TypeAdapter(AnyUrl) +STR_URL = Annotated[ + str, BeforeValidator(lambda value: str(http_url_adapter.validate_python(value))) +] # Allows the type to be a string, but ensures that the string is a URL + +pastdate_adapter = TypeAdapter(date) +STR_DATE = Annotated[ + str, BeforeValidator(lambda value: str(pastdate_adapter.validate_python(value))) +] # Allows the type to be a string, but ensures that the string is a valid date + + +class TaskMetadata(BaseModel): + # Meta: We can annotate the requirements here, and then link to it in the docs. This would move the documentation closer to the code, which I think is a good idea. + + hf_hub_name: str + revision: str + + name: str + description: str + type: TASK_TYPE + category: TASK_CATEGORY + reference: STR_URL | None # URL to documentation, e.g. 
published paper + + eval_splits: list[str] + eval_langs: list[str] # Might want to have a literal of langs when #251 is resolved + main_score: str # Might want a literal here + + date: tuple[STR_DATE, STR_DATE] | None # When the data was collected + form: list[Literal["spoken", "written"]] | None + domains: list[TASK_DOMAIN] | None + task_subtypes: list[TASK_SUBTYPE] | None + license: str | None + + socioeconomic_status: SOCIOECONOMIC_STATUS | None + annotations_creators: ANNOTATOR_TYPE | None + dialect: list[str] | None + + text_creation: TEXT_CREATION_METHOD | None + bibtex_citation: str | None diff --git a/mteb/abstasks/__init__.py b/mteb/abstasks/__init__.py index aabfedfbfd..e1e800c8ba 100644 --- a/mteb/abstasks/__init__.py +++ b/mteb/abstasks/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from .AbsTask import * from .AbsTaskBitextMining import * from .AbsTaskClassification import * diff --git a/mteb/cmd.py b/mteb/cmd.py index d8540da09a..834b943e17 100644 --- a/mteb/cmd.py +++ b/mteb/cmd.py @@ -8,6 +8,7 @@ --verbosity 3 """ +from __future__ import annotations import argparse import datetime diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index f6868c6395..4ddfb1e6a6 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import json import logging import os @@ -66,7 +68,7 @@ def __init__( self._tasks = None self._task_langs = task_langs if task_langs is not None else [] - if type(self._task_langs) is str: + if isinstance(self._task_langs, str): self._task_langs = [self._task_langs] self._extend_lang_code() @@ -79,15 +81,15 @@ def __init__( @property def available_tasks(self): - return [x.description["name"] for x in self.tasks_cls] + return [x.metadata_dict["name"] for x in self.tasks_cls] @property def available_task_types(self): - return set([x.description["type"] for x in self.tasks_cls]) + return set([x.metadata_dict["type"] for x in self.tasks_cls]) @property def available_task_categories(self): - return set([x.description["category"] for x in self.tasks_cls]) + return set([x.metadata_dict["category"] for x in self.tasks_cls]) def _extend_lang_code(self): # add all possible language codes @@ -119,7 +121,7 @@ def _display_tasks(self, task_list, name=None): console.rule(f"[bold]{name}\n", style="grey15") for task_type in self.available_task_types: current_type_tasks = list( - filter(lambda x: x.description["type"] == task_type, task_list) + filter(lambda x: x.metadata_dict["type"] == task_type, task_list) ) if len(current_type_tasks) == 0: continue @@ -127,15 +129,15 @@ def _display_tasks(self, task_list, name=None): console.print(f"[bold]{task_type}[/]") for task in current_type_tasks: prefix = " - " - name = f"{task.description['name']}" - category = f", [italic grey39]{task.description['category']}[/]" + name = f"{task.metadata_dict['name']}" + category = f", [italic grey39]{task.metadata_dict['category']}[/]" multilingual = ( - f", [italic red]multilingual {len(task.langs)} / {len(task.description['eval_langs'])} langs[/]" + f", [italic red]multilingual {len(task.langs)} / {len(task.metadata_dict['eval_langs'])} langs[/]" if task.is_multilingual else "" ) crosslingual = ( - f", [italic cyan]crosslingual {len(task.langs)} / {len(task.description['eval_langs'])} pairs[/]" + f", [italic cyan]crosslingual {len(task.langs)} / {len(task.metadata_dict['eval_langs'])} pairs[/]" if task.is_crosslingual else "" ) @@ -172,10 +174,12 @@ def select_tasks(self, **kwargs): # If `task_list` is 
specified, select list of tasks if self._tasks is not None: self.tasks = list( - filter(lambda x: (x.description["name"] in self._tasks), self.tasks_cls) + filter( + lambda x: (x.metadata_dict["name"] in self._tasks), self.tasks_cls + ) ) if len(self.tasks) != len(self._tasks): - tasks_known = set([x.description["name"] for x in self.tasks_cls]) + tasks_known = set([x.metadata_dict["name"] for x in self.tasks_cls]) tasks_unknown = ( set(x for x in self._tasks if isinstance(x, str)) - tasks_known ) @@ -194,23 +198,23 @@ def select_tasks(self, **kwargs): # Otherwise use filters to select tasks filtered_tasks = filter( lambda x: (self._task_types is None) - or (x.description["type"] in self._task_types), + or (x.metadata_dict["type"] in self._task_types), self.tasks_cls, ) filtered_tasks = filter( lambda x: (self._task_categories is None) - or (x.description["category"] in self._task_categories), + or (x.metadata_dict["category"] in self._task_categories), filtered_tasks, ) filtered_tasks = filter( lambda x: (self._version is None) - or (x.description["version"] >= self._version), + or (x.metadata_dict["version"] >= self._version), filtered_tasks, ) # keep only tasks with at least one language in the filter filtered_tasks = filter( lambda x: (not (self._task_langs)) - or (len(set(x.description["eval_langs"]) & set(self._task_langs)) > 0), + or (len(set(x.metadata_dict["eval_langs"]) & set(self._task_langs)) > 0), filtered_tasks, ) @@ -223,7 +227,7 @@ def load_tasks_data(self): """ logger.info(f"\n\n## Loading datasets for {len(self.tasks)} tasks") for task in self.tasks: - logger.info(f"\n# Loading dataset for {task.description['name']}") + logger.info(f"\n# Loading dataset for {task.metadata_dict['name']}") task.load_data() def run( @@ -270,17 +274,18 @@ def run( while len(self.tasks) > 0: task = self.tasks[0] logger.info( - f"\n\n********************** Evaluating {task.description['name']} **********************" + f"\n\n********************** Evaluating {task.metadata_dict['name']} **********************" ) # skip evaluation if results folder exists and overwrite_results is False if output_folder is not None: save_path = os.path.join( - output_folder, f"{task.description['name']}{task.save_suffix}.json" + output_folder, + f"{task.metadata_dict['name']}{task.save_suffix}.json", ) if os.path.exists(save_path) and overwrite_results is False: logger.warning( - f"WARNING: {task.description['name']} results already exists. Skipping." + f"WARNING: {task.metadata_dict['name']} results already exists. Skipping." 
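+                        # Saved result files act as a cache; a re-run that
+                        # recomputes them passes the flag explicitly, e.g.
+                        #     evaluation.run(model, overwrite_results=True)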
) del self.tasks[0] continue @@ -289,18 +294,18 @@ def run( task_eval_splits = ( eval_splits if eval_splits is not None - else task.description.get("eval_splits", []) + else task.metadata_dict.get("eval_splits", []) ) # load data - logger.info(f"Loading dataset for {task.description['name']}") + logger.info(f"Loading dataset for {task.metadata_dict['name']}") task.load_data(eval_splits=task_eval_splits, **kwargs) # run evaluation task_results = { - "mteb_version": version("mteb"), - "dataset_revision": task.description.get("revision", None), - "mteb_dataset_name": task.description["name"], + "mteb_version": version("mteb"), # noqa: F405 + "dataset_revision": task.metadata_dict.get("revision", None), + "mteb_dataset_name": task.metadata_dict["name"], } for split in task_eval_splits: tick = time() @@ -309,7 +314,7 @@ def run( ) tock = time() logger.info( - f"Evaluation for {task.description['name']} on {split} took {tock - tick:.2f} seconds" + f"Evaluation for {task.metadata_dict['name']} on {split} took {tock - tick:.2f} seconds" ) results["evaluation_time"] = round(tock - tick, 2) task_results[split] = results @@ -321,17 +326,19 @@ def run( with open(save_path, "w") as f_out: json.dump(task_results, f_out, indent=2, sort_keys=True) - evaluation_results[task.description["name"]] = task_results + evaluation_results[task.metadata_dict["name"]] = task_results except Exception as e: - logger.error(f"Error while evaluating {task.description['name']}: {e}") + logger.error( + f"Error while evaluating {task.metadata_dict['name']}: {e}" + ) if raise_error: raise e logger.error( f"Please check all the error logs at: {self.err_logs_path}" ) with open(self.err_logs_path, "a") as f_out: - f_out.write(f"{datetime.now()} >>> {task.description['name']}\n") + f_out.write(f"{datetime.now()} >>> {task.metadata_dict['name']}\n") f_out.write(traceback.format_exc()) f_out.write("\n\n") diff --git a/mteb/evaluation/__init__.py b/mteb/evaluation/__init__.py index 689842bb9d..c0a1596c91 100644 --- a/mteb/evaluation/__init__.py +++ b/mteb/evaluation/__init__.py @@ -1 +1,3 @@ +from __future__ import annotations + from .MTEB import * diff --git a/mteb/evaluation/evaluators/BitextMiningEvaluator.py b/mteb/evaluation/evaluators/BitextMiningEvaluator.py index 83a8bec4c7..caffeeb852 100644 --- a/mteb/evaluation/evaluators/BitextMiningEvaluator.py +++ b/mteb/evaluation/evaluators/BitextMiningEvaluator.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import numpy as np @@ -11,7 +13,9 @@ class BitextMiningEvaluator(Evaluator): - def __init__(self, sentences1, sentences2, gold, batch_size=32, limit=None, **kwargs): + def __init__( + self, sentences1, sentences2, gold, batch_size=32, limit=None, **kwargs + ): super().__init__(**kwargs) self.gold = gold self.sentences1 = [sentences1[i] for (i, j) in self.gold] @@ -91,28 +95,41 @@ def _similarity_search( for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size): # Compute cosine similarities cos_scores = score_function( - query_embeddings[query_start_idx : query_start_idx + query_chunk_size], - corpus_embeddings[corpus_start_idx : corpus_start_idx + corpus_chunk_size], + query_embeddings[ + query_start_idx : query_start_idx + query_chunk_size + ], + corpus_embeddings[ + corpus_start_idx : corpus_start_idx + corpus_chunk_size + ], ) # Get top-k scores cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk( - cos_scores, min(top_k, len(cos_scores[0])), dim=1, largest=True, sorted=False + cos_scores, + min(top_k, len(cos_scores[0])), + 
dim=1, + largest=True, + sorted=False, ) cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist() cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist() for query_itr in range(len(cos_scores)): for sub_corpus_id, score in zip( - cos_scores_top_k_idx[query_itr], cos_scores_top_k_values[query_itr] + cos_scores_top_k_idx[query_itr], + cos_scores_top_k_values[query_itr], ): corpus_id = corpus_start_idx + sub_corpus_id query_id = query_start_idx + query_itr - queries_result_list[query_id].append({"corpus_id": corpus_id, "score": score}) + queries_result_list[query_id].append( + {"corpus_id": corpus_id, "score": score} + ) # Sort and strip to top_k results for idx in range(len(queries_result_list)): - queries_result_list[idx] = sorted(queries_result_list[idx], key=lambda x: x["score"], reverse=True) + queries_result_list[idx] = sorted( + queries_result_list[idx], key=lambda x: x["score"], reverse=True + ) queries_result_list[idx] = queries_result_list[idx][0:top_k] return queries_result_list diff --git a/mteb/evaluation/evaluators/ClassificationEvaluator.py b/mteb/evaluation/evaluators/ClassificationEvaluator.py index bf8d6c5f82..4182b03fe4 100644 --- a/mteb/evaluation/evaluators/ClassificationEvaluator.py +++ b/mteb/evaluation/evaluators/ClassificationEvaluator.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import numpy as np @@ -13,7 +15,17 @@ class kNNClassificationEvaluator(Evaluator): - def __init__(self, sentences_train, y_train, sentences_test, y_test, k=1, batch_size=32, limit=None, **kwargs): + def __init__( + self, + sentences_train, + y_train, + sentences_test, + y_test, + k=1, + batch_size=32, + limit=None, + **kwargs, + ): super().__init__(**kwargs) if limit is not None: sentences_train = sentences_train[:limit] @@ -34,9 +46,13 @@ def __call__(self, model, test_cache=None): max_accuracy = 0 max_f1 = 0 max_ap = 0 - X_train = np.asarray(model.encode(self.sentences_train, batch_size=self.batch_size)) + X_train = np.asarray( + model.encode(self.sentences_train, batch_size=self.batch_size) + ) if test_cache is None: - X_test = np.asarray(model.encode(self.sentences_test, batch_size=self.batch_size)) + X_test = np.asarray( + model.encode(self.sentences_test, batch_size=self.batch_size) + ) test_cache = X_test else: X_test = test_cache @@ -63,7 +79,17 @@ def __call__(self, model, test_cache=None): class kNNClassificationEvaluatorPytorch(Evaluator): - def __init__(self, sentences_train, y_train, sentences_test, y_test, k=1, batch_size=32, limit=None, **kwargs): + def __init__( + self, + sentences_train, + y_train, + sentences_test, + y_test, + k=1, + batch_size=32, + limit=None, + **kwargs, + ): super().__init__(**kwargs) if limit is not None: sentences_train = sentences_train[:limit] @@ -85,9 +111,13 @@ def __call__(self, model, test_cache=None): max_accuracy = 0 max_f1 = 0 max_ap = 0 - X_train = np.asarray(model.encode(self.sentences_train, batch_size=self.batch_size)) + X_train = np.asarray( + model.encode(self.sentences_train, batch_size=self.batch_size) + ) if test_cache is None: - X_test = np.asarray(model.encode(self.sentences_test, batch_size=self.batch_size)) + X_test = np.asarray( + model.encode(self.sentences_test, batch_size=self.batch_size) + ) test_cache = X_test else: X_test = test_cache @@ -98,9 +128,13 @@ def __call__(self, model, test_cache=None): distances = self._euclidean_dist(X_test, X_train) elif metric == "dot": distances = -self._dot_score(X_test, X_train) - neigh_indices = torch.topk(distances, k=self.k, dim=1, 
largest=False).indices + neigh_indices = torch.topk( + distances, k=self.k, dim=1, largest=False + ).indices y_train = torch.tensor(self.y_train) - y_pred = torch.mode(y_train[neigh_indices], dim=1).values # TODO: case where there is no majority + y_pred = torch.mode( + y_train[neigh_indices], dim=1 + ).values # TODO: case where there is no majority accuracy = accuracy_score(self.y_test, y_pred) f1 = f1_score(self.y_test, y_pred, average="macro") scores["accuracy_" + metric] = accuracy @@ -183,7 +217,15 @@ def _dot_score(a: Tensor, b: Tensor): class logRegClassificationEvaluator(Evaluator): def __init__( - self, sentences_train, y_train, sentences_test, y_test, max_iter=100, batch_size=32, limit=None, **kwargs + self, + sentences_train, + y_train, + sentences_test, + y_test, + max_iter=100, + batch_size=32, + limit=None, + **kwargs, ): super().__init__(**kwargs) if limit is not None: @@ -208,10 +250,14 @@ def __call__(self, model, test_cache=None): verbose=1 if logger.isEnabledFor(logging.DEBUG) else 0, ) logger.info(f"Encoding {len(self.sentences_train)} training sentences...") - X_train = np.asarray(model.encode(self.sentences_train, batch_size=self.batch_size)) + X_train = np.asarray( + model.encode(self.sentences_train, batch_size=self.batch_size) + ) logger.info(f"Encoding {len(self.sentences_test)} test sentences...") if test_cache is None: - X_test = np.asarray(model.encode(self.sentences_test, batch_size=self.batch_size)) + X_test = np.asarray( + model.encode(self.sentences_test, batch_size=self.batch_size) + ) test_cache = X_test else: X_test = test_cache diff --git a/mteb/evaluation/evaluators/ClusteringEvaluator.py b/mteb/evaluation/evaluators/ClusteringEvaluator.py index 43686d7a06..bbf2aadeaa 100644 --- a/mteb/evaluation/evaluators/ClusteringEvaluator.py +++ b/mteb/evaluation/evaluators/ClusteringEvaluator.py @@ -1,16 +1,26 @@ +from __future__ import annotations + import logging import numpy as np import sklearn import sklearn.cluster -logger = logging.getLogger(__name__) - from .Evaluator import Evaluator +logger = logging.getLogger(__name__) + class ClusteringEvaluator(Evaluator): - def __init__(self, sentences, labels, clustering_batch_size=500, batch_size=32, limit=None, **kwargs): + def __init__( + self, + sentences, + labels, + clustering_batch_size=500, + batch_size=32, + limit=None, + **kwargs, + ): super().__init__(**kwargs) if limit is not None: sentences = sentences[:limit] @@ -22,16 +32,22 @@ def __init__(self, sentences, labels, clustering_batch_size=500, batch_size=32, def __call__(self, model): logger.info(f"Encoding {len(self.sentences)} sentences...") - corpus_embeddings = np.asarray(model.encode(self.sentences, batch_size=self.batch_size)) + corpus_embeddings = np.asarray( + model.encode(self.sentences, batch_size=self.batch_size) + ) logger.info("Fitting Mini-Batch K-Means model...") clustering_model = sklearn.cluster.MiniBatchKMeans( - n_clusters=len(set(self.labels)), batch_size=self.clustering_batch_size, n_init="auto" + n_clusters=len(set(self.labels)), + batch_size=self.clustering_batch_size, + n_init="auto", ) clustering_model.fit(corpus_embeddings) cluster_assignment = clustering_model.labels_ logger.info("Evaluating...") - v_measure = sklearn.metrics.cluster.v_measure_score(self.labels, cluster_assignment) + v_measure = sklearn.metrics.cluster.v_measure_score( + self.labels, cluster_assignment + ) return {"v_measure": v_measure} diff --git a/mteb/evaluation/evaluators/Evaluator.py b/mteb/evaluation/evaluators/Evaluator.py index e7202fdecd..ab74ec2431 
100644 --- a/mteb/evaluation/evaluators/Evaluator.py +++ b/mteb/evaluation/evaluators/Evaluator.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import random from abc import ABC, abstractmethod diff --git a/mteb/evaluation/evaluators/PairClassificationEvaluator.py b/mteb/evaluation/evaluators/PairClassificationEvaluator.py index 37390c6f8b..bb79a468f0 100644 --- a/mteb/evaluation/evaluators/PairClassificationEvaluator.py +++ b/mteb/evaluation/evaluators/PairClassificationEvaluator.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import numpy as np @@ -29,7 +31,9 @@ class PairClassificationEvaluator(Evaluator): :param write_csv: Write results to a CSV file """ - def __init__(self, sentences1, sentences2, labels, batch_size=32, limit=None, **kwargs): + def __init__( + self, sentences1, sentences2, labels, batch_size=32, limit=None, **kwargs + ): super().__init__(**kwargs) if limit: sentences1 = sentences1[:limit] @@ -68,7 +72,10 @@ def compute_metrics(self, model): embeddings1_np = np.asarray(embeddings1) embeddings2_np = np.asarray(embeddings2) - dot_scores = [np.dot(embeddings1_np[i], embeddings2_np[i]) for i in range(len(embeddings1_np))] + dot_scores = [ + np.dot(embeddings1_np[i], embeddings2_np[i]) + for i in range(len(embeddings1_np)) + ] logger.info("Computing metrics...") labels = np.asarray(self.labels) @@ -99,10 +106,14 @@ def _compute_metrics(scores, labels, high_score_more_similar): acc, acc_threshold = PairClassificationEvaluator.find_best_acc_and_threshold( scores, labels, high_score_more_similar ) - f1, precision, recall, f1_threshold = PairClassificationEvaluator.find_best_f1_and_threshold( + f1, precision, recall, f1_threshold = ( + PairClassificationEvaluator.find_best_f1_and_threshold( + scores, labels, high_score_more_similar + ) + ) + ap = PairClassificationEvaluator.ap_score( scores, labels, high_score_more_similar ) - ap = PairClassificationEvaluator.ap_score(scores, labels, high_score_more_similar) return { "accuracy": acc, @@ -179,4 +190,6 @@ def find_best_f1_and_threshold(scores, labels, high_score_more_similar: bool): @staticmethod def ap_score(scores, labels, high_score_more_similar: bool): - return average_precision_score(labels, scores * (1 if high_score_more_similar else -1)) + return average_precision_score( + labels, scores * (1 if high_score_more_similar else -1) + ) diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py index 518e8c8384..35fac47d95 100644 --- a/mteb/evaluation/evaluators/RerankingEvaluator.py +++ b/mteb/evaluation/evaluators/RerankingEvaluator.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import numpy as np @@ -49,7 +51,9 @@ def __init__( ### Remove sample with empty positive / negative set self.samples = [ - sample for sample in self.samples if len(sample["positive"]) > 0 and len(sample["negative"]) > 0 + sample + for sample in self.samples + if len(sample["positive"]) > 0 and len(sample["negative"]) > 0 ] def __call__(self, model): @@ -73,21 +77,33 @@ def compute_metrics_batched(self, model): # using encode_queries and encode_corpus functions if they exists, # which can be defined by users to add different instructions for query and passage conveniently - encode_queries_func = model.encode_queries if hasattr(model, 'encode_queries') else model.encode - encode_corpus_func = model.encode_corpus if hasattr(model, 'encode_corpus') else model.encode + encode_queries_func = ( + model.encode_queries if hasattr(model, 
"encode_queries") else model.encode + ) + encode_corpus_func = ( + model.encode_corpus if hasattr(model, "encode_corpus") else model.encode + ) logger.info("Encoding queries...") if isinstance(self.samples[0]["query"], str): - all_query_embs = np.asarray(encode_queries_func( - [sample["query"] for sample in self.samples], - batch_size=self.batch_size, - )) + all_query_embs = np.asarray( + encode_queries_func( + [sample["query"] for sample in self.samples], + batch_size=self.batch_size, + ) + ) elif isinstance(self.samples[0]["query"], list): # In case the query is a list of strings, we get the most similar embedding to any of the queries - all_query_flattened = [q for sample in self.samples for q in sample["query"]] - all_query_embs = np.asarray(encode_queries_func(all_query_flattened, batch_size=self.batch_size)) + all_query_flattened = [ + q for sample in self.samples for q in sample["query"] + ] + all_query_embs = np.asarray( + encode_queries_func(all_query_flattened, batch_size=self.batch_size) + ) else: - raise ValueError(f"Query must be a string or a list of strings but is {type(self.samples[0]['query'])}") + raise ValueError( + f"Query must be a string or a list of strings but is {type(self.samples[0]['query'])}" + ) logger.info("Encoding candidates...") all_docs = [] @@ -95,13 +111,17 @@ def compute_metrics_batched(self, model): all_docs.extend(sample["positive"]) all_docs.extend(sample["negative"]) - all_docs_embs = np.asarray(encode_corpus_func(all_docs, batch_size=self.batch_size)) + all_docs_embs = np.asarray( + encode_corpus_func(all_docs, batch_size=self.batch_size) + ) # Compute scores logger.info("Evaluating...") query_idx, docs_idx = 0, 0 for instance in self.samples: - num_subqueries = len(instance["query"]) if isinstance(instance["query"], list) else 1 + num_subqueries = ( + len(instance["query"]) if isinstance(instance["query"], list) else 1 + ) query_emb = all_query_embs[query_idx : query_idx + num_subqueries] query_idx += num_subqueries @@ -136,9 +156,13 @@ def compute_metrics_individual(self, model): # using encode_queries and encode_corpus functions if they exists, # which can be defined by users to add different instructions for query and passage conveniently - encode_queries_func = model.encode_queries if hasattr(model, 'encode_queries') else model.encode - encode_corpus_func = model.encode_corpus if hasattr(model, 'encode_corpus') else model.encode - + encode_queries_func = ( + model.encode_queries if hasattr(model, "encode_queries") else model.encode + ) + encode_corpus_func = ( + model.encode_corpus if hasattr(model, "encode_corpus") else model.encode + ) + for instance in tqdm.tqdm(self.samples, desc="Samples"): query = instance["query"] positive = list(instance["positive"]) @@ -153,7 +177,9 @@ def compute_metrics_individual(self, model): if isinstance(query, str): # .encoding interface requires List[str] as input query = [query] - query_emb = np.asarray(encode_queries_func(query, batch_size=self.batch_size)) + query_emb = np.asarray( + encode_queries_func(query, batch_size=self.batch_size) + ) docs_emb = np.asarray(encode_corpus_func(docs, batch_size=self.batch_size)) scores = self._compute_metrics_instance(query_emb, docs_emb, is_relevant) diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index 97d436ed2e..25f77b524e 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -1,98 +1,138 @@ -import logging +from __future__ import 
annotations + import heapq +import logging from typing import Dict, List, Tuple import pytrec_eval +import torch from sentence_transformers import SentenceTransformer from sentence_transformers.models import Transformer, WordEmbeddings -import torch from .Evaluator import Evaluator -from .utils import cos_sim, dot_score, mrr, recall_cap, hole, top_k_accuracy +from .utils import cos_sim, dot_score, hole, mrr, recall_cap, top_k_accuracy logger = logging.getLogger(__name__) + # Adapted from https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/search/dense/exact_search.py#L12 -class DenseRetrievalExactSearch: - def __init__(self, model, batch_size: int = 128, corpus_chunk_size: int = 50000, **kwargs): +class DenseRetrievalExactSearch: + def __init__( + self, model, batch_size: int = 128, corpus_chunk_size: int = 50000, **kwargs + ): # Model is class that provides encode_corpus() and encode_queries() self.model = model self.batch_size = batch_size - self.score_functions = {'cos_sim': cos_sim, 'dot': dot_score} - self.score_function_desc = {'cos_sim': "Cosine Similarity", 'dot': "Dot Product"} + self.score_functions = {"cos_sim": cos_sim, "dot": dot_score} + self.score_function_desc = { + "cos_sim": "Cosine Similarity", + "dot": "Dot Product", + } self.corpus_chunk_size = corpus_chunk_size self.show_progress_bar = kwargs.get("show_progress_bar", True) self.convert_to_tensor = kwargs.get("convert_to_tensor", True) self.results = {} - - def search(self, - corpus: Dict[str, Dict[str, str]], - queries: Dict[str, str], - top_k: int, - score_function: str, - return_sorted: bool = False, - **kwargs) -> Dict[str, Dict[str, float]]: + + def search( + self, + corpus: dict[str, dict[str, str]], + queries: dict[str, str], + top_k: int, + score_function: str, + return_sorted: bool = False, + **kwargs, + ) -> dict[str, dict[str, float]]: # Create embeddings for all queries using model.encode_queries() # Runs semantic search against the corpus embeddings # Returns a ranked list with the corpus ids if score_function not in self.score_functions: - raise ValueError("score function: {} must be either (cos_sim) for cosine similarity or (dot) for dot product".format(score_function)) - + raise ValueError( + "score function: {} must be either (cos_sim) for cosine similarity or (dot) for dot product".format( + score_function + ) + ) + logger.info("Encoding Queries...") query_ids = list(queries.keys()) self.results = {qid: {} for qid in query_ids} queries = [queries[qid] for qid in queries] query_embeddings = self.model.encode_queries( - queries, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_tensor=self.convert_to_tensor) - + queries, + batch_size=self.batch_size, + show_progress_bar=self.show_progress_bar, + convert_to_tensor=self.convert_to_tensor, + ) + logger.info("Sorting Corpus by document length (Longest first)...") - corpus_ids = sorted(corpus, key=lambda k: len(corpus[k].get("title", "") + corpus[k].get("text", "")), reverse=True) + corpus_ids = sorted( + corpus, + key=lambda k: len(corpus[k].get("title", "") + corpus[k].get("text", "")), + reverse=True, + ) corpus = [corpus[cid] for cid in corpus_ids] logger.info("Encoding Corpus in batches... 
Warning: This might take a while!") - logger.info("Scoring Function: {} ({})".format(self.score_function_desc[score_function], score_function)) + logger.info( + "Scoring Function: {} ({})".format( + self.score_function_desc[score_function], score_function + ) + ) itr = range(0, len(corpus), self.corpus_chunk_size) - - result_heaps = {qid: [] for qid in query_ids} # Keep only the top-k docs for each query + + result_heaps = { + qid: [] for qid in query_ids + } # Keep only the top-k docs for each query for batch_num, corpus_start_idx in enumerate(itr): - logger.info("Encoding Batch {}/{}...".format(batch_num+1, len(itr))) + logger.info("Encoding Batch {}/{}...".format(batch_num + 1, len(itr))) corpus_end_idx = min(corpus_start_idx + self.corpus_chunk_size, len(corpus)) - # Encode chunk of corpus + # Encode chunk of corpus sub_corpus_embeddings = self.model.encode_corpus( corpus[corpus_start_idx:corpus_end_idx], batch_size=self.batch_size, - show_progress_bar=self.show_progress_bar, - convert_to_tensor = self.convert_to_tensor - ) + show_progress_bar=self.show_progress_bar, + convert_to_tensor=self.convert_to_tensor, + ) # Compute similarites using either cosine-similarity or dot product - cos_scores = self.score_functions[score_function](query_embeddings, sub_corpus_embeddings) + cos_scores = self.score_functions[score_function]( + query_embeddings, sub_corpus_embeddings + ) cos_scores[torch.isnan(cos_scores)] = -1 # Get top-k values - cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(cos_scores, min(top_k+1, len(cos_scores[1])), dim=1, largest=True, sorted=return_sorted) + cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk( + cos_scores, + min(top_k + 1, len(cos_scores[1])), + dim=1, + largest=True, + sorted=return_sorted, + ) cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist() cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist() - + for query_itr in range(len(query_embeddings)): - query_id = query_ids[query_itr] - for sub_corpus_id, score in zip(cos_scores_top_k_idx[query_itr], cos_scores_top_k_values[query_itr]): - corpus_id = corpus_ids[corpus_start_idx+sub_corpus_id] + query_id = query_ids[query_itr] + for sub_corpus_id, score in zip( + cos_scores_top_k_idx[query_itr], cos_scores_top_k_values[query_itr] + ): + corpus_id = corpus_ids[corpus_start_idx + sub_corpus_id] if corpus_id != query_id: if len(result_heaps[query_id]) < top_k: # Push item on the heap heapq.heappush(result_heaps[query_id], (score, corpus_id)) else: # If item is larger than the smallest in the heap, push it on the heap then pop the smallest element - heapq.heappushpop(result_heaps[query_id], (score, corpus_id)) + heapq.heappushpop( + result_heaps[query_id], (score, corpus_id) + ) for qid in result_heaps: for score, corpus_id in result_heaps[qid]: self.results[qid][corpus_id] = score - + return self.results @@ -101,6 +141,7 @@ class DRESModel: Dense Retrieval Exact Search (DRES) requires an encode_queries & encode_corpus method. This class converts a model with just an .encode method into DRES format. """ + def __init__(self, model, sep=" ", **kwargs): self.model = model self.sep = sep @@ -109,7 +150,9 @@ def __init__(self, model, sep=" ", **kwargs): def encode_queries(self, queries: List[str], batch_size: int, **kwargs): if self.use_sbert_model: if isinstance(self.model._first_module(), Transformer): - logger.info(f"Queries will be truncated to {self.model.get_max_seq_length()} tokens.") + logger.info( + f"Queries will be truncated to {self.model.get_max_seq_length()} tokens." 
+ ) elif isinstance(self.model._first_module(), WordEmbeddings): logger.warning( "Queries will not be truncated. This could lead to memory issues. In that case please lower the batch_size." @@ -117,7 +160,7 @@ def encode_queries(self, queries: List[str], batch_size: int, **kwargs): return self.model.encode(queries, batch_size=batch_size, **kwargs) def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int, **kwargs): - if type(corpus) is dict: + if isinstance(corpus, dict): sentences = [ (corpus["title"][i] + self.sep + corpus["text"][i]).strip() if "title" in corpus @@ -126,62 +169,84 @@ def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int, **kwargs) ] else: sentences = [ - (doc["title"] + self.sep + doc["text"]).strip() if "title" in doc else doc["text"].strip() + (doc["title"] + self.sep + doc["text"]).strip() + if "title" in doc + else doc["text"].strip() for doc in corpus ] return self.model.encode(sentences, batch_size=batch_size, **kwargs) + def is_dres_compatible(model): for method in ["encode_queries", "encode_corpus"]: op = getattr(model, method, None) - if not (callable(op)): return False + if not (callable(op)): + return False return True + # Adapted from https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/evaluation.py#L9 class RetrievalEvaluator(Evaluator): def __init__( - self, retriever = None, k_values: List[int] = [1,3,5,10,100,1000], score_function: str = "cos_sim", **kwargs - ): + self, + retriever=None, + k_values: List[int] = [1, 3, 5, 10, 100, 1000], + score_function: str = "cos_sim", + **kwargs, + ): super().__init__(**kwargs) if is_dres_compatible(retriever): - logger.info("The custom encode_queries and encode_corpus functions of the model will be used") + logger.info( + "The custom encode_queries and encode_corpus functions of the model will be used" + ) self.retriever = DenseRetrievalExactSearch(retriever, **kwargs) else: self.retriever = DenseRetrievalExactSearch(DRESModel(retriever), **kwargs) self.k_values = k_values self.top_k = max(k_values) self.score_function = score_function - - def __call__(self, corpus: Dict[str, Dict[str, str]], queries: Dict[str, str], **kwargs) -> Dict[str, Dict[str, float]]: - if not self.retriever: raise ValueError("Model/Technique has not been provided!") - return self.retriever.search(corpus, queries, self.top_k, self.score_function, **kwargs) - - def rerank(self, - corpus: Dict[str, Dict[str, str]], - queries: Dict[str, str], - results: Dict[str, Dict[str, float]], - top_k: int) -> Dict[str, Dict[str, float]]: - + + def __call__( + self, corpus: dict[str, dict[str, str]], queries: dict[str, str], **kwargs + ) -> dict[str, dict[str, float]]: + if not self.retriever: + raise ValueError("Model/Technique has not been provided!") + return self.retriever.search( + corpus, queries, self.top_k, self.score_function, **kwargs + ) + + def rerank( + self, + corpus: dict[str, dict[str, str]], + queries: dict[str, str], + results: dict[str, dict[str, float]], + top_k: int, + ) -> dict[str, dict[str, float]]: new_corpus = {} - + for query_id in results: if len(results[query_id]) > top_k: - for (doc_id, _) in sorted(results[query_id].items(), key=lambda item: item[1], reverse=True)[:top_k]: + for doc_id, _ in sorted( + results[query_id].items(), key=lambda item: item[1], reverse=True + )[:top_k]: new_corpus[doc_id] = corpus[doc_id] else: for doc_id in results[query_id]: new_corpus[doc_id] = corpus[doc_id] - + return self.retriever.search(new_corpus, queries, top_k, 
self.score_function) @staticmethod - def evaluate(qrels: Dict[str, Dict[str, int]], - results: Dict[str, Dict[str, float]], - k_values: List[int], - ignore_identical_ids: bool=True) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]]: - + def evaluate( + qrels: dict[str, dict[str, int]], + results: dict[str, dict[str, float]], + k_values: List[int], + ignore_identical_ids: bool = True, + ) -> Tuple[Dict[str, float], dict[str, float], dict[str, float], dict[str, float]]: if ignore_identical_ids: - logger.info('For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.') + logger.info( + "For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this." + ) popped = [] for qid, rels in results.items(): for pid in list(rels): @@ -193,49 +258,60 @@ def evaluate(qrels: Dict[str, Dict[str, int]], _map = {} recall = {} precision = {} - + for k in k_values: ndcg[f"NDCG@{k}"] = 0.0 _map[f"MAP@{k}"] = 0.0 recall[f"Recall@{k}"] = 0.0 precision[f"P@{k}"] = 0.0 - + map_string = "map_cut." + ",".join([str(k) for k in k_values]) ndcg_string = "ndcg_cut." + ",".join([str(k) for k in k_values]) recall_string = "recall." + ",".join([str(k) for k in k_values]) precision_string = "P." + ",".join([str(k) for k in k_values]) - evaluator = pytrec_eval.RelevanceEvaluator(qrels, {map_string, ndcg_string, recall_string, precision_string}) + evaluator = pytrec_eval.RelevanceEvaluator( + qrels, {map_string, ndcg_string, recall_string, precision_string} + ) scores = evaluator.evaluate(results) - + for query_id in scores.keys(): for k in k_values: ndcg[f"NDCG@{k}"] += scores[query_id]["ndcg_cut_" + str(k)] _map[f"MAP@{k}"] += scores[query_id]["map_cut_" + str(k)] recall[f"Recall@{k}"] += scores[query_id]["recall_" + str(k)] - precision[f"P@{k}"] += scores[query_id]["P_"+ str(k)] - + precision[f"P@{k}"] += scores[query_id]["P_" + str(k)] + for k in k_values: - ndcg[f"NDCG@{k}"] = round(ndcg[f"NDCG@{k}"]/len(scores), 5) - _map[f"MAP@{k}"] = round(_map[f"MAP@{k}"]/len(scores), 5) - recall[f"Recall@{k}"] = round(recall[f"Recall@{k}"]/len(scores), 5) - precision[f"P@{k}"] = round(precision[f"P@{k}"]/len(scores), 5) - + ndcg[f"NDCG@{k}"] = round(ndcg[f"NDCG@{k}"] / len(scores), 5) + _map[f"MAP@{k}"] = round(_map[f"MAP@{k}"] / len(scores), 5) + recall[f"Recall@{k}"] = round(recall[f"Recall@{k}"] / len(scores), 5) + precision[f"P@{k}"] = round(precision[f"P@{k}"] / len(scores), 5) + for eval in [ndcg, _map, recall, precision]: logger.info("\n") for k in eval.keys(): logger.info("{}: {:.4f}".format(k, eval[k])) return ndcg, _map, recall, precision - + @staticmethod - def evaluate_custom(qrels: Dict[str, Dict[str, int]], - results: Dict[str, Dict[str, float]], - k_values: List[int], metric: str) -> Tuple[Dict[str, float]]: + def evaluate_custom( + qrels: dict[str, dict[str, int]], + results: dict[str, dict[str, float]], + k_values: List[int], + metric: str, + ) -> Tuple[Dict[str, float]]: if metric.lower() in ["mrr", "mrr@k", "mrr_cut"]: return mrr(qrels, results, k_values) elif metric.lower() in ["recall_cap", "r_cap", "r_cap@k"]: return recall_cap(qrels, results, k_values) elif metric.lower() in ["hole", "hole@k"]: return hole(qrels, results, k_values) - elif metric.lower() in ["acc", "top_k_acc", "accuracy", "accuracy@k", "top_k_accuracy"]: + elif metric.lower() in [ + "acc", + "top_k_acc", + "accuracy", + "accuracy@k", + "top_k_accuracy", + ]: 
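# A quick sanity-check of the metric dispatch above (the qrels/results values
# here are illustrative, not part of this patch): with
#     qrels = {"q1": {"d1": 1}}
#     results = {"q1": {"d1": 0.9, "d2": 0.4}}
# RetrievalEvaluator.evaluate_custom(qrels, results, [1], metric="mrr")
# returns {"MRR@1": 1.0} via the mrr() helper in utils.py, diffed below.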
return top_k_accuracy(qrels, results, k_values) diff --git a/mteb/evaluation/evaluators/STSEvaluator.py b/mteb/evaluation/evaluators/STSEvaluator.py index 5cfbbce2e6..0fdc4ecc28 100644 --- a/mteb/evaluation/evaluators/STSEvaluator.py +++ b/mteb/evaluation/evaluators/STSEvaluator.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import numpy as np @@ -8,13 +10,15 @@ paired_manhattan_distances, ) -logger = logging.getLogger(__name__) - from .Evaluator import Evaluator +logger = logging.getLogger(__name__) + class STSEvaluator(Evaluator): - def __init__(self, sentences1, sentences2, gold_scores, batch_size=64, limit=None, **kwargs): + def __init__( + self, sentences1, sentences2, gold_scores, batch_size=64, limit=None, **kwargs + ): super().__init__(**kwargs) if limit is not None: sentences1 = sentences1[:limit] @@ -27,9 +31,13 @@ def __init__(self, sentences1, sentences2, gold_scores, batch_size=64, limit=Non def __call__(self, model): logger.info(f"Encoding {len(self.sentences1)} sentences1...") - embeddings1 = np.asarray(model.encode(self.sentences1, batch_size=self.batch_size)) + embeddings1 = np.asarray( + model.encode(self.sentences1, batch_size=self.batch_size) + ) logger.info(f"Encoding {len(self.sentences2)} sentences2...") - embeddings2 = np.asarray(model.encode(self.sentences2, batch_size=self.batch_size)) + embeddings2 = np.asarray( + model.encode(self.sentences2, batch_size=self.batch_size) + ) logger.info("Evaluating...") cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2)) diff --git a/mteb/evaluation/evaluators/SummarizationEvaluator.py b/mteb/evaluation/evaluators/SummarizationEvaluator.py index 86343611f9..a779ec51c7 100644 --- a/mteb/evaluation/evaluators/SummarizationEvaluator.py +++ b/mteb/evaluation/evaluators/SummarizationEvaluator.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import numpy as np @@ -5,12 +7,11 @@ import tqdm from scipy.stats import pearsonr, spearmanr +from .Evaluator import Evaluator from .utils import cos_sim, dot_score logger = logging.getLogger(__name__) -from .Evaluator import Evaluator - class SummarizationEvaluator(Evaluator): def __init__( @@ -21,7 +22,7 @@ def __init__( gold_scores=None, limit=None, batch_size=32, - **kwargs + **kwargs, ): # human_summaries shape: (None, num_human_summaries) # machine_summaries shape: (None, num_machine_summaries) @@ -47,22 +48,36 @@ def __call__(self, model): # Get the human & machine summaries for the text in one go for all human_lens = [len(human_summaries) for human_summaries in self.human_summaries] - machine_lens = [len(machine_summaries) for machine_summaries in self.machine_summaries] + machine_lens = [ + len(machine_summaries) for machine_summaries in self.machine_summaries + ] logger.info(f"Encoding {sum(human_lens)} human summaries...") embs_human_summaries_all = model.encode( - [summary for human_summaries in self.human_summaries for summary in human_summaries], + [ + summary + for human_summaries in self.human_summaries + for summary in human_summaries + ], batch_size=self.batch_size, ) logger.info(f"Encoding {sum(machine_lens)} machine summaries...") embs_machine_summaries_all = model.encode( - [summary for machine_summaries in self.machine_summaries for summary in machine_summaries], + [ + summary + for machine_summaries in self.machine_summaries + for summary in machine_summaries + ], batch_size=self.batch_size, ) # Split the embeddings into the original human & machine summaries - embs_human_summaries_all = 
np.split(embs_human_summaries_all, np.cumsum(human_lens)[:-1]) - embs_machine_summaries_all = np.split(embs_machine_summaries_all, np.cumsum(machine_lens)[:-1]) + embs_human_summaries_all = np.split( + embs_human_summaries_all, np.cumsum(human_lens)[:-1] + ) + embs_machine_summaries_all = np.split( + embs_machine_summaries_all, np.cumsum(machine_lens)[:-1] + ) for i, (embs_human_summaries, embs_machine_summaries) in tqdm.tqdm( enumerate(zip(embs_human_summaries_all, embs_machine_summaries_all)), diff --git a/mteb/evaluation/evaluators/__init__.py b/mteb/evaluation/evaluators/__init__.py index c39d915a7a..a1dc8faaa5 100644 --- a/mteb/evaluation/evaluators/__init__.py +++ b/mteb/evaluation/evaluators/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from .BitextMiningEvaluator import * from .ClassificationEvaluator import * from .ClusteringEvaluator import * diff --git a/mteb/evaluation/evaluators/utils.py b/mteb/evaluation/evaluators/utils.py index be596f6bfc..075b9d8700 100644 --- a/mteb/evaluation/evaluators/utils.py +++ b/mteb/evaluation/evaluators/utils.py @@ -1,5 +1,7 @@ +from __future__ import annotations + import logging -from typing import List, Dict, Union, Tuple +from typing import Dict, List, Tuple import torch @@ -45,24 +47,30 @@ def dot_score(a: torch.Tensor, b: torch.Tensor): return torch.mm(a, b.transpose(0, 1)) + # From https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/custom_metrics.py#L4 -def mrr(qrels: Dict[str, Dict[str, int]], - results: Dict[str, Dict[str, float]], - k_values: List[int]) -> Tuple[Dict[str, float]]: - +def mrr( + qrels: dict[str, dict[str, int]], + results: dict[str, dict[str, float]], + k_values: List[int], +) -> Tuple[Dict[str, float]]: MRR = {} - + for k in k_values: MRR[f"MRR@{k}"] = 0.0 - + k_max, top_hits = max(k_values), {} logging.info("\n") - + for query_id, doc_scores in results.items(): - top_hits[query_id] = sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)[0:k_max] - + top_hits[query_id] = sorted( + doc_scores.items(), key=lambda item: item[1], reverse=True + )[0:k_max] + for query_id in top_hits: - query_relevant_docs = set([doc_id for doc_id in qrels[query_id] if qrels[query_id][doc_id] > 0]) + query_relevant_docs = set( + [doc_id for doc_id in qrels[query_id] if qrels[query_id][doc_id] > 0] + ) for k in k_values: for rank, hit in enumerate(top_hits[query_id][0:k]): if hit[0] in query_relevant_docs: @@ -70,85 +78,106 @@ def mrr(qrels: Dict[str, Dict[str, int]], break for k in k_values: - MRR[f"MRR@{k}"] = round(MRR[f"MRR@{k}"]/len(qrels), 5) + MRR[f"MRR@{k}"] = round(MRR[f"MRR@{k}"] / len(qrels), 5) logging.info("MRR@{}: {:.4f}".format(k, MRR[f"MRR@{k}"])) return MRR -def recall_cap(qrels: Dict[str, Dict[str, int]], - results: Dict[str, Dict[str, float]], - k_values: List[int]) -> Tuple[Dict[str, float]]: - + +def recall_cap( + qrels: dict[str, dict[str, int]], + results: dict[str, dict[str, float]], + k_values: List[int], +) -> Tuple[Dict[str, float]]: capped_recall = {} - + for k in k_values: capped_recall[f"R_cap@{k}"] = 0.0 - + k_max = max(k_values) logging.info("\n") - + for query_id, doc_scores in results.items(): - top_hits = sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)[0:k_max] - query_relevant_docs = [doc_id for doc_id in qrels[query_id] if qrels[query_id][doc_id] > 0] + top_hits = sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)[ + 0:k_max + ] + query_relevant_docs = [ + doc_id for doc_id in qrels[query_id] if 
qrels[query_id][doc_id] > 0 + ] for k in k_values: - retrieved_docs = [row[0] for row in top_hits[0:k] if qrels[query_id].get(row[0], 0) > 0] + retrieved_docs = [ + row[0] for row in top_hits[0:k] if qrels[query_id].get(row[0], 0) > 0 + ] denominator = min(len(query_relevant_docs), k) - capped_recall[f"R_cap@{k}"] += (len(retrieved_docs) / denominator) + capped_recall[f"R_cap@{k}"] += len(retrieved_docs) / denominator for k in k_values: - capped_recall[f"R_cap@{k}"] = round(capped_recall[f"R_cap@{k}"]/len(qrels), 5) + capped_recall[f"R_cap@{k}"] = round(capped_recall[f"R_cap@{k}"] / len(qrels), 5) logging.info("R_cap@{}: {:.4f}".format(k, capped_recall[f"R_cap@{k}"])) return capped_recall -def hole(qrels: Dict[str, Dict[str, int]], - results: Dict[str, Dict[str, float]], - k_values: List[int]) -> Tuple[Dict[str, float]]: - +def hole( + qrels: dict[str, dict[str, int]], + results: dict[str, dict[str, float]], + k_values: List[int], +) -> Tuple[Dict[str, float]]: Hole = {} - + for k in k_values: Hole[f"Hole@{k}"] = 0.0 - + annotated_corpus = set() for _, docs in qrels.items(): - for doc_id, score in docs.items(): + for doc_id, score in docs.items(): annotated_corpus.add(doc_id) - + k_max = max(k_values) logging.info("\n") - + for _, scores in results.items(): - top_hits = sorted(scores.items(), key=lambda item: item[1], reverse=True)[0:k_max] + top_hits = sorted(scores.items(), key=lambda item: item[1], reverse=True)[ + 0:k_max + ] for k in k_values: - hole_docs = [row[0] for row in top_hits[0:k] if row[0] not in annotated_corpus] + hole_docs = [ + row[0] for row in top_hits[0:k] if row[0] not in annotated_corpus + ] Hole[f"Hole@{k}"] += len(hole_docs) / k for k in k_values: - Hole[f"Hole@{k}"] = round(Hole[f"Hole@{k}"]/len(qrels), 5) + Hole[f"Hole@{k}"] = round(Hole[f"Hole@{k}"] / len(qrels), 5) logging.info("Hole@{}: {:.4f}".format(k, Hole[f"Hole@{k}"])) return Hole + def top_k_accuracy( - qrels: Dict[str, Dict[str, int]], - results: Dict[str, Dict[str, float]], - k_values: List[int]) -> Tuple[Dict[str, float]]: - + qrels: dict[str, dict[str, int]], + results: dict[str, dict[str, float]], + k_values: List[int], +) -> Tuple[Dict[str, float]]: top_k_acc = {} - + for k in k_values: top_k_acc[f"Accuracy@{k}"] = 0.0 - + k_max, top_hits = max(k_values), {} logging.info("\n") - + for query_id, doc_scores in results.items(): - top_hits[query_id] = [item[0] for item in sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)[0:k_max]] - + top_hits[query_id] = [ + item[0] + for item in sorted( + doc_scores.items(), key=lambda item: item[1], reverse=True + )[0:k_max] + ] + for query_id in top_hits: - query_relevant_docs = set([doc_id for doc_id in qrels[query_id] if qrels[query_id][doc_id] > 0]) + query_relevant_docs = set( + [doc_id for doc_id in qrels[query_id] if qrels[query_id][doc_id] > 0] + ) for k in k_values: for relevant_doc_id in query_relevant_docs: if relevant_doc_id in top_hits[query_id][0:k]: @@ -156,7 +185,7 @@ def top_k_accuracy( break for k in k_values: - top_k_acc[f"Accuracy@{k}"] = round(top_k_acc[f"Accuracy@{k}"]/len(qrels), 5) + top_k_acc[f"Accuracy@{k}"] = round(top_k_acc[f"Accuracy@{k}"] / len(qrels), 5) logging.info("Accuracy@{}: {:.4f}".format(k, top_k_acc[f"Accuracy@{k}"])) return top_k_acc diff --git a/mteb/logging.py b/mteb/logging.py index e3dbcf2c0b..5d5d994502 100644 --- a/mteb/logging.py +++ b/mteb/logging.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging @@ -23,5 +25,7 @@ def enable_explicit_format() -> None: handlers = 
_get_library_root_logger().handlers for handler in handlers: - formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s") + formatter = logging.Formatter( + "[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s" + ) handler.setFormatter(formatter) diff --git a/mteb/tasks/BitextMining/__init__.py b/mteb/tasks/BitextMining/__init__.py index cc4ce2c6f3..f5901f6576 100644 --- a/mteb/tasks/BitextMining/__init__.py +++ b/mteb/tasks/BitextMining/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from .da.BornholmskBitextMining import * from .multilingual.BUCCBitextMining import * from .multilingual.DiaBLaBitextMining import * diff --git a/mteb/tasks/BitextMining/da/BornholmskBitextMining.py b/mteb/tasks/BitextMining/da/BornholmskBitextMining.py index a58e003a38..1fe293f7ce 100644 --- a/mteb/tasks/BitextMining/da/BornholmskBitextMining.py +++ b/mteb/tasks/BitextMining/da/BornholmskBitextMining.py @@ -1,25 +1,39 @@ +from __future__ import annotations + import datasets from mteb.abstasks import AbsTaskBitextMining +from mteb.abstasks.TaskMetadata import TaskMetadata class BornholmBitextMining(AbsTaskBitextMining): + metadata = TaskMetadata( + name="BornholmBitextMining", + hf_hub_name="strombergnlp/bornholmsk_parallel", + description="Danish Bornholmsk Parallel Corpus. Bornholmsk is a Danish dialect spoken on the island of Bornholm, Denmark. Historically it is a part of east Danish which was also spoken in Scania and Halland, Sweden.", + reference="https://aclanthology.org/W19-6138/", + type="BitextMining", + category="s2s", + eval_splits=["test"], + eval_langs=["da", "da-bornholm"], + main_score="f1", + revision="3bc5cfb4ec514264fe2db5615fac9016f7251552", + date=None, + form=None, + domains=None, + license=None, + task_subtypes=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "BornholmBitextMining", - "hf_hub_name": "strombergnlp/bornholmsk_parallel", - "description": "Danish Bornholmsk Parallel Corpus. " - + "Bornholmsk is a Danish dialect spoken on the island of Bornholm, Denmark. 
" - + "Historically it is a part of east Danish which was also spoken in Scania and Halland, Sweden.", - "reference": "https://aclanthology.org/W19-6138/", - "type": "BitextMining", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["da", "da-bornholm"], - "main_score": "f1", - "revision": "3bc5cfb4ec514264fe2db5615fac9016f7251552", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) + return def load_data(self, **kwargs): """ @@ -29,8 +43,8 @@ def load_data(self, **kwargs): return self.dataset = datasets.load_dataset( - self.description["hf_hub_name"], - revision=self.description.get("revision", None), + self.metadata_dict["hf_hub_name"], + revision=self.metadata_dict.get("revision", None), ) self.dataset_transform() self.data_loaded = True diff --git a/mteb/tasks/BitextMining/multilingual/BUCCBitextMining.py b/mteb/tasks/BitextMining/multilingual/BUCCBitextMining.py index edcdbcf736..82350a232f 100644 --- a/mteb/tasks/BitextMining/multilingual/BUCCBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/BUCCBitextMining.py @@ -1,20 +1,36 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskBitextMining, CrosslingualTask _LANGUAGES = ["de-en", "fr-en", "ru-en", "zh-en"] class BUCCBitextMining(AbsTaskBitextMining, CrosslingualTask): + metadata = TaskMetadata( + name="BUCC", + hf_hub_name="mteb/bucc-bitext-mining", + description="BUCC bitext mining dataset", + reference="https://comparable.limsi.fr/bucc2018/bucc2018-task.html", + type="BitextMining", + category="s2s", + eval_splits=["test"], + eval_langs=_LANGUAGES, + main_score="f1", + revision="d51519689f32196a32af33b075a01d0e7c51e252", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "BUCC", - "hf_hub_name": "mteb/bucc-bitext-mining", - "description": "BUCC bitext mining dataset", - "reference": "https://comparable.limsi.fr/bucc2018/bucc2018-task.html", - "type": "BitextMining", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": _LANGUAGES, - "main_score": "f1", - "revision": "d51519689f32196a32af33b075a01d0e7c51e252", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/BitextMining/multilingual/DiaBLaBitextMining.py b/mteb/tasks/BitextMining/multilingual/DiaBLaBitextMining.py index c0827aebf0..4fd3546bc1 100644 --- a/mteb/tasks/BitextMining/multilingual/DiaBLaBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/DiaBLaBitextMining.py @@ -1,28 +1,39 @@ -import json +from __future__ import annotations + import datasets +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskBitextMining, CrosslingualTask class DiaBLaBitextMining(AbsTaskBitextMining, CrosslingualTask): + metadata = TaskMetadata( + name="DiaBlaBitextMining", + hf_hub_name="rbawden/DiaBLa", + description="English-French Parallel Corpus. 
DiaBLa is an English-French dataset for the evaluation of Machine Translation (MT) for informal, written bilingual dialogue.", + reference="https://inria.hal.science/hal-03021633", + type="BitextMining", + category="s2s", + eval_splits=["test"], + eval_langs=["fr-en", "en-fr"], + main_score="f1", + revision="5345895c56a601afe1a98519ce3199be60a27dba", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "DiaBLaBitextMining", - "hf_hub_name": "rbawden/DiaBLa", - "description": ( - "English-French Parallel Corpus. " - + "DiaBLa is an English-French dataset for the evaluation of Machine Translation (MT) for informal," - " written bilingual dialogue." - ), - "reference": "https://inria.hal.science/hal-03021633", - "type": "BitextMining", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["fr-en"], - "main_score": "f1", - "revision": "5345895c56a601afe1a98519ce3199be60a27dba", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): """ @@ -30,12 +41,12 @@ def load_data(self, **kwargs): """ if self.data_loaded: return - + self.dataset = {} for lang in self.langs: self.dataset[lang] = datasets.load_dataset( - self.description["hf_hub_name"], - revision=self.description.get("revision", None), + self.metadata_dict["hf_hub_name"], + revision=self.metadata_dict.get("revision", None), ) self.dataset_transform() @@ -45,8 +56,12 @@ def dataset_transform(self): def create_columns(row): """Put all French texts in column 'sentence1' and English texts in 'sentence2' column""" row["orig_lang"] = row["utterance_meta"]["lang"] - row["sentence1"] = row["orig"] if row["orig_lang"] == "french" else row["ref"] - row["sentence2"] = row["orig"] if not row["orig_lang"] == "french" else row["ref"] + row["sentence1"] = ( + row["orig"] if row["orig_lang"] == "french" else row["ref"] + ) + row["sentence2"] = ( + row["orig"] if not row["orig_lang"] == "french" else row["ref"] + ) return row # Convert to standard format diff --git a/mteb/tasks/BitextMining/multilingual/FloresBitextMining.py b/mteb/tasks/BitextMining/multilingual/FloresBitextMining.py index 8fd5537732..008d69a524 100644 --- a/mteb/tasks/BitextMining/multilingual/FloresBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/FloresBitextMining.py @@ -1,4 +1,9 @@ +from __future__ import annotations + import datasets + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskBitextMining, CrosslingualTask _LANGUAGES = [ @@ -223,24 +228,37 @@ def extend_lang_pairs(): if pair not in _LANGUAGES: _LANGUAGES_PAIRS.append(pair) + extend_lang_pairs() class FloresBitextMining(AbsTaskBitextMining, CrosslingualTask): + metadata = TaskMetadata( + name="FloresBitextMining", + hf_hub_name="facebook/flores", + description="FLORES is a benchmark dataset for machine translation between English and low-resource languages.", + reference="https://huggingface.co/datasets/facebook/flores", + type="BitextMining", + category="s2s", + eval_splits=_SPLIT, + eval_langs=_LANGUAGES_PAIRS, + main_score="f1", + revision="80dc3040d19756742c9a18267ab30f54fb8e226b", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def 
description(self):
-        return {
-            "name": "FloresBitextMining",
-            "hf_hub_name": "facebook/flores",
-            "description": "FLORES is a benchmark dataset for machine translation between English and low-resource languages.",
-            "reference": "https://huggingface.co/datasets/facebook/flores",
-            "type": "BitextMining",
-            "category": "s2s",
-            "eval_splits": _SPLIT,
-            "eval_langs": _LANGUAGES_PAIRS,
-            "main_score": "f1",
-            "revision": "80dc3040d19756742c9a18267ab30f54fb8e226b",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)

     def load_data(self, **kwargs):
         """
@@ -251,9 +269,9 @@ def load_data(self, **kwargs):
         self.dataset = {}
         for lang in self.langs:
             self.dataset[lang] = datasets.load_dataset(
-                self.description["hf_hub_name"],
+                self.metadata_dict["hf_hub_name"],
                 lang,
-                revision=self.description.get("revision", None),
+                revision=self.metadata_dict.get("revision", None),
             )
         self.dataset_transform()
         self.data_loaded = True
@@ -264,5 +282,9 @@ def dataset_transform(self):
             lang1 = lang.split("-")[0]
             lang2 = lang.split("-")[1]
             for split in _SPLIT:
-                self.dataset[lang][split] = self.dataset[lang][split].rename_column("sentence_" + lang1, "sentence1")
-                self.dataset[lang][split] = self.dataset[lang][split].rename_column("sentence_" + lang2, "sentence2")
+                self.dataset[lang][split] = self.dataset[lang][split].rename_column(
+                    "sentence_" + lang1, "sentence1"
+                )
+                self.dataset[lang][split] = self.dataset[lang][split].rename_column(
+                    "sentence_" + lang2, "sentence2"
+                )
diff --git a/mteb/tasks/BitextMining/multilingual/NorwegianCourtsBitextMining.py b/mteb/tasks/BitextMining/multilingual/NorwegianCourtsBitextMining.py
index 93e3a434c6..0011ac934b 100644
--- a/mteb/tasks/BitextMining/multilingual/NorwegianCourtsBitextMining.py
+++ b/mteb/tasks/BitextMining/multilingual/NorwegianCourtsBitextMining.py
@@ -1,26 +1,38 @@
+from __future__ import annotations
+
 import datasets

 from mteb.abstasks import AbsTaskBitextMining
+from mteb.abstasks.TaskMetadata import TaskMetadata


 class NorwegianCourtsBitextMining(AbsTaskBitextMining):
+    metadata = TaskMetadata(
+        name="NorwegianCourtsBitextMining",
+        hf_hub_name="kardosdrur/norwegian-courts",
+        description="Nynorsk and Bokmål parallel corpus from Norwegian courts. Norway has two standardised written languages. Bokmål is a variant closer to Danish, while Nynorsk was created to resemble regional dialects of Norwegian.",
+        reference="https://opus.nlpl.eu/index.php",
+        type="BitextMining",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["nb", "nn"],
+        main_score="f1",
+        revision="d79af07e969a6678fcbbe819956840425816468f",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "NorwegianCourtsBitextMining",
-            "hf_hub_name": "kardosdrur/norwegian-courts",
-            "description": "Nynorsk and Bokmål parallel corpus from Norwegian courts. "
-            + "Norway has two standardised written languages. 
" - + "Bokmål is a variant closer to Danish, while Nynorsk was created to resemble " - + "regional dialects of Norwegian.", - "reference": "https://opus.nlpl.eu/index.php", - "type": "BitextMining", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["nb", "nn"], - "main_score": "f1", - "revision": "d79af07e969a6678fcbbe819956840425816468f", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): """ @@ -30,8 +42,8 @@ def load_data(self, **kwargs): return self.dataset = datasets.load_dataset( - self.description["hf_hub_name"], - revision=self.description.get("revision", None), + self.metadata_dict["hf_hub_name"], + revision=self.metadata_dict.get("revision", None), ) self.dataset_transform() self.data_loaded = True diff --git a/mteb/tasks/BitextMining/multilingual/TatoebaBitextMining.py b/mteb/tasks/BitextMining/multilingual/TatoebaBitextMining.py index 8fbc050f98..7d2d137f86 100644 --- a/mteb/tasks/BitextMining/multilingual/TatoebaBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/TatoebaBitextMining.py @@ -1,3 +1,7 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskBitextMining, CrosslingualTask _LANGUAGES = [ @@ -117,17 +121,29 @@ class TatoebaBitextMining(AbsTaskBitextMining, CrosslingualTask): + metadata = TaskMetadata( + name="Tatoeba", + hf_hub_name="facebook/flores", + description="1,000 English-aligned sentence pairs for each language based on the Tatoeba corpus", + reference="https://huggingface.co/datasets/facebook/flores", + type="BitextMining", + category="s2s", + eval_splits=["test"], + eval_langs=_LANGUAGES, + main_score="f1", + revision="80dc3040d19756742c9a18267ab30f54fb8e226b", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "Tatoeba", - "hf_hub_name": "mteb/tatoeba-bitext-mining", - "description": "1,000 English-aligned sentence pairs for each language based on the Tatoeba corpus", - "reference": "https://github.com/facebookresearch/LASER/tree/main/data/tatoeba/v1", - "type": "BitextMining", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": _LANGUAGES, - "main_score": "f1", - "revision": "9080400076fbadbb4c4dcb136ff4eddc40b42553", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py index 13b8a9be82..878557aeb4 100644 --- a/mteb/tasks/Classification/__init__.py +++ b/mteb/tasks/Classification/__init__.py @@ -1,25 +1,27 @@ -from .multilingual.AmazonCounterfactualClassification import * -from .en.AmazonPolarityClassification import * -from .multilingual.AmazonReviewsClassification import * +from __future__ import annotations + from .da.AngryTweetsClassification import * -from .en.Banking77Classification import * -from .zh.CMTEBClassification import * from .da.DalajClassification import * from .da.DanishPoliticalCommentsClassification import * from .da.DKHateClassification import * +from .da.LccSentimentClassification import * +from .en.AmazonPolarityClassification import * +from .en.Banking77Classification import * from .en.EmotionClassification import * from .en.ImdbClassification import * -from .da.LccSentimentClassification import * +from .en.ToxicConversationsClassification import * 
+from .en.TweetSentimentExtractionClassification import * +from .multilingual.AmazonCounterfactualClassification import * +from .multilingual.AmazonReviewsClassification import * from .multilingual.MasakhaNEWSClassification import * from .multilingual.MassiveIntentClassification import * from .multilingual.MassiveScenarioClassification import * from .multilingual.MTOPDomainClassification import * from .multilingual.MTOPIntentClassification import * from .multilingual.NordicLangClassification import * +from .multilingual.ScalaClassification import * from .nb.NoRecClassification import * from .nb.NorwegianParliamentClassification import * from .pl.PolishClassification import * -from .multilingual.ScalaClassification import * from .sv.SweRecClassification import * -from .en.ToxicConversationsClassification import * -from .en.TweetSentimentExtractionClassification import * +from .zh.CMTEBClassification import * diff --git a/mteb/tasks/Classification/da/AngryTweetsClassification.py b/mteb/tasks/Classification/da/AngryTweetsClassification.py index a4a441ef82..cba265f003 100644 --- a/mteb/tasks/Classification/da/AngryTweetsClassification.py +++ b/mteb/tasks/Classification/da/AngryTweetsClassification.py @@ -1,20 +1,36 @@ +from __future__ import annotations + from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class AngryTweetsClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="AngryTweetsClassification", + hf_hub_name="DDSC/angry-tweets", + description="A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets", + reference="https://aclanthology.org/2021.nodalida-main.53/", + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["da"], + main_score="accuracy", + revision="20b0e6081892e78179356fada741b7afa381443d", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "AngryTweetsClassification", - "hf_hub_name": "DDSC/angry-tweets", - "description": "A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets", - "reference": "https://aclanthology.org/2021.nodalida-main.53/", - "eval_splits": ["test"], - "eval_langs": ["da"], - "type": "Classification", - "category": "s2s", - "main_score": "accuracy", - "n_experiments": 10, - "samples_per_label": 16, - "revision": "20b0e6081892e78179356fada741b7afa381443d", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["n_experiments"] = 10 + metadata_dict["samples_per_label"] = 16 + return metadata_dict diff --git a/mteb/tasks/Classification/da/DKHateClassification.py b/mteb/tasks/Classification/da/DKHateClassification.py index b98da4da45..85582c4e07 100644 --- a/mteb/tasks/Classification/da/DKHateClassification.py +++ b/mteb/tasks/Classification/da/DKHateClassification.py @@ -1,25 +1,41 @@ +from __future__ import annotations + import datasets from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class DKHateClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="DKHateClassification", + hf_hub_name="DDSC/dkhate", + description="Danish Tweets annotated for Hate Speech either being Offensive or not", + reference="https://aclanthology.org/2020.lrec-1.430/", + 
type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["da"], + main_score="accuracy", + revision="59d12749a3c91a186063c7d729ec392fda94681c", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "DKHateClassification", - "hf_hub_name": "DDSC/dkhate", - "description": "Danish Tweets annotated for Hate Speech either being Offensive or not", - "reference": "https://aclanthology.org/2020.lrec-1.430/", - "type": "Classification", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["da"], - "main_score": "accuracy", - "n_experiments": 10, - "samples_per_label": 16, - "revision": "59d12749a3c91a186063c7d729ec392fda94681c", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["n_experiments"] = 10 + metadata_dict["samples_per_label"] = 16 + return metadata_dict def load_data(self, **kwargs): """ @@ -29,7 +45,8 @@ def load_data(self, **kwargs): return self.dataset = datasets.load_dataset( - self.description["hf_hub_name"], revision=self.description.get("revision", None) + self.metadata_dict["hf_hub_name"], + revision=self.metadata_dict.get("revision", None), ) self.dataset_transform() self.data_loaded = True @@ -38,4 +55,6 @@ def dataset_transform(self): # convert label to a 0/1 label labels = self.dataset["train"]["label"] # type: ignore lab2idx = {lab: idx for idx, lab in enumerate(set(labels))} - self.dataset = self.dataset.map(lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]) + self.dataset = self.dataset.map( + lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"] + ) diff --git a/mteb/tasks/Classification/da/DalajClassification.py b/mteb/tasks/Classification/da/DalajClassification.py index d69dd0ff50..20785d18b6 100644 --- a/mteb/tasks/Classification/da/DalajClassification.py +++ b/mteb/tasks/Classification/da/DalajClassification.py @@ -1,26 +1,42 @@ # SuperLIM tasks +from __future__ import annotations + import datasets from mteb.abstasks import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class DalajClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="DalajClassification", + hf_hub_name="AI-Sweden/SuperLim", + description="A Swedish dataset for linguistic acceptability. Available as a part of Superlim.", + reference="https://spraakbanken.gu.se/en/resources/superlim", + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["da"], + main_score="accuracy", + revision="7ebf0b4caa7b2ae39698a889de782c09e6f5ee56", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "DalajClassification", - "hf_hub_name": "AI-Sweden/SuperLim", - "description": "A Swedish dataset for linguistic acceptability. 
Available as a part of Superlim.", - "reference": "https://spraakbanken.gu.se/en/resources/superlim", - "type": "Classification", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["sv"], - "main_score": "accuracy", - "n_experiments": 10, - "samples_per_label": 16, - "revision": "7ebf0b4caa7b2ae39698a889de782c09e6f5ee56", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["n_experiments"] = 10 + metadata_dict["samples_per_label"] = 16 + return metadata_dict def load_data(self, **kwargs): """ @@ -30,9 +46,9 @@ def load_data(self, **kwargs): return self.dataset = datasets.load_dataset( - self.description["hf_hub_name"], + self.metadata_dict["hf_hub_name"], "dalaj", # chose the relevant subset - revision=self.description.get("revision"), + revision=self.metadata_dict.get("revision"), ) self.dataset_transform() self.data_loaded = True @@ -45,15 +61,21 @@ def dataset_transform(self): def __convert_sample_to_classification(sample): text = sample["original_sentence"] + sample["corrected_sentence"] - label = [1] * len(sample["original_sentence"]) + [0] * len(sample["corrected_sentence"]) + label = [1] * len(sample["original_sentence"]) + [0] * len( + sample["corrected_sentence"] + ) return {"text": text, "label": label} columns_to_keep = ["original_sentence", "corrected_sentence"] for split in self.dataset: columns_names = self.dataset[split].column_names # type: ignore - columns_to_remove = [col for col in columns_names if col not in columns_to_keep] + columns_to_remove = [ + col for col in columns_names if col not in columns_to_keep + ] self.dataset[split] = self.dataset[split].remove_columns(columns_to_remove) # type: ignore self.dataset = self.dataset.map( - __convert_sample_to_classification, batched=True, remove_columns=columns_to_keep + __convert_sample_to_classification, + batched=True, + remove_columns=columns_to_keep, ) diff --git a/mteb/tasks/Classification/da/DanishPoliticalCommentsClassification.py b/mteb/tasks/Classification/da/DanishPoliticalCommentsClassification.py index 4506dcaff0..6dfe9ff582 100644 --- a/mteb/tasks/Classification/da/DanishPoliticalCommentsClassification.py +++ b/mteb/tasks/Classification/da/DanishPoliticalCommentsClassification.py @@ -1,25 +1,41 @@ +from __future__ import annotations + import datasets from mteb.abstasks import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class DanishPoliticalCommentsClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="DanishPoliticalCommentsClassification", + hf_hub_name="danish_political_comments", + description="A dataset of Danish political comments rated for sentiment", + reference="https://huggingface.co/datasets/danish_political_comments", + type="Classification", + category="s2s", + eval_splits=["train"], + eval_langs=["da"], + main_score="accuracy", + revision="edbb03726c04a0efab14fc8c3b8b79e4d420e5a1", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "DanishPoliticalCommentsClassification", - "hf_hub_name": "danish_political_comments", - "description": "A dataset of Danish political comments rated for sentiment", - "reference": "NA", - "type": "Classification", - "category": "s2s", - "eval_splits": ["train"], - "eval_langs": ["da"], - "main_score": "accuracy", - "n_experiments": 10, - 
"samples_per_label": 16, - "revision": "edbb03726c04a0efab14fc8c3b8b79e4d420e5a1", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["n_experiments"] = 10 + metadata_dict["samples_per_label"] = 16 + return metadata_dict def load_data(self, **kwargs): """ @@ -29,7 +45,8 @@ def load_data(self, **kwargs): return self.dataset = datasets.load_dataset( - self.description["hf_hub_name"], revision=self.description.get("revision") + self.metadata_dict["hf_hub_name"], + revision=self.metadata_dict.get("revision"), ) self.dataset_transform() self.data_loaded = True diff --git a/mteb/tasks/Classification/DdiscoCohesionClassification.py b/mteb/tasks/Classification/da/DdiscoCohesionClassification.py similarity index 65% rename from mteb/tasks/Classification/DdiscoCohesionClassification.py rename to mteb/tasks/Classification/da/DdiscoCohesionClassification.py index 98b68684f2..fff065f500 100644 --- a/mteb/tasks/Classification/DdiscoCohesionClassification.py +++ b/mteb/tasks/Classification/da/DdiscoCohesionClassification.py @@ -1,55 +1,33 @@ +from __future__ import annotations + from datasets import load_dataset from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class DdiscoCohesionClassification(AbsTaskClassification): - @property - def description(self): - return { - "name": "Ddisco", - "hf_hub_name": "DDSC/ddisco", - "description": "A Danish Discourse dataset with values for coherence and source (Wikipedia or Reddit)", - "reference": "https://aclanthology.org/2022.lrec-1.260/", - "type": "Classification", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["da"], - "main_score": "accuracy", - "revision": "514ab557579fcfba538a4078d6d647248a0e6eb7", - } - - def load_data(self, **kwargs): - """ - Load dataset from HuggingFace hub - """ - if self.data_loaded: - return - - self.dataset = load_dataset( - self.description["hf_hub_name"], revision=self.description.get("revision") - ) - self.dataset_transform() - self.data_loaded = True - - def dataset_transform(self): - self.dataset = self.dataset.rename_columns({"rating": "label"}).remove_columns( - ["domain"] - ) - - @property - def metadata(self): - return { - "date": "2012-01-01/2022-06-25", - "form": ["written"], - "domains": ["non-fiction", "social"], - "dialect": [], - "task_subtypes": ["Discourse coherence"], - "license": "cc-by-sa-3.0", - "socioeconomic_status": "high", - "annotations_creators": "expert-annotated", - "text_creation": "found", - "citation": """ + metadata = TaskMetadata( + name="Ddisco", + hf_hub_name="DDSC/ddisco", + description="A Danish Discourse dataset with values for coherence and source (Wikipedia or Reddit)", + reference="https://aclanthology.org/2022.lrec-1.260/", + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["da"], + main_score="accuracy", + revision="514ab557579fcfba538a4078d6d647248a0e6eb7", + date=("2021-01-01", "2022-06-25"), + form=["written"], + domains=["Non-fiction", "Social"], + dialect=[], + task_subtypes=["Discourse coherence"], + license="cc-by-sa-3.0", + socioeconomic_status="high", + annotations_creators="expert-annotated", + text_creation="found", + bibtex_citation=""" @inproceedings{flansmose-mikkelsen-etal-2022-ddisco, title = "{DD}is{C}o: A Discourse Coherence Dataset for {D}anish", author = "Flansmose Mikkelsen, Linea and @@ -78,7 +56,28 @@ def metadata(self): pages = "2440--2445", abstract = "To date, there has been no resource for 
studying discourse coherence on real-world Danish texts. Discourse coherence has mostly been approached with the assumption that incoherent texts can be represented by coherent texts in which sentences have been shuffled. However, incoherent real-world texts rarely resemble that. We thus present DDisCo, a dataset including text from the Danish Wikipedia and Reddit annotated for discourse coherence. We choose to annotate real-world texts instead of relying on artificially incoherent text for training and testing models. Then, we evaluate the performance of several methods, including neural networks, on the dataset.", } - """, - } + """, + ) + def load_data(self, **kwargs): + """ + Load dataset from HuggingFace hub + """ + if self.data_loaded: + return + self.dataset = load_dataset( + self.metadata_dict["hf_hub_name"], + revision=self.metadata_dict.get("revision"), + ) + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + self.dataset = self.dataset.rename_columns({"rating": "label"}).remove_columns( + ["domain"] + ) + + @property + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Classification/da/LccSentimentClassification.py b/mteb/tasks/Classification/da/LccSentimentClassification.py index 6729630db8..e1c000a446 100644 --- a/mteb/tasks/Classification/da/LccSentimentClassification.py +++ b/mteb/tasks/Classification/da/LccSentimentClassification.py @@ -1,20 +1,36 @@ +from __future__ import annotations + from mteb.abstasks import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class LccSentimentClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="LccSentimentClassification", + hf_hub_name="DDSC/lcc", + description="The leipzig corpora collection, annotated for sentiment", + reference="https://github.com/fnielsen/lcc-sentiment", + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["da"], + main_score="accuracy", + revision="de7ba3406ee55ea2cc52a0a41408fa6aede6d3c6", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "LccSentimentClassification", - "hf_hub_name": "DDSC/lcc", - "description": "The leipzig corpora collection, annotated for sentiment", - "reference": "https://github.com/fnielsen/lcc-sentiment", - "type": "Classification", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["da"], - "main_score": "accuracy", - "n_experiments": 10, - "samples_per_label": 16, - "revision": "de7ba3406ee55ea2cc52a0a41408fa6aede6d3c6", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["n_experiments"] = 10 + metadata_dict["samples_per_label"] = 16 + return metadata_dict diff --git a/mteb/tasks/Classification/en/AmazonPolarityClassification.py b/mteb/tasks/Classification/en/AmazonPolarityClassification.py index ca97211d33..591b02e052 100644 --- a/mteb/tasks/Classification/en/AmazonPolarityClassification.py +++ b/mteb/tasks/Classification/en/AmazonPolarityClassification.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskClassification class AmazonPolarityClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="AmazonPolarityClassification", + description="Amazon 
Polarity Classification Dataset.", + reference="https://huggingface.co/datasets/amazon_polarity", + hf_hub_name="mteb/amazon_polarity", + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="accuracy", + revision="e2d317d38cd51312af73b3d32a06d1a08b442046", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "AmazonPolarityClassification", - "hf_hub_name": "mteb/amazon_polarity", - "description": "Amazon Polarity Classification Dataset.", - "reference": "https://dl.acm.org/doi/10.1145/2507157.2507163", - "category": "p2p", - "type": "Classification", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "accuracy", - "revision": "e2d317d38cd51312af73b3d32a06d1a08b442046", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Classification/en/Banking77Classification.py b/mteb/tasks/Classification/en/Banking77Classification.py index 93a7c5e178..4c042cb818 100644 --- a/mteb/tasks/Classification/en/Banking77Classification.py +++ b/mteb/tasks/Classification/en/Banking77Classification.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskClassification class Banking77Classification(AbsTaskClassification): + metadata = TaskMetadata( + name="Banking77Classification", + description="Dataset composed of online banking queries annotated with their corresponding intents.", + reference="https://arxiv.org/abs/2003.04807", + hf_hub_name="mteb/banking77", + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="accuracy", + revision="0fd18e25b25c072e09e0d92ab615fda904d66300", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "Banking77Classification", - "hf_hub_name": "mteb/banking77", - "description": "Dataset composed of online banking queries annotated with their corresponding intents.", - "reference": "https://arxiv.org/abs/2003.04807", - "category": "s2s", - "type": "Classification", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "accuracy", - "revision": "0fd18e25b25c072e09e0d92ab615fda904d66300", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Classification/en/EmotionClassification.py b/mteb/tasks/Classification/en/EmotionClassification.py index a7bf1e77ab..1b3379b7d6 100644 --- a/mteb/tasks/Classification/en/EmotionClassification.py +++ b/mteb/tasks/Classification/en/EmotionClassification.py @@ -1,23 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskClassification class EmotionClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="EmotionClassification", + description="Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise.", + reference="https://www.aclweb.org/anthology/D18-1404", + hf_hub_name="mteb/emotion", + type="Classification", + category="s2s", + eval_splits=["validation", "test"], + 
eval_langs=["en"], + main_score="accuracy", + revision="4f58c6b202a23cf9a4da393831edf4f9183cad37", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "EmotionClassification", - "hf_hub_name": "mteb/emotion", - "description": ( - "Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love," - " sadness, and surprise. For more detailed information please refer to the paper." - ), - "reference": "https://www.aclweb.org/anthology/D18-1404", - "category": "s2s", - "type": "Classification", - "eval_splits": ["validation", "test"], - "eval_langs": ["en"], - "main_score": "accuracy", - "n_experiments": 10, - "samples_per_label": 16, - "revision": "4f58c6b202a23cf9a4da393831edf4f9183cad37", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["n_experiments"] = 10 + metadata_dict["samples_per_label"] = 16 + return metadata_dict diff --git a/mteb/tasks/Classification/en/ImdbClassification.py b/mteb/tasks/Classification/en/ImdbClassification.py index 6d76fd92b5..57e9c9be33 100644 --- a/mteb/tasks/Classification/en/ImdbClassification.py +++ b/mteb/tasks/Classification/en/ImdbClassification.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskClassification class ImdbClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="ImdbClassification", + description="Large Movie Review Dataset", + hf_hub_name="mteb/imdb", + reference="http://www.aclweb.org/anthology/P11-1015", + type="Classification", + category="p2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="accuracy", + revision="3d86128a09e091d6018b6d26cad27f2739fc2db7", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "ImdbClassification", - "hf_hub_name": "mteb/imdb", - "description": "Large Movie Review Dataset", - "reference": "http://www.aclweb.org/anthology/P11-1015", - "category": "p2p", - "type": "Classification", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "accuracy", - "revision": "3d86128a09e091d6018b6d26cad27f2739fc2db7", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Classification/en/ToxicConversationsClassification.py b/mteb/tasks/Classification/en/ToxicConversationsClassification.py index 3cb1d21e10..864c770495 100644 --- a/mteb/tasks/Classification/en/ToxicConversationsClassification.py +++ b/mteb/tasks/Classification/en/ToxicConversationsClassification.py @@ -1,25 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskClassification class ToxicConversationsClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="ToxicConversationsClassification", + description="Collection of comments from the Civil Comments platform together with annotations if the comment is toxic or not.", + reference="https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview", + hf_hub_name="mteb/toxic_conversations_50k", + 
type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="accuracy", + revision="d604517c81ca91fe16a244d1248fc021f9ecee7a", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "ToxicConversationsClassification", - "hf_hub_name": "mteb/toxic_conversations_50k", - "description": ( - "Collection of comments from the Civil Comments platform together with annotations if the comment is" - " toxic or not." - ), - "reference": ( - "https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview" - ), - "category": "s2s", - "type": "Classification", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "accuracy", - "n_experiments": 10, - "samples_per_label": 32, - "revision": "d7c0de2777da35d6aae2200a62c6e0e5af397c4c", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["n_experiments"] = 10 + metadata_dict["samples_per_label"] = 16 + return dict(self.metadata) diff --git a/mteb/tasks/Classification/en/TweetSentimentExtractionClassification.py b/mteb/tasks/Classification/en/TweetSentimentExtractionClassification.py index c14081a366..e8288dd0d6 100644 --- a/mteb/tasks/Classification/en/TweetSentimentExtractionClassification.py +++ b/mteb/tasks/Classification/en/TweetSentimentExtractionClassification.py @@ -1,20 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskClassification class TweetSentimentExtractionClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="TweetSentimentExtractionClassification", + description="", + reference="https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview", + hf_hub_name="mteb/tweet_sentiment_extraction", + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="accuracy", + revision="d604517c81ca91fe16a244d1248fc021f9ecee7a", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "TweetSentimentExtractionClassification", - "hf_hub_name": "mteb/tweet_sentiment_extraction", - "description": "", - "reference": "https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview", - "category": "s2s", - "type": "Classification", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "accuracy", - "n_experiments": 10, - "samples_per_label": 32, - "revision": "d604517c81ca91fe16a244d1248fc021f9ecee7a", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["n_experiments"] = 10 + metadata_dict["samples_per_label"] = 32 + return metadata_dict diff --git a/mteb/tasks/Classification/multilingual/AmazonCounterfactualClassification.py b/mteb/tasks/Classification/multilingual/AmazonCounterfactualClassification.py index 5d5aa094d4..5e6f1b8eab 100644 --- a/mteb/tasks/Classification/multilingual/AmazonCounterfactualClassification.py +++ b/mteb/tasks/Classification/multilingual/AmazonCounterfactualClassification.py @@ -1,24 +1,41 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import 
TaskMetadata + from ....abstasks import AbsTaskClassification, MultilingualTask _LANGUAGES = ["en", "de", "en-ext", "ja"] class AmazonCounterfactualClassification(MultilingualTask, AbsTaskClassification): + metadata = TaskMetadata( + name="AmazonCounterfactualClassification", + hf_hub_name="mteb/amazon_counterfactual", + description=( + "A collection of Amazon customer reviews annotated for counterfactual detection pair classification." + ), + reference="https://arxiv.org/abs/2104.06893", + category="s2s", + type="Classification", + eval_splits=["validation", "test"], + eval_langs=_LANGUAGES, + main_score="accuracy", + revision="e8379541af4e31359cca9fbcf4b00f2671dba205", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "AmazonCounterfactualClassification", - "hf_hub_name": "mteb/amazon_counterfactual", - "description": ( - "A collection of Amazon customer reviews annotated for counterfactual detection pair classification." - ), - "reference": "https://arxiv.org/abs/2104.06893", - "category": "s2s", - "type": "Classification", - "eval_splits": ["validation", "test"], - "eval_langs": _LANGUAGES, - "main_score": "accuracy", - "n_experiments": 10, - "samples_per_label": 32, - "revision": "e8379541af4e31359cca9fbcf4b00f2671dba205", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["n_experiments"] = 10 + metadata_dict["samples_per_label"] = 32 + return metadata_dict diff --git a/mteb/tasks/Classification/multilingual/AmazonReviewsClassification.py b/mteb/tasks/Classification/multilingual/AmazonReviewsClassification.py index 195c2623b0..59733396dd 100644 --- a/mteb/tasks/Classification/multilingual/AmazonReviewsClassification.py +++ b/mteb/tasks/Classification/multilingual/AmazonReviewsClassification.py @@ -1,23 +1,36 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskClassification, MultilingualTask _LANGUAGES = ["en", "de", "es", "fr", "ja", "zh"] class AmazonReviewsClassification(MultilingualTask, AbsTaskClassification): + metadata = TaskMetadata( + name="AmazonReviewsClassification", + hf_hub_name="mteb/amazon_reviews_multi", + description="A collection of Amazon reviews specifically designed to aid research in multilingual text classification.", + reference="https://arxiv.org/abs/2010.02573", + category="s2s", + type="Classification", + eval_splits=["validation", "test"], + eval_langs=_LANGUAGES, + main_score="accuracy", + revision="1399c76144fd37290681b995c656ef9b2e06e26d", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "AmazonReviewsClassification", - "hf_hub_name": "mteb/amazon_reviews_multi", - "description": ( - "A collection of Amazon reviews specifically designed to aid research in multilingual text" - " classification." 
- ), - "reference": "https://arxiv.org/abs/2010.02573", - "category": "s2s", - "type": "Classification", - "eval_splits": ["validation", "test"], - "eval_langs": _LANGUAGES, - "main_score": "accuracy", - "revision": "1399c76144fd37290681b995c656ef9b2e06e26d", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Classification/multilingual/MTOPDomainClassification.py b/mteb/tasks/Classification/multilingual/MTOPDomainClassification.py index 766258d7e5..a95d5e4f18 100644 --- a/mteb/tasks/Classification/multilingual/MTOPDomainClassification.py +++ b/mteb/tasks/Classification/multilingual/MTOPDomainClassification.py @@ -1,20 +1,36 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskClassification, MultilingualTask _LANGUAGES = ["en", "de", "es", "fr", "hi", "th"] class MTOPDomainClassification(MultilingualTask, AbsTaskClassification): + metadata = TaskMetadata( + name="MTOPDomainClassification", + hf_hub_name="mteb/mtop_domain", + description="MTOP: Multilingual Task-Oriented Semantic Parsing", + reference="https://arxiv.org/pdf/2008.09335.pdf", + category="s2s", + type="Classification", + eval_splits=["validation", "test"], + eval_langs=_LANGUAGES, + main_score="accuracy", + revision="d80d48c1eb48d3562165c59d59d0034df9fff0bf", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "MTOPDomainClassification", - "hf_hub_name": "mteb/mtop_domain", - "description": "MTOP: Multilingual Task-Oriented Semantic Parsing", - "reference": "https://arxiv.org/pdf/2008.09335.pdf", - "category": "s2s", - "type": "Classification", - "eval_splits": ["validation", "test"], - "eval_langs": _LANGUAGES, - "main_score": "accuracy", - "revision": "d80d48c1eb48d3562165c59d59d0034df9fff0bf", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Classification/multilingual/MTOPIntentClassification.py b/mteb/tasks/Classification/multilingual/MTOPIntentClassification.py index c416d1fe36..6a2c9ca589 100644 --- a/mteb/tasks/Classification/multilingual/MTOPIntentClassification.py +++ b/mteb/tasks/Classification/multilingual/MTOPIntentClassification.py @@ -1,20 +1,36 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskClassification, MultilingualTask _LANGUAGES = ["en", "de", "es", "fr", "hi", "th"] class MTOPIntentClassification(MultilingualTask, AbsTaskClassification): + metadata = TaskMetadata( + name="MTOPIntentClassification", + hf_hub_name="mteb/mtop_intent", + description="MTOP: Multilingual Task-Oriented Semantic Parsing", + reference="https://arxiv.org/pdf/2008.09335.pdf", + category="s2s", + type="Classification", + eval_splits=["validation", "test"], + eval_langs=_LANGUAGES, + main_score="accuracy", + revision="ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "MTOPIntentClassification", - "hf_hub_name": "mteb/mtop_intent", - "description": "MTOP: Multilingual Task-Oriented Semantic Parsing", - "reference": 
"https://arxiv.org/pdf/2008.09335.pdf", - "category": "s2s", - "type": "Classification", - "eval_splits": ["validation", "test"], - "eval_langs": _LANGUAGES, - "main_score": "accuracy", - "revision": "ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Classification/multilingual/MasakhaNEWSClassification.py b/mteb/tasks/Classification/multilingual/MasakhaNEWSClassification.py index 601857a1c9..a0105af9d3 100644 --- a/mteb/tasks/Classification/multilingual/MasakhaNEWSClassification.py +++ b/mteb/tasks/Classification/multilingual/MasakhaNEWSClassification.py @@ -1,3 +1,7 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskClassification, MultilingualTask _LANGUAGES = [ @@ -21,21 +25,29 @@ class MasakhaNEWSClassification(AbsTaskClassification, MultilingualTask): + metadata = TaskMetadata( + name="MasakhaNEWSClassification", + hf_hub_name="masakhane/masakhanews", + description="MasakhaNEWS is the largest publicly available dataset for news topic classification in 16 languages widely spoken in Africa. The train/validation/test sets are available for all the 16 languages.", + reference="https://arxiv.org/abs/2304.09972", + category="s2s", + type="Classification", + eval_splits=["test"], + eval_langs=_LANGUAGES, + main_score="accuracy", + revision="8ccc72e69e65f40c70e117d8b3c08306bb788b60", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "MasakhaNEWSClassification", - "hf_hub_name": "masakhane/masakhanews", - "description": ( - "MasakhaNEWS is the largest publicly available dataset for news topic classification in 16 languages widely spoken in Africa. The train/validation/test sets are available for all the 16 languages." 
- ), - "reference": "https://arxiv.org/abs/2304.09972", - "category": "s2s", - "type": "Classification", - "eval_splits": ["test"], - "eval_langs": _LANGUAGES, - "main_score": "accuracy", - "n_experiments": 10, - "samples_per_label": 16, - "revision": "8ccc72e69e65f40c70e117d8b3c08306bb788b60", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Classification/multilingual/MassiveIntentClassification.py b/mteb/tasks/Classification/multilingual/MassiveIntentClassification.py index b6df3a363e..7ebc7b6509 100644 --- a/mteb/tasks/Classification/multilingual/MassiveIntentClassification.py +++ b/mteb/tasks/Classification/multilingual/MassiveIntentClassification.py @@ -1,3 +1,7 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskClassification, MultilingualTask _LANGUAGES = [ @@ -56,20 +60,29 @@ class MassiveIntentClassification(MultilingualTask, AbsTaskClassification): + metadata = TaskMetadata( + name="MassiveIntentClassification", + hf_hub_name="mteb/amazon_massive_intent", + description="MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages", + reference="https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.", + category="s2s", + type="Classification", + eval_splits=["validation", "test"], + eval_langs=_LANGUAGES, + main_score="accuracy", + revision="31efe3c427b0bae9c22cbb560b8f15491cc6bed7", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "MassiveIntentClassification", - "hf_hub_name": "mteb/amazon_massive_intent", - "description": ( - "MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51" - " Typologically-Diverse Languages" - ), - "reference": "https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.", - "category": "s2s", - "type": "Classification", - "eval_splits": ["validation", "test"], - "eval_langs": _LANGUAGES, - "main_score": "accuracy", - "revision": "31efe3c427b0bae9c22cbb560b8f15491cc6bed7", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Classification/multilingual/MassiveScenarioClassification.py b/mteb/tasks/Classification/multilingual/MassiveScenarioClassification.py index 732ee82dab..8a8a4bab14 100644 --- a/mteb/tasks/Classification/multilingual/MassiveScenarioClassification.py +++ b/mteb/tasks/Classification/multilingual/MassiveScenarioClassification.py @@ -1,3 +1,7 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskClassification, MultilingualTask _LANGUAGES = [ @@ -56,20 +60,29 @@ class MassiveScenarioClassification(MultilingualTask, AbsTaskClassification): + metadata = TaskMetadata( + name="MassiveScenarioClassification", + hf_hub_name="mteb/amazon_massive_scenario", + description="MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages", + reference="https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.", + category="s2s", + 
type="Classification", + eval_splits=["validation", "test"], + eval_langs=_LANGUAGES, + main_score="accuracy", + revision="7d571f92784cd94a019292a1f45445077d0ef634", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "MassiveScenarioClassification", - "hf_hub_name": "mteb/amazon_massive_scenario", - "description": ( - "MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51" - " Typologically-Diverse Languages" - ), - "reference": "https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.", - "category": "s2s", - "type": "Classification", - "eval_splits": ["validation", "test"], - "eval_langs": _LANGUAGES, - "main_score": "accuracy", - "revision": "7d571f92784cd94a019292a1f45445077d0ef634", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Classification/multilingual/NordicLangClassification.py b/mteb/tasks/Classification/multilingual/NordicLangClassification.py index 067bc8433f..b0b8cdd200 100644 --- a/mteb/tasks/Classification/multilingual/NordicLangClassification.py +++ b/mteb/tasks/Classification/multilingual/NordicLangClassification.py @@ -1,25 +1,41 @@ +from __future__ import annotations + import datasets from mteb.abstasks import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class NordicLangClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="NordicLangClassification", + description="A dataset for Nordic language identification.", + reference="https://aclanthology.org/2021.vardial-1.8/", + hf_hub_name="strombergnlp/nordic_langid", + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["no", "nn"], + main_score="accuracy", + revision="e254179d18ab0165fdb6dbef91178266222bee2a", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "NordicLangClassification", - "hf_hub_name": "strombergnlp/nordic_langid", - "description": "A dataset for Nordic language identification.", - "reference": "https://aclanthology.org/2021.vardial-1.8/", - "type": "Classification", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["da", "sv", "nb", "nn", "is", "fo"], - "main_score": "accuracy", - "n_experiments": 10, - "samples_per_label": 32, - "revision": "e254179d18ab0165fdb6dbef91178266222bee2a", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["n_experiments"] = 10 + metadata_dict["samples_per_label"] = 32 + return metadata_dict def load_data(self, **kwargs): """ @@ -29,7 +45,9 @@ def load_data(self, **kwargs): return self.dataset = datasets.load_dataset( - self.description["hf_hub_name"], "10k", revision=self.description.get("revision") # select relevant subset + self.metadata_dict["hf_hub_name"], + "10k", + revision=self.metadata_dict.get("revision"), # select relevant subset ) self.dataset_transform() self.data_loaded = True diff --git a/mteb/tasks/Classification/multilingual/ScalaClassification.py b/mteb/tasks/Classification/multilingual/ScalaClassification.py index f74ab57004..80b817eaaa 100644 
--- a/mteb/tasks/Classification/multilingual/ScalaClassification.py
+++ b/mteb/tasks/Classification/multilingual/ScalaClassification.py
@@ -1,25 +1,41 @@
+from __future__ import annotations
+
 import datasets
 
 from mteb.abstasks import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class ScalaDaClassification(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="ScalaDaClassification",
+        description="A modified version of DDT modified for linguistic acceptability classification",
+        reference="https://aclanthology.org/2023.nodalida-1.20/",
+        hf_hub_name="ScandEval/scala-da",
+        type="Classification",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["da"],
+        main_score="accuracy",
+        revision="1de08520a7b361e92ffa2a2201ebd41942c54675",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "ScalaDaClassification",
-            "hf_hub_name": "ScandEval/scala-da",
-            "description": "A modified version of DDT modified for linguistic acceptability classification",
-            "reference": "https://aclanthology.org/2023.nodalida-1.20/",
-            "type": "Classification",
-            "category": "s2s",
-            "eval_splits": ["test"],
-            "eval_langs": ["da"],
-            "main_score": "accuracy",
-            "n_experiments": 10,
-            "samples_per_label": 16,
-            "revision": "1de08520a7b361e92ffa2a2201ebd41942c54675",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = dict(self.metadata)
+        metadata_dict["n_experiments"] = 10
+        metadata_dict["samples_per_label"] = 16
+        return metadata_dict
 
     def load_data(self, **kwargs):
         """
@@ -29,7 +45,8 @@ def load_data(self, **kwargs):
             return
 
         self.dataset = datasets.load_dataset(
-            self.description["hf_hub_name"], revision=self.description.get("revision", None)
+            self.metadata_dict["hf_hub_name"],
+            revision=self.metadata_dict.get("revision", None),
         )
         self.dataset_transform()
         self.data_loaded = True
@@ -38,26 +55,41 @@ def dataset_transform(self):
         # convert label to a 0/1 label
         labels = self.dataset["train"]["label"]  # type: ignore
         lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
-        self.dataset = self.dataset.map(lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"])
+        self.dataset = self.dataset.map(
+            lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
+        )
 
 
 class ScalaNbClassification(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="ScalaNbClassification",
+        description="A Norwegian dataset for linguistic acceptability classification for Bokmål",
+        reference="https://aclanthology.org/2023.nodalida-1.20/",
+        hf_hub_name="ScandEval/scala-nb",
+        type="Classification",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["nb"],
+        main_score="accuracy",
+        revision="237111a078ad5a834a55c57803d40bbe410ed03b",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "ScalaNbClassification",
-            "hf_hub_name": "ScandEval/scala-nb",
-            "description": "A Norwegian dataset for linguistic acceptability classification for Bokmål",
-            "reference": "https://aclanthology.org/2023.nodalida-1.20/",
-            "type": "Classification",
-            "category": "s2s",
-            "eval_splits": ["test"],
-            "eval_langs": ["no", "nb"],
-            "main_score": "accuracy",
-            "n_experiments": 10,
-            "samples_per_label": 16,
-            "revision": "237111a078ad5a834a55c57803d40bbe410ed03b",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = dict(self.metadata)
+        metadata_dict["n_experiments"] = 10
+        metadata_dict["samples_per_label"] = 16
+        return metadata_dict
 
     def load_data(self, **kwargs):
         """
@@ -67,7 +99,8 @@ def load_data(self, **kwargs):
             return
 
         self.dataset = datasets.load_dataset(
-            self.description["hf_hub_name"], revision=self.description.get("revision", None)
+            self.metadata_dict["hf_hub_name"],
+            revision=self.metadata_dict.get("revision", None),
         )
         self.dataset_transform()
         self.data_loaded = True
@@ -76,26 +109,41 @@ def dataset_transform(self):
         # convert label to a 0/1 label
         labels = self.dataset["train"]["label"]  # type: ignore
         lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
-        self.dataset = self.dataset.map(lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"])
+        self.dataset = self.dataset.map(
+            lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
+        )
 
 
 class ScalaNnClassification(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="ScalaNnClassification",
+        description="A Norwegian dataset for linguistic acceptability classification for Nynorsk",
+        reference="https://aclanthology.org/2023.nodalida-1.20/",
+        hf_hub_name="ScandEval/scala-nn",
+        type="Classification",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["nn"],
+        main_score="accuracy",
+        revision="9d9a2a4092ed3cacf0744592f6d2f32ab8ef4c0b",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "ScalaNnClassification",
-            "hf_hub_name": "ScandEval/scala-nn",
-            "description": "A Norwegian dataset for linguistic acceptability classification for Nynorsk",
-            "reference": "https://aclanthology.org/2023.nodalida-1.20/",
-            "type": "Classification",
-            "category": "s2s",
-            "eval_splits": ["test"],
-            "eval_langs": ["no", "nn"],
-            "main_score": "accuracy",
-            "n_experiments": 10,
-            "samples_per_label": 16,
-            "revision": "9d9a2a4092ed3cacf0744592f6d2f32ab8ef4c0b",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = dict(self.metadata)
+        metadata_dict["n_experiments"] = 10
+        metadata_dict["samples_per_label"] = 16
+        return metadata_dict
 
     def load_data(self, **kwargs):
         """
@@ -105,7 +153,8 @@ def load_data(self, **kwargs):
             return
 
         self.dataset = datasets.load_dataset(
-            self.description["hf_hub_name"], revision=self.description.get("revision", None)
+            self.metadata_dict["hf_hub_name"],
+            revision=self.metadata_dict.get("revision", None),
         )
         self.dataset_transform()
         self.data_loaded = True
@@ -114,26 +163,41 @@ def dataset_transform(self):
         # convert label to a 0/1 label
         labels = self.dataset["train"]["label"]  # type: ignore
         lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
-        self.dataset = self.dataset.map(lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"])
+        self.dataset = self.dataset.map(
+            lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
+        )
 
 
 class ScalaSvClassification(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="ScalaSvClassification",
+        description="A Swedish dataset for linguistic acceptability classification",
+        reference="https://aclanthology.org/2023.nodalida-1.20/",
+        hf_hub_name="ScandEval/scala-sv",
+        type="Classification",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["sv"],
+        main_score="accuracy",
+        revision="1b48e3dcb02872335ff985ff938a054a4ed99008",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "ScalaSvClassification",
-            "hf_hub_name": "ScandEval/scala-sv",
-            "description": "A Swedish dataset for linguistic acceptability classification",
-            "reference": "https://aclanthology.org/2023.nodalida-1.20/",
-            "type": "Classification",
-            "category": "s2s",
-            "eval_splits": ["test"],
-            "eval_langs": ["sv"],
-            "main_score": "accuracy",
-            "n_experiments": 10,
-            "samples_per_label": 16,
-            "revision": "1b48e3dcb02872335ff985ff938a054a4ed99008",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = dict(self.metadata)
+        metadata_dict["n_experiments"] = 10
+        metadata_dict["samples_per_label"] = 16
+        return metadata_dict
 
     def load_data(self, **kwargs):
         """
@@ -143,7 +207,8 @@ def load_data(self, **kwargs):
             return
 
         self.dataset = datasets.load_dataset(
-            self.description["hf_hub_name"], revision=self.description.get("revision", None)
+            self.metadata_dict["hf_hub_name"],
+            revision=self.metadata_dict.get("revision", None),
         )
         self.dataset_transform()
         self.data_loaded = True
@@ -152,4 +217,6 @@ def dataset_transform(self):
         # convert label to a 0/1 label
         labels = self.dataset["train"]["label"]  # type: ignore
         lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
-        self.dataset = self.dataset.map(lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"])
+        self.dataset = self.dataset.map(
+            lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
+        )
diff --git a/mteb/tasks/Classification/nb/NoRecClassification.py b/mteb/tasks/Classification/nb/NoRecClassification.py
index 6bacaaf276..5281329604 100644
--- a/mteb/tasks/Classification/nb/NoRecClassification.py
+++ b/mteb/tasks/Classification/nb/NoRecClassification.py
@@ -1,20 +1,33 @@
+from __future__ import annotations
+
 from mteb.abstasks import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class NoRecClassification(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="NoRecClassification",
+        description="A Norwegian dataset for sentiment classification on review",
+        reference="https://aclanthology.org/L18-1661/",
+        hf_hub_name="ScandEval/norec-mini",  # using the mini version to keep results ~comparable to the ScandEval benchmark
+        type="Classification",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["nb"],
+        main_score="accuracy",
+        revision="07b99ab3363c2e7f8f87015b01c21f4d9b917ce3",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "NoRecClassification",
-            "hf_hub_name": "ScandEval/norec-mini",  # Using the mini version to keep results ~comparable to the ScandEval benchmark
-            "description": "A Norwegian dataset for sentiment classification on review",
-            "reference": "https://aclanthology.org/L18-1661/",
-            "type": "Classification",
-            "category": "s2s",
-            "eval_splits": ["test"],
-            "eval_langs": ["nb"],
-            "main_score": "accuracy",
-            "n_experiments": 10,
-            "samples_per_label": 16,
-            "revision": "07b99ab3363c2e7f8f87015b01c21f4d9b917ce3",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        
return dict(self.metadata) diff --git a/mteb/tasks/Classification/nb/NorwegianParliamentClassification.py b/mteb/tasks/Classification/nb/NorwegianParliamentClassification.py index 0f7d706ee4..b181cc7d36 100644 --- a/mteb/tasks/Classification/nb/NorwegianParliamentClassification.py +++ b/mteb/tasks/Classification/nb/NorwegianParliamentClassification.py @@ -1,20 +1,33 @@ +from __future__ import annotations + from mteb.abstasks import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class NorwegianParliamentClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="NorwegianParliamentClassification", + description="Norwegian parliament speeches annotated for sentiment", + reference="https://huggingface.co/datasets/NbAiLab/norwegian_parliament", + hf_hub_name="NbAiLab/norwegian_parliament", + type="Classification", + category="s2s", + eval_splits=["test", "validation"], + eval_langs=["nb"], # assumed to be bokmål + main_score="accuracy", + revision="f7393532774c66312378d30b197610b43d751972", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "NorwegianParliament", - "hf_hub_name": "NbAiLab/norwegian_parliament", - "description": "Norwegian parliament speeches annotated for sentiment", - "reference": "https://huggingface.co/datasets/NbAiLab/norwegian_parliament", - "type": "Classification", - "category": "s2s", - "eval_splits": ["test", "validation"], - "eval_langs": ["nb"], # assumed to be bokmål - "main_score": "accuracy", - "n_experiments": 10, - "samples_per_label": 16, - "revision": "f7393532774c66312378d30b197610b43d751972", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Classification/pl/PolishClassification.py b/mteb/tasks/Classification/pl/PolishClassification.py index a8f8c4b7b8..c8fb4eb158 100644 --- a/mteb/tasks/Classification/pl/PolishClassification.py +++ b/mteb/tasks/Classification/pl/PolishClassification.py @@ -1,85 +1,153 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskClassification class CbdClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="CBD", + description="Polish Tweets annotated for cyberbullying detection.", + reference="http://2019.poleval.pl/files/poleval2019.pdf", + hf_hub_name="PL-MTEB/cbd", + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["pl"], + main_score="accuracy", + revision="59d12749a3c91a186063c7d729ec392fda94681c", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "CBD", - "hf_hub_name": "PL-MTEB/cbd", - "description": "Polish Tweets annotated for cyberbullying detection.", - "reference": "http://2019.poleval.pl/files/poleval2019.pdf", - "category": "s2s", - "type": "Classification", - "eval_splits": ["test"], - "eval_langs": ["pl"], - "main_score": "accuracy" - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) class PolEmo2InClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="PolEmo2.0-IN", + description="A collection of Polish online reviews from four domains: 
medicine, hotels, products and " + "school. The PolEmo2.0-IN task is to predict the sentiment of in-domain (medicine and hotels) reviews.", + reference="https://aclanthology.org/K19-1092.pdf", + hf_hub_name="PL-MTEB/polemo2_in", + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["pl"], + main_score="accuracy", + revision="9e9b1f8ef51616073f47f306f7f47dd91663f86a", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "PolEmo2.0-IN", - "hf_hub_name": "PL-MTEB/polemo2_in", - "description": "A collection of Polish online reviews from four domains: medicine, hotels, products and " - "school. The PolEmo2.0-IN task is to predict the sentiment of in-domain (medicine and hotels) " - "reviews.", - "reference": "https://aclanthology.org/K19-1092.pdf", - "category": "s2s", - "type": "Classification", - "eval_splits": ["test"], - "eval_langs": ["pl"], - "main_score": "accuracy" - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) class PolEmo2OutClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="PolEmo2.0-OUT", + description="A collection of Polish online reviews from four domains: medicine, hotels, products and " + "school. The PolEmo2.0-OUT task is to predict the sentiment of out-of-domain (products and " + "school) reviews using models train on reviews from medicine and hotels domains.", + reference="https://aclanthology.org/K19-1092.pdf", + hf_hub_name="PL-MTEB/polemo2_out", + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["pl"], + main_score="accuracy", + revision="c99d599f0a6ab9b85b065da6f9d94f9cf731679f", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "PolEmo2.0-OUT", - "hf_hub_name": "PL-MTEB/polemo2_out", - "description": "A collection of Polish online reviews from four domains: medicine, hotels, products and " - "school. 
The PolEmo2.0-OUT task is to predict the sentiment of out-of-domain (products and "
-                "school) reviews using models train on reviews from medicine and hotels domains.",
-            "reference": "https://aclanthology.org/K19-1092.pdf",
-            "category": "s2s",
-            "type": "Classification",
-            "eval_splits": ["test"],
-            "eval_langs": ["pl"],
-            "main_score": "accuracy"
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
 
 
 class AllegroReviewsClassification(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="AllegroReviews",
+        description="A Polish dataset for sentiment classification on reviews from e-commerce marketplace Allegro.",
+        reference="https://aclanthology.org/2020.acl-main.111.pdf",
+        hf_hub_name="PL-MTEB/allegro-reviews",
+        type="Classification",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["pl"],
+        main_score="accuracy",
+        revision="477b8bd4448b5ef8ed01ba82bf9ff67f6e109207",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "AllegroReviews",
-            "hf_hub_name": "PL-MTEB/allegro-reviews",
-            "description": "A Polish dataset for sentiment classification on reviews from e-commerce marketplace Allegro.",
-            "reference": "https://aclanthology.org/2020.acl-main.111.pdf",
-            "category": "s2s",
-            "type": "Classification",
-            "eval_splits": ["test"],
-            "eval_langs": ["pl"],
-            "main_score": "accuracy"
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
 
 
 class PacClassification(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="PAC",
+        description="Polish Abusive Clauses Dataset",
+        reference="https://arxiv.org/pdf/2211.13112.pdf",
+        hf_hub_name="laugustyniak/abusive-clauses-pl",
+        type="Classification",
+        category="p2p",
+        eval_splits=["test"],
+        eval_langs=["pl"],
+        main_score="accuracy",
+        revision="8a04d940a42cd40658986fdd8e3da561533a3646",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "PAC",
-            "hf_hub_name": "laugustyniak/abusive-clauses-pl",
-            "description": "Polish Abusive Clauses Dataset",
-            "reference": "https://arxiv.org/pdf/2211.13112.pdf",
-            "category": "s2s",
-            "type": "Classification",
-            "eval_splits": ["test"],
-            "eval_langs": ["pl"],
-            "main_score": "accuracy"
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Classification/sv/SweRecClassification.py b/mteb/tasks/Classification/sv/SweRecClassification.py
index 1bd57012c6..ab3bef79d7 100644
--- a/mteb/tasks/Classification/sv/SweRecClassification.py
+++ b/mteb/tasks/Classification/sv/SweRecClassification.py
@@ -1,20 +1,33 @@
+from __future__ import annotations
+
 from mteb.abstasks import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class SweRecClassification(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="SweRecClassification",
+        description="A Swedish dataset for sentiment classification on review",
+        reference="https://aclanthology.org/2023.nodalida-1.20/",
+        hf_hub_name="ScandEval/swerec-mini",  # using the mini version to keep results ~comparable to ScandEval
+        type="Classification",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["sv"],
+        main_score="accuracy",
+        revision="3c62f26bafdc4c4e1c16401ad4b32f0a94b46612",
+        
date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "SweRecClassification", - "hf_hub_name": "ScandEval/swerec-mini", # using the mini version to keep results ~comparable to ScandEval - "description": "A Swedish dataset for sentiment classification on review", - "reference": "https://aclanthology.org/2023.nodalida-1.20/", - "type": "Classification", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["sv"], - "main_score": "accuracy", - "n_experiments": 10, - "samples_per_label": 16, - "revision": "3c62f26bafdc4c4e1c16401ad4b32f0a94b46612", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Classification/zh/CMTEBClassification.py b/mteb/tasks/Classification/zh/CMTEBClassification.py index a45e645695..9fa0f20081 100644 --- a/mteb/tasks/Classification/zh/CMTEBClassification.py +++ b/mteb/tasks/Classification/zh/CMTEBClassification.py @@ -1,107 +1,193 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskClassification + class TNews(AbsTaskClassification): + metadata = TaskMetadata( + name="TNews", + description="Short Text Classification for News", + reference="https://www.cluebenchmarks.com/introduce.html", + hf_hub_name="C-MTEB/TNews-classification", + type="Classification", + category="s2s", + eval_splits=["validation", "test"], + eval_langs=["zh"], + main_score="accuracy", + revision="317f262bf1e6126357bbe89e875451e4b0938fe4", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - 'name': 'TNews', - 'hf_hub_name': 'C-MTEB/TNews-classification', - 'description': 'Short Text Classification for News', - "reference": "https://www.cluebenchmarks.com/introduce.html", - 'type': 'Classification', - 'category': 's2s', - 'eval_splits': ['validation'], - 'eval_langs': ['zh'], - 'main_score': 'accuracy', - 'samples_per_label': 32, - 'revision': '317f262bf1e6126357bbe89e875451e4b0938fe4', - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["samples_per_label"] = 32 + return metadata_dict class IFlyTek(AbsTaskClassification): + metadata = TaskMetadata( + name="IFlyTek", + description="Long Text classification for the description of Apps", + reference="https://www.cluebenchmarks.com/introduce.html", + hf_hub_name="C-MTEB/IFlyTek-classification", + type="Classification", + category="s2s", + eval_splits=["validation", "test"], + eval_langs=["zh"], + main_score="accuracy", + revision="421605374b29664c5fc098418fe20ada9bd55f8a", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - 'name': 'IFlyTek', - 'hf_hub_name': 'C-MTEB/IFlyTek-classification', - 'description': 'Long Text classification for the description of Apps', - "reference": "https://www.cluebenchmarks.com/introduce.html", - 'type': 'Classification', - 'category': 's2s', - 'eval_splits': ['validation'], - 'eval_langs': ['zh'], - 'main_score': 'accuracy', - 
'samples_per_label': 32,
-            'n_experiments': 5,
-            'revision': '421605374b29664c5fc098418fe20ada9bd55f8a',
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = dict(self.metadata)
+        metadata_dict["samples_per_label"] = 32
+        metadata_dict["n_experiments"] = 5
+        return metadata_dict
 
 
 class MultilingualSentiment(AbsTaskClassification):
-    @property
-    def description(self):
-        return {
-            'name': 'MultilingualSentiment',
-            'hf_hub_name': 'C-MTEB/MultilingualSentiment-classification',
-            'description': 'A collection of multilingual sentiments datasets grouped into 3 classes -- positive, neutral, negative',
-            "reference": "https://github.com/tyqiangz/multilingual-sentiment-datasets",
-            'category': 's2s',
-            'type': 'Classification',
-            'eval_splits': ['validation'],
-            'eval_langs': ['zh'],
-            'main_score': 'accuracy',
-            'samples_per_label': 32,
-            'revision': '46958b007a63fdbf239b7672c25d0bea67b5ea1a',
-        }
+    metadata = TaskMetadata(
+        name="MultilingualSentiment",
+        description="A collection of multilingual sentiments datasets grouped into 3 classes -- positive, neutral, negative",
+        reference="https://github.com/tyqiangz/multilingual-sentiment-datasets",
+        hf_hub_name="C-MTEB/MultilingualSentiment-classification",
+        type="Classification",
+        category="s2s",
+        eval_splits=["validation", "test"],
+        eval_langs=["zh"],
+        main_score="accuracy",
+        revision="46958b007a63fdbf239b7672c25d0bea67b5ea1a",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
 
+    @property
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = dict(self.metadata)
+        metadata_dict["samples_per_label"] = 32
+        return metadata_dict
 
 
 class JDReview(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="JDReview",
+        description="review for iphone",
+        reference="https://huggingface.co/datasets/C-MTEB/JDReview-classification",
+        hf_hub_name="C-MTEB/JDReview-classification",
+        type="Classification",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["zh"],
+        main_score="accuracy",
+        revision="b7c64bd89eb87f8ded463478346f76731f07bf8b",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            'name': 'JDReview',
-            'hf_hub_name': 'C-MTEB/JDReview-classification',
-            'description': 'review for iphone',
-            'category': 's2s',
-            'type': 'Classification',
-            'eval_splits': ['test'],
-            'eval_langs': ['zh'],
-            'main_score': 'accuracy',
-            'samples_per_label': 32,
-            'revision': 'b7c64bd89eb87f8ded463478346f76731f07bf8b',
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = dict(self.metadata)
+        metadata_dict["samples_per_label"] = 32
+        return metadata_dict
 
 
 class OnlineShopping(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="OnlineShopping",
+        description="Sentiment Analysis of User Reviews on Online Shopping Websites",
+        reference="https://huggingface.co/datasets/C-MTEB/OnlineShopping-classification",
+        hf_hub_name="C-MTEB/OnlineShopping-classification",
+        type="Classification",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["zh"],
+        main_score="accuracy",
+        revision="e610f2ebd179a8fda30ae534c3878750a96db120",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        
bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            'name': 'OnlineShopping',
-            'hf_hub_name': 'C-MTEB/OnlineShopping-classification',
-            'description': 'Sentiment Analysis of User Reviews on Online Shopping Websites',
-            'category': 's2s',
-            'type': 'Classification',
-            'eval_splits': ['test'],
-            'eval_langs': ['zh'],
-            'main_score': 'accuracy',
-            'samples_per_label': 32,
-            'revision': 'e610f2ebd179a8fda30ae534c3878750a96db120',
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = dict(self.metadata)
+        metadata_dict["samples_per_label"] = 32
+        return metadata_dict
 
 
 class Waimai(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="Waimai",
+        description="Sentiment Analysis of user reviews on takeaway platforms",
+        reference="https://huggingface.co/datasets/C-MTEB/waimai-classification",
+        hf_hub_name="C-MTEB/waimai-classification",
+        type="Classification",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["zh"],
+        main_score="accuracy",
+        revision="339287def212450dcaa9df8c22bf93e9980c7023",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            'name': 'Waimai',
-            'hf_hub_name': 'C-MTEB/waimai-classification',
-            'description': 'Sentiment Analysis of user reviews on takeaway platforms',
-            'category': 's2s',
-            'type': 'Classification',
-            'eval_splits': ['test'],
-            'eval_langs': ['zh'],
-            'main_score': 'accuracy',
-            'samples_per_label': 32,
-            'revision': '339287def212450dcaa9df8c22bf93e9980c7023',
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = dict(self.metadata)
+        metadata_dict["samples_per_label"] = 32
+        return metadata_dict
diff --git a/mteb/tasks/Clustering/__init__.py b/mteb/tasks/Clustering/__init__.py
index cad03d5c2b..1a5154936f 100644
--- a/mteb/tasks/Clustering/__init__.py
+++ b/mteb/tasks/Clustering/__init__.py
@@ -1,28 +1,30 @@
-from .fr.AlloProfClusteringP2P import *
-from .fr.AlloProfClusteringS2S import *
+from __future__ import annotations
+
+from .de.BlurbsClusteringP2P import *
+from .de.BlurbsClusteringS2S import *
+from .de.TenKGnadClusteringP2P import *
+from .de.TenKGnadClusteringS2S import *
 from .en.ArxivClusteringP2P import *
 from .en.ArxivClusteringS2S import *
 from .en.BigPatentClustering import *
 from .en.BiorxivClusteringP2P import *
 from .en.BiorxivClusteringS2S import *
-from .de.BlurbsClusteringP2P import *
-from .de.BlurbsClusteringS2S import *
-from .zh.CMTEBClustering import *
-from .fr.HALClusteringS2S import *
-from .multilingual.MasakhaNEWSClusteringP2P import *
-from .multilingual.MasakhaNEWSClusteringS2S import *
 from .en.MedrxivClusteringP2P import *
 from .en.MedrxivClusteringS2S import *
-from .fr.MLSUMClusteringP2P import *
-from .fr.MLSUMClusteringS2S import *
-from .pl.PolishClustering import *
 from .en.RedditClustering import *
 from .en.RedditClusteringP2P import *
 from .en.StackExchangeClustering import *
 from .en.StackExchangeClusteringP2P import *
-from .de.TenKGnadClusteringP2P import *
-from .de.TenKGnadClusteringS2S import *
 from .en.TwentyNewsgroupsClustering import *
 from .en.WikiCitiesClustering import *
 from .es.FloresClusteringS2S import *
 from .es.SpanishNewsClusteringP2P import *
+from .fr.AlloProfClusteringP2P import *
+from .fr.AlloProfClusteringS2S import *
+from .fr.HALClusteringS2S import *
+from .fr.MLSUMClusteringP2P import *
+from .fr.MLSUMClusteringS2S import *
+from 
.multilingual.MasakhaNEWSClusteringP2P import *
+from .multilingual.MasakhaNEWSClusteringS2S import *
+from .pl.PolishClustering import *
+from .zh.CMTEBClustering import *
diff --git a/mteb/tasks/Clustering/de/BlurbsClusteringP2P.py b/mteb/tasks/Clustering/de/BlurbsClusteringP2P.py
index e92f702fa2..a5e6dad822 100644
--- a/mteb/tasks/Clustering/de/BlurbsClusteringP2P.py
+++ b/mteb/tasks/Clustering/de/BlurbsClusteringP2P.py
@@ -1,18 +1,33 @@
+from __future__ import annotations
+
 from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class BlurbsClusteringP2P(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="BlurbsClusteringP2P",
+        description="Clustering of book titles+blurbs. Clustering of 28 sets, either on the main or secondary genre.",
+        reference="https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html",
+        hf_hub_name="slvnwhrl/blurbs-clustering-p2p",
+        type="Clustering",
+        category="p2p",
+        eval_splits=["test"],
+        eval_langs=["de"],
+        main_score="v_measure",
+        revision="a2dd5b02a77de3466a3eaa98ae586b5610314496",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "BlurbsClusteringP2P",
-            "hf_hub_name": "slvnwhrl/blurbs-clustering-p2p",
-            "description": "Clustering of book titles+blurbs. Clustering of 28 sets, either on the main or secondary genre.",
-            "reference": "https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html",
-            "type": "Clustering",
-            "category": "p2p",
-            "eval_splits": ["test"],
-            "eval_langs": ["de"],
-            "main_score": "v_measure",
-            "revision": "a2dd5b02a77de3466a3eaa98ae586b5610314496",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Clustering/de/BlurbsClusteringS2S.py b/mteb/tasks/Clustering/de/BlurbsClusteringS2S.py
index daa0d33dc8..93656a8596 100644
--- a/mteb/tasks/Clustering/de/BlurbsClusteringS2S.py
+++ b/mteb/tasks/Clustering/de/BlurbsClusteringS2S.py
@@ -1,18 +1,34 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskClustering import AbsTaskClustering
 
 
 class BlurbsClusteringS2S(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="BlurbsClusteringS2S",
+        description="Clustering of book titles. Clustering of 28 sets, either on the main or secondary genre.",
+        reference="https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html",
+        hf_hub_name="slvnwhrl/blurbs-clustering-s2s",
+        type="Clustering",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["de"],
+        main_score="v_measure",
+        revision="9bfff9a7f8f6dc6ffc9da71c48dd48b68696471d",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "BlurbsClusteringS2S",
-            "hf_hub_name": "slvnwhrl/blurbs-clustering-s2s",
-            "description": "Clustering of book titles. 
Clustering of 28 sets, either on the main or secondary genre.", - "reference": "https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html", - "type": "Clustering", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["de"], - "main_score": "v_measure", - "revision": "9bfff9a7f8f6dc6ffc9da71c48dd48b68696471d", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Clustering/de/TenKGnadClusteringP2P.py b/mteb/tasks/Clustering/de/TenKGnadClusteringP2P.py index 18e940899b..8baf4815b3 100644 --- a/mteb/tasks/Clustering/de/TenKGnadClusteringP2P.py +++ b/mteb/tasks/Clustering/de/TenKGnadClusteringP2P.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskClustering import AbsTaskClustering class TenKGnadClusteringP2P(AbsTaskClustering): + metadata = TaskMetadata( + name="TenKGnadClusteringP2P", + description="Clustering of news article titles+subheadings+texts. Clustering of 10 splits on the news article category.", + reference="https://tblock.github.io/10kGNAD/", + hf_hub_name="slvnwhrl/tenkgnad-clustering-p2p", + type="Clustering", + category="p2p", + eval_splits=["test"], + eval_langs=["de"], + main_score="v_measure", + revision="5c59e41555244b7e45c9a6be2d720ab4bafae558", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "TenKGnadClusteringP2P", - "hf_hub_name": "slvnwhrl/tenkgnad-clustering-p2p", - "description": "Clustering of news article titles+subheadings+texts. Clustering of 10 splits on the news article category.", - "reference": "https://tblock.github.io/10kGNAD/", - "type": "Clustering", - "category": "p2p", - "eval_splits": ["test"], - "eval_langs": ["de"], - "main_score": "v_measure", - "revision": "5c59e41555244b7e45c9a6be2d720ab4bafae558", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Clustering/de/TenKGnadClusteringS2S.py b/mteb/tasks/Clustering/de/TenKGnadClusteringS2S.py index c86f2a0ce1..65e88237b8 100644 --- a/mteb/tasks/Clustering/de/TenKGnadClusteringS2S.py +++ b/mteb/tasks/Clustering/de/TenKGnadClusteringS2S.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskClustering import AbsTaskClustering class TenKGnadClusteringS2S(AbsTaskClustering): + metadata = TaskMetadata( + name="TenKGnadClusteringS2S", + description="Clustering of news article titles. Clustering of 10 splits on the news article category.", + reference="https://tblock.github.io/10kGNAD/", + hf_hub_name="slvnwhrl/tenkgnad-clustering-s2s", + type="Clustering", + category="s2s", + eval_splits=["test"], + eval_langs=["de"], + main_score="v_measure", + revision="6cddbe003f12b9b140aec477b583ac4191f01786", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "TenKGnadClusteringS2S", - "hf_hub_name": "slvnwhrl/tenkgnad-clustering-s2s", - "description": "Clustering of news article titles. 
Clustering of 10 splits on the news article category.", - "reference": "https://tblock.github.io/10kGNAD/", - "type": "Clustering", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["de"], - "main_score": "v_measure", - "revision": "6cddbe003f12b9b140aec477b583ac4191f01786", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Clustering/en/ArxivClusteringP2P.py b/mteb/tasks/Clustering/en/ArxivClusteringP2P.py index fd4edb309f..2323974eb4 100644 --- a/mteb/tasks/Clustering/en/ArxivClusteringP2P.py +++ b/mteb/tasks/Clustering/en/ArxivClusteringP2P.py @@ -1,21 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskClustering import AbsTaskClustering class ArxivClusteringP2P(AbsTaskClustering): + metadata = TaskMetadata( + name="ArxivClusteringP2P", + description="Clustering of titles+abstract from arxiv. Clustering of 30 sets, either on the main or secondary category", + reference="https://www.kaggle.com/Cornell-University/arxiv", + hf_hub_name="mteb/arxiv-clustering-p2p", + type="Clustering", + category="p2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="v_measure", + revision="a122ad7f3f0291bf49cc6f4d32aa80929df69d5d", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "ArxivClusteringP2P", - "hf_hub_name": "mteb/arxiv-clustering-p2p", - "description": ( - "Clustering of titles+abstract from arxiv. Clustering of 30 sets, either on the main or secondary" - " category" - ), - "reference": "https://www.kaggle.com/Cornell-University/arxiv", - "type": "Clustering", - "category": "p2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "v_measure", - "revision": "a122ad7f3f0291bf49cc6f4d32aa80929df69d5d", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Clustering/en/ArxivClusteringS2S.py b/mteb/tasks/Clustering/en/ArxivClusteringS2S.py index 22d2aaf80e..dd813577de 100644 --- a/mteb/tasks/Clustering/en/ArxivClusteringS2S.py +++ b/mteb/tasks/Clustering/en/ArxivClusteringS2S.py @@ -1,20 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskClustering import AbsTaskClustering class ArxivClusteringS2S(AbsTaskClustering): + metadata = TaskMetadata( + name="ArxivClusteringS2S", + description="Clustering of titles from arxiv. Clustering of 30 sets, either on the main or secondary category", + reference="https://www.kaggle.com/Cornell-University/arxiv", + hf_hub_name="mteb/arxiv-clustering-s2s", + type="Clustering", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="v_measure", + revision="f910caf1a6075f7329cdf8c1a6135696f37dbd53", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "ArxivClusteringS2S", - "hf_hub_name": "mteb/arxiv-clustering-s2s", - "description": ( - "Clustering of titles from arxiv. 
Clustering of 30 sets, either on the main or secondary category"
-            ),
-            "reference": "https://www.kaggle.com/Cornell-University/arxiv",
-            "type": "Clustering",
-            "category": "s2s",
-            "eval_splits": ["test"],
-            "eval_langs": ["en"],
-            "main_score": "v_measure",
-            "revision": "f910caf1a6075f7329cdf8c1a6135696f37dbd53",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Clustering/en/BigPatentClustering.py b/mteb/tasks/Clustering/en/BigPatentClustering.py
index 442c744f97..d10c90339c 100644
--- a/mteb/tasks/Clustering/en/BigPatentClustering.py
+++ b/mteb/tasks/Clustering/en/BigPatentClustering.py
@@ -1,21 +1,34 @@
+from __future__ import annotations
+
 from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class BigPatentClustering(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="BigPatentClustering",
+        description="Clustering of documents from the Big Patent dataset. Test set only includes documents "
+        "belonging to a single category, with a total of 9 categories.",
+        reference="https://huggingface.co/datasets/big_patent",
+        hf_hub_name="jinaai/big-patent-clustering",
+        type="Clustering",
+        category="p2p",
+        eval_splits=["test"],
+        eval_langs=["en"],
+        main_score="v_measure",
+        revision="62d5330920bca426ce9d3c76ea914f15fc83e891",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "BigPatentClustering",
-            "hf_hub_name": "jinaai/big-patent-clustering",
-            "description": (
-                "Clustering of documents from the Big Patent dataset. Test set only includes documents"
-                "belonging to a single category, with a total of 9 categories."
-            ),
-            "reference": "https://huggingface.co/datasets/big_patent",
-            "type": "Clustering",
-            "category": "p2p",
-            "eval_splits": ["test"],
-            "eval_langs": ["en"],
-            "main_score": "v_measure",
-            "revision": "62d5330920bca426ce9d3c76ea914f15fc83e891",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Clustering/en/BiorxivClusteringP2P.py b/mteb/tasks/Clustering/en/BiorxivClusteringP2P.py
index 949f2821d9..9ba39ce85a 100644
--- a/mteb/tasks/Clustering/en/BiorxivClusteringP2P.py
+++ b/mteb/tasks/Clustering/en/BiorxivClusteringP2P.py
@@ -1,20 +1,33 @@
+from __future__ import annotations
+
 from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class BiorxivClusteringP2P(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="BiorxivClusteringP2P",
+        description="Clustering of titles+abstract from biorxiv. Clustering of 10 sets, based on the main category.",
+        reference="https://api.biorxiv.org/",
+        hf_hub_name="mteb/biorxiv-clustering-p2p",
+        type="Clustering",
+        category="p2p",
+        eval_splits=["test"],
+        eval_langs=["en"],
+        main_score="v_measure",
+        revision="65b79d1d13f80053f67aca9498d9402c2d9f1f40",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "BiorxivClusteringP2P",
-            "hf_hub_name": "mteb/biorxiv-clustering-p2p",
-            "description": (
-                "Clustering of titles+abstract from biorxiv. Clustering of 10 sets, based on the main category."
- ), - "reference": "https://api.biorxiv.org/", - "type": "Clustering", - "category": "p2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "v_measure", - "revision": "65b79d1d13f80053f67aca9498d9402c2d9f1f40", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Clustering/en/BiorxivClusteringS2S.py b/mteb/tasks/Clustering/en/BiorxivClusteringS2S.py index c0a722054a..12a66e8553 100644 --- a/mteb/tasks/Clustering/en/BiorxivClusteringS2S.py +++ b/mteb/tasks/Clustering/en/BiorxivClusteringS2S.py @@ -1,18 +1,33 @@ +from __future__ import annotations + from mteb.abstasks.AbsTaskClustering import AbsTaskClustering +from mteb.abstasks.TaskMetadata import TaskMetadata class BiorxivClusteringS2S(AbsTaskClustering): + metadata = TaskMetadata( + name="BiorxivClusteringS2S", + description="Clustering of titles from biorxiv. Clustering of 10 sets, based on the main category.", + reference="https://api.biorxiv.org/", + hf_hub_name="mteb/biorxiv-clustering-s2s", + type="Clustering", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="v_measure", + revision="258694dd0231531bc1fd9de6ceb52a0853c6d908", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "BiorxivClusteringS2S", - "hf_hub_name": "mteb/biorxiv-clustering-s2s", - "description": "Clustering of titles from biorxiv. Clustering of 10 sets, based on the main category.", - "reference": "https://api.biorxiv.org/", - "type": "Clustering", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "v_measure", - "revision": "258694dd0231531bc1fd9de6ceb52a0853c6d908", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Clustering/en/MedrxivClusteringP2P.py b/mteb/tasks/Clustering/en/MedrxivClusteringP2P.py index 70e1192c96..9a252d7c74 100644 --- a/mteb/tasks/Clustering/en/MedrxivClusteringP2P.py +++ b/mteb/tasks/Clustering/en/MedrxivClusteringP2P.py @@ -1,20 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskClustering import AbsTaskClustering class MedrxivClusteringP2P(AbsTaskClustering): + metadata = TaskMetadata( + name="MedrxivClusteringP2P", + description="Clustering of titles+abstract from medrxiv. Clustering of 10 sets, based on the main category.", + reference="https://api.medrxiv.org/", + hf_hub_name="mteb/medrxiv-clustering-p2p", + type="Clustering", + category="p2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="v_measure", + revision="e7a26af6f3ae46b30dde8737f02c07b1505bcc73", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "MedrxivClusteringP2P", - "hf_hub_name": "mteb/medrxiv-clustering-p2p", - "description": ( - "Clustering of titles+abstract from medrxiv. Clustering of 10 sets, based on the main category." 
- ), - "reference": "https://api.biorxiv.org/", - "type": "Clustering", - "category": "p2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "v_measure", - "revision": "e7a26af6f3ae46b30dde8737f02c07b1505bcc73", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Clustering/en/MedrxivClusteringS2S.py b/mteb/tasks/Clustering/en/MedrxivClusteringS2S.py index a0e850154f..bd569cb494 100644 --- a/mteb/tasks/Clustering/en/MedrxivClusteringS2S.py +++ b/mteb/tasks/Clustering/en/MedrxivClusteringS2S.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskClustering import AbsTaskClustering class MedrxivClusteringS2S(AbsTaskClustering): + metadata = TaskMetadata( + name="MedrxivClusteringS2S", + description="Clustering of titles from medrxiv. Clustering of 10 sets, based on the main category.", + reference="https://api.medrxiv.org/", + hf_hub_name="mteb/medrxiv-clustering-s2s", + type="Clustering", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="v_measure", + revision="35191c8c0dca72d8ff3efcd72aa802307d469663", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "MedrxivClusteringS2S", - "hf_hub_name": "mteb/medrxiv-clustering-s2s", - "description": "Clustering of titles from medrxiv. Clustering of 10 sets, based on the main category.", - "reference": "https://api.biorxiv.org/", - "type": "Clustering", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "v_measure", - "revision": "35191c8c0dca72d8ff3efcd72aa802307d469663", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Clustering/en/RedditClustering.py b/mteb/tasks/Clustering/en/RedditClustering.py index 79bb842cd6..c6a5131ea6 100644 --- a/mteb/tasks/Clustering/en/RedditClustering.py +++ b/mteb/tasks/Clustering/en/RedditClustering.py @@ -1,21 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskClustering import AbsTaskClustering class RedditClustering(AbsTaskClustering): + metadata = TaskMetadata( + name="RedditClustering", + description="Clustering of titles from 199 subreddits. Clustering of 25 sets, each with 10-50 classes, and each class with 100 - 1000 sentences.", + reference="https://arxiv.org/abs/2104.07081", + hf_hub_name="mteb/reddit-clustering", + type="Clustering", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="v_measure", + revision="24640382cdbf8abc73003fb0fa6d111a705499eb", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "RedditClustering", - "hf_hub_name": "mteb/reddit-clustering", - "description": ( - "Clustering of titles from 199 subreddits. Clustering of 25 sets, each with 10-50 classes, and each" - " class with 100 - 1000 sentences." 
-            ),
-            "reference": "https://arxiv.org/abs/2104.07081",
-            "type": "Clustering",
-            "category": "s2s",
-            "eval_splits": ["test"],
-            "eval_langs": ["en"],
-            "main_score": "v_measure",
-            "revision": "24640382cdbf8abc73003fb0fa6d111a705499eb",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Clustering/en/RedditClusteringP2P.py b/mteb/tasks/Clustering/en/RedditClusteringP2P.py
index e5e7f0ff86..79871749a8 100644
--- a/mteb/tasks/Clustering/en/RedditClusteringP2P.py
+++ b/mteb/tasks/Clustering/en/RedditClusteringP2P.py
@@ -1,20 +1,34 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskClustering import AbsTaskClustering
 
 
 class RedditClusteringP2P(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="RedditClusteringP2P",
+        description="Clustering of title+posts from reddit. Clustering of 10 sets of 50k paragraphs and 40 sets of 10k paragraphs.",
+        reference="https://arxiv.org/abs/2104.07081",
+        hf_hub_name="mteb/reddit-clustering-p2p",
+        type="Clustering",
+        category="p2p",
+        eval_splits=["test"],
+        eval_langs=["en"],
+        main_score="v_measure",
+        revision="282350215ef01743dc01b456c7f5241fa8937f16",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "RedditClusteringP2P",
-            "hf_hub_name": "mteb/reddit-clustering-p2p",
-            "description": (
-                "Clustering of title+posts from reddit. Clustering of 10 sets with 1K - 100K samples and 10 - 100 labels each."
-            ),
-            "reference": "https://huggingface.co/datasets/sentence-transformers/reddit-title-body",
-            "type": "Clustering",
-            "category": "p2p",
-            "eval_splits": ["test"],
-            "eval_langs": ["en"],
-            "main_score": "v_measure",
-            "revision": "282350215ef01743dc01b456c7f5241fa8937f16",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Clustering/en/StackExchangeClustering.py b/mteb/tasks/Clustering/en/StackExchangeClustering.py
index cc02da9b20..401400e8bc 100644
--- a/mteb/tasks/Clustering/en/StackExchangeClustering.py
+++ b/mteb/tasks/Clustering/en/StackExchangeClustering.py
@@ -1,9 +1,37 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskClustering import AbsTaskClustering
 
 
 class StackExchangeClustering(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="StackExchangeClustering",
+        description="Clustering of titles from 121 stackexchanges. 
Clustering of 25 sets, each with 10-50 classes, and each class with 100 - 1000 sentences.",
+        reference="https://arxiv.org/abs/2104.07081",
+        hf_hub_name="mteb/stackexchange-clustering",
+        type="Clustering",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["en"],
+        main_score="v_measure",
+        revision="6cbc1f7b2bc0622f2e39d2c77fa502909748c259",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
         return {
             "name": "StackExchangeClustering",
             "hf_hub_name": "mteb/stackexchange-clustering",
diff --git a/mteb/tasks/Clustering/en/StackExchangeClusteringP2P.py b/mteb/tasks/Clustering/en/StackExchangeClusteringP2P.py
index 20dc6dd2cf..3ceb56e49f 100644
--- a/mteb/tasks/Clustering/en/StackExchangeClusteringP2P.py
+++ b/mteb/tasks/Clustering/en/StackExchangeClusteringP2P.py
@@ -1,21 +1,34 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskClustering import AbsTaskClustering
 
 
 class StackExchangeClusteringP2P(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="StackExchangeClusteringP2P",
+        description="Clustering of title+body from stackexchange. Clustering of 5 sets of 10k paragraphs and 5 sets of 5k paragraphs.",
+        reference="https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_title_body_jsonl",
+        hf_hub_name="mteb/stackexchange-clustering-p2p",
+        type="Clustering",
+        category="p2p",
+        eval_splits=["test"],
+        eval_langs=["en"],
+        main_score="v_measure",
+        revision="815ca46b2622cec33ccafc3735d572c266efdb44",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "StackExchangeClusteringP2P",
-            "hf_hub_name": "mteb/stackexchange-clustering-p2p",
-            "description": (
-                "Clustering of title+body from stackexchange. Clustering of 5 sets of 10k paragraphs and 5 sets of 5k"
-                " paragraphs."
-            ),
-            "reference": "https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_title_body_jsonl",
-            "type": "Clustering",
-            "category": "p2p",
-            "eval_splits": ["test"],
-            "eval_langs": ["en"],
-            "main_score": "v_measure",
-            "revision": "815ca46b2622cec33ccafc3735d572c266efdb44",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Clustering/en/TwentyNewsgroupsClustering.py b/mteb/tasks/Clustering/en/TwentyNewsgroupsClustering.py
index 317740ba18..fa435b1bba 100644
--- a/mteb/tasks/Clustering/en/TwentyNewsgroupsClustering.py
+++ b/mteb/tasks/Clustering/en/TwentyNewsgroupsClustering.py
@@ -1,18 +1,34 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskClustering import AbsTaskClustering
 
 
 class TwentyNewsgroupsClustering(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="TwentyNewsgroupsClustering",
+        description="Clustering of the 20 Newsgroups dataset (subject only).",
+        reference="https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html",
+        hf_hub_name="mteb/twentynewsgroups-clustering",
+        type="Clustering",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["en"],
+        main_score="v_measure",
+        revision="6125ec4e24fa026cec8a478383ee943acfbd5449",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "TwentyNewsgroupsClustering",
-            "hf_hub_name": "mteb/twentynewsgroups-clustering",
-            "description": "Clustering of the 20 Newsgroups dataset (subject only).",
-            "reference": "https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html",
-            "type": "Clustering",
-            "category": "s2s",
-            "eval_splits": ["test"],
-            "eval_langs": ["en"],
-            "main_score": "v_measure",
-            "revision": "6125ec4e24fa026cec8a478383ee943acfbd5449",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Clustering/en/WikiCitiesClustering.py b/mteb/tasks/Clustering/en/WikiCitiesClustering.py
index 74599a92cc..c8349d51d7 100644
--- a/mteb/tasks/Clustering/en/WikiCitiesClustering.py
+++ b/mteb/tasks/Clustering/en/WikiCitiesClustering.py
@@ -1,21 +1,34 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskClustering import AbsTaskClustering
 
 
 class WikiCitiesClustering(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="WikiCitiesClustering",
+        description="Clustering of Wikipedia articles of cities by country from https://huggingface.co/datasets/wikipedia. Test set includes 126 countries, and a total of 3531 cities.",
+        reference="https://huggingface.co/datasets/wikipedia",
+        hf_hub_name="jinaai/cities_wiki_clustering",
+        type="Clustering",
+        category="p2p",
+        eval_splits=["test"],
+        eval_langs=["en"],
+        main_score="v_measure",
+        revision="ddc9ee9242fa65332597f70e967ecc38b9d734fa",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "WikiCitiesClustering",
-            "hf_hub_name": "jinaai/cities_wiki_clustering",
-            "description": (
-                "Clustering of Wikipedia articles of cities by country from https://huggingface.co/datasets/wikipedia."
-                "Test set includes 126 countries, and a total of 3531 cities."
-            ),
-            "reference": "https://huggingface.co/datasets/wikipedia",
-            "type": "Clustering",
-            "category": "p2p",
-            "eval_splits": ["test"],
-            "eval_langs": ["en"],
-            "main_score": "v_measure",
-            "revision": "ddc9ee9242fa65332597f70e967ecc38b9d734fa",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Clustering/es/FloresClusteringS2S.py b/mteb/tasks/Clustering/es/FloresClusteringS2S.py
index e8e685357e..f5c278d360 100644
--- a/mteb/tasks/Clustering/es/FloresClusteringS2S.py
+++ b/mteb/tasks/Clustering/es/FloresClusteringS2S.py
@@ -1,20 +1,34 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskClustering import AbsTaskClustering
 
 
 class FloresClusteringS2S(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="FloresClusteringS2S",
+        description="Clustering of sentences from various web articles, 32 topics in total.",
+        reference="https://huggingface.co/datasets/facebook/flores",
+        hf_hub_name="jinaai/flores_clustering",
+        type="Clustering",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["es"],
+        main_score="v_measure",
+        revision="480b580487f53a46f881354a8348335d4edbb2de",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "FloresClusteringS2S",
-            "hf_hub_name": "jinaai/flores_clustering",
-            "description": (
-                "Clustering of sentences from various web articles, 32 topics in total."
-            ),
-            "reference": "https://huggingface.co/datasets/facebook/flores",
-            "type": "Clustering",
-            "category": "s2s",
-            "eval_splits": ["test"],
-            "eval_langs": ["es"],
-            "main_score": "v_measure",
-            "revision": "480b580487f53a46f881354a8348335d4edbb2de",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Clustering/es/SpanishNewsClusteringP2P.py b/mteb/tasks/Clustering/es/SpanishNewsClusteringP2P.py
index 80df01a8df..fc2a0e3a96 100644
--- a/mteb/tasks/Clustering/es/SpanishNewsClusteringP2P.py
+++ b/mteb/tasks/Clustering/es/SpanishNewsClusteringP2P.py
@@ -1,15 +1,41 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskClustering import AbsTaskClustering
 
 
 class SpanishNewsClusteringP2P(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="SpanishNewsClusteringP2P",
+        description="Clustering of news articles, 7 topics in total.",
+        reference="https://www.kaggle.com/datasets/kevinmorgado/spanish-news-classification",
+        hf_hub_name="jinaai/spanish_news_clustering",
+        type="Clustering",
+        category="p2p",
+        eval_splits=["test"],
+        eval_langs=["es"],
+        main_score="v_measure",
+        revision="b5edc3d3d7c12c7b9f883e9da50f6732f3624142",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
         return {
             "name": "SpanishNewsClusteringP2P",
             "hf_hub_name": "jinaai/spanish_news_clustering",
-            "description": (
-                "Clustering of news articles, 7 topics in total."
-            ),
+            "description": ("Clustering of news articles, 7 topics in total."),
             "reference": "https://www.kaggle.com/datasets/kevinmorgado/spanish-news-classification",
             "type": "Clustering",
             "category": "p2p",
diff --git a/mteb/tasks/Clustering/fr/AlloProfClusteringP2P.py b/mteb/tasks/Clustering/fr/AlloProfClusteringP2P.py
index f270930173..f0b06a12a1 100644
--- a/mteb/tasks/Clustering/fr/AlloProfClusteringP2P.py
+++ b/mteb/tasks/Clustering/fr/AlloProfClusteringP2P.py
@@ -1,26 +1,40 @@
+from __future__ import annotations
+
 import datasets
 import numpy as np
 
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskClustering import AbsTaskClustering
 
 
 class AlloProfClusteringP2P(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="AlloProfClusteringP2P",
+        description="Clustering of document titles and descriptions from Allo Prof dataset. Clustering of 10 sets on the document topic.",
+        reference="https://huggingface.co/datasets/lyon-nlp/alloprof",
+        hf_hub_name="lyon-nlp/alloprof",
+        type="Clustering",
+        category="p2p",
+        eval_splits=["test"],
+        eval_langs=["fr"],
+        main_score="v_measure",
+        revision="392ba3f5bcc8c51f578786c1fc3dae648662cb9b",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "AlloProfClusteringP2P",
-            "hf_hub_name": "lyon-nlp/alloprof",
-            "description": (
-                "Clustering of document titles and descriptions from Allo Prof dataset. Clustering of 10 sets on the document topic."
-            ),
-            "reference": "https://huggingface.co/datasets/lyon-nlp/alloprof",
-            "type": "Clustering",
-            "category": "p2p",
-            "eval_splits": ["test"],
-            "eval_langs": ["fr"],
-            "main_score": "v_measure",
-            "revision": "392ba3f5bcc8c51f578786c1fc3dae648662cb9b",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
 
     def load_data(self, **kwargs):
         """
@@ -30,9 +44,9 @@ def load_data(self, **kwargs):
             return
 
         self.dataset = datasets.load_dataset(
-            self.description["hf_hub_name"],
+            self.metadata_dict["hf_hub_name"],
             "documents",
-            revision=self.description.get("revision", None),
+            revision=self.metadata_dict.get("revision", None),
         )
         self.dataset_transform()
         self.data_loaded = True
diff --git a/mteb/tasks/Clustering/fr/AlloProfClusteringS2S.py b/mteb/tasks/Clustering/fr/AlloProfClusteringS2S.py
index bb98c3b086..a8d7b87fe8 100644
--- a/mteb/tasks/Clustering/fr/AlloProfClusteringS2S.py
+++ b/mteb/tasks/Clustering/fr/AlloProfClusteringS2S.py
@@ -1,27 +1,41 @@
+from __future__ import annotations
+
 import datasets
 import numpy as np
 
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskClustering import AbsTaskClustering
 
 
 class AlloProfClusteringS2S(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="AlloProfClusteringS2S",
+        description="Clustering of document titles from Allo Prof dataset. 
Clustering of 10 sets on the document topic.", + reference="https://huggingface.co/datasets/lyon-nlp/alloprof", + hf_hub_name="mteb/alloprof", + type="Clustering", + category="s2s", + eval_splits=["test"], + eval_langs=["fr"], + main_score="v_measure", + revision="392ba3f5bcc8c51f578786c1fc3dae648662cb9b", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "AlloProfClusteringS2S", - "hf_hub_name": "lyon-nlp/alloprof", - "description": ( - "Clustering of document titles from Allo Prof dataset. Clustering of 10 sets on the document topic." - ), - "reference": "https://huggingface.co/datasets/lyon-nlp/alloprof", - "type": "Clustering", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["fr"], - "main_score": "v_measure", - "revision": "392ba3f5bcc8c51f578786c1fc3dae648662cb9b", - } - + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) + def load_data(self, **kwargs): """ Load dataset from HuggingFace hub and convert it to the standard format. @@ -29,9 +43,9 @@ def load_data(self, **kwargs): if self.data_loaded: return self.dataset = datasets.load_dataset( - self.description["hf_hub_name"], + self.metadata_dict["hf_hub_name"], "documents", - revision=self.description.get("revision", None), + revision=self.metadata_dict.get("revision", None), ) self.dataset_transform() self.data_loaded = True diff --git a/mteb/tasks/Clustering/fr/HALClusteringS2S.py b/mteb/tasks/Clustering/fr/HALClusteringS2S.py index c6f57c836e..bcd177fb72 100644 --- a/mteb/tasks/Clustering/fr/HALClusteringS2S.py +++ b/mteb/tasks/Clustering/fr/HALClusteringS2S.py @@ -1,26 +1,40 @@ +from __future__ import annotations + import datasets import numpy as np +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskClustering import AbsTaskClustering class HALClusteringS2S(AbsTaskClustering): + metadata = TaskMetadata( + name="HALClusteringS2S", + description="Clustering of titles from HAL (https://huggingface.co/datasets/lyon-nlp/clustering-hal-s2s)", + reference="https://huggingface.co/datasets/lyon-nlp/clustering-hal-s2s", + hf_hub_name="mteb/hal", + type="Clustering", + category="s2s", + eval_splits=["test"], + eval_langs=["fr"], + main_score="v_measure", + revision="e06ebbbb123f8144bef1a5d18796f3dec9ae2915", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "HALClusteringS2S", - "hf_hub_name": "lyon-nlp/clustering-hal-s2s", - "description": ( - "Clustering of titles from HAL (https://hal.science/). Clustering of 10 sets on the main category." 
-            ),
-            "reference": "https://huggingface.co/datasets/lyon-nlp/clustering-hal-s2s",
-            "type": "Clustering",
-            "category": "s2s",
-            "eval_splits": ["test"],
-            "eval_langs": ["fr"],
-            "main_score": "v_measure",
-            "revision": "e06ebbbb123f8144bef1a5d18796f3dec9ae2915",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
 
     def load_data(self, **kwargs):
         """
@@ -30,8 +44,8 @@ def load_data(self, **kwargs):
             return
 
         self.dataset = datasets.load_dataset(
-            self.description["hf_hub_name"],
-            revision=self.description.get("revision", None),
+            self.metadata_dict["hf_hub_name"],
+            revision=self.metadata_dict.get("revision", None),
         )
         self.dataset_transform()
         self.data_loaded = True
diff --git a/mteb/tasks/Clustering/fr/MLSUMClusteringP2P.py b/mteb/tasks/Clustering/fr/MLSUMClusteringP2P.py
index e659800256..1adbf615f4 100644
--- a/mteb/tasks/Clustering/fr/MLSUMClusteringP2P.py
+++ b/mteb/tasks/Clustering/fr/MLSUMClusteringP2P.py
@@ -1,26 +1,40 @@
+from __future__ import annotations
+
 import datasets
 import numpy as np
 
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskClustering import AbsTaskClustering
 
 
 class MLSUMClusteringP2P(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="MLSUMClusteringP2P",
+        description="Clustering of newspaper article contents and titles from MLSUM dataset. Clustering of 10 sets on the newspaper article topics.",
+        reference="https://huggingface.co/datasets/mlsum",
+        hf_hub_name="mlsum",
+        type="Clustering",
+        category="p2p",
+        eval_splits=["test"],
+        eval_langs=["fr"],
+        main_score="v_measure",
+        revision="b5d54f8f3b61ae17845046286940f03c6bc79bc7",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "MLSUMClusteringP2P",
-            "hf_hub_name": "mlsum",
-            "description": (
-                "Clustering of newspaper article contents and titles from MLSUM dataset. Clustering of 10 sets on the newpaper article topics."
-            ),
-            "reference": "https://huggingface.co/datasets/mlsum",
-            "type": "Clustering",
-            "category": "p2p",
-            "eval_splits": ["test"],
-            "eval_langs": ["fr"],
-            "main_score": "v_measure",
-            "revision": "b5d54f8f3b61ae17845046286940f03c6bc79bc7",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
 
     def load_data(self, **kwargs):
         """
@@ -29,10 +43,10 @@ def load_data(self, **kwargs):
         if self.data_loaded:
             return
         self.dataset = datasets.load_dataset(
-            self.description["hf_hub_name"],
+            self.metadata_dict["hf_hub_name"],
             "fr",
-            split=self.description["eval_splits"][0],
-            revision=self.description.get("revision", None),
+            split=self.metadata_dict["eval_splits"][0],
+            revision=self.metadata_dict.get("revision", None),
         )
         self.dataset_transform()
         self.data_loaded = True
@@ -53,4 +67,6 @@ def dataset_transform(self):
             "sentences": [split.tolist() for split in np.array_split(texts, 10)],
             "labels": [split.tolist() for split in np.array_split(topics, 10)],
         }
-        self.dataset = {self.description["eval_splits"][0]: datasets.Dataset.from_dict(new_format)}
+        self.dataset = {
+            self.metadata_dict["eval_splits"][0]: datasets.Dataset.from_dict(new_format)
+        }
diff --git a/mteb/tasks/Clustering/fr/MLSUMClusteringS2S.py b/mteb/tasks/Clustering/fr/MLSUMClusteringS2S.py
index 425d8b34c4..af522fbe8e 100644
--- a/mteb/tasks/Clustering/fr/MLSUMClusteringS2S.py
+++ b/mteb/tasks/Clustering/fr/MLSUMClusteringS2S.py
@@ -1,26 +1,40 @@
+from __future__ import annotations
+
 import datasets
 import numpy as np
 
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskClustering import AbsTaskClustering
 
 
 class MLSUMClusteringS2S(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="MLSUMClusteringS2S",
+        description="Clustering of newspaper article titles from MLSUM dataset. Clustering of 10 sets on the newspaper article topics.",
+        reference="https://huggingface.co/datasets/mlsum",
+        hf_hub_name="mlsum",
+        type="Clustering",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["fr"],
+        main_score="v_measure",
+        revision="b5d54f8f3b61ae17845046286940f03c6bc79bc7",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "MLSUMClusteringS2S",
-            "hf_hub_name": "mlsum",
-            "description": (
-                "Clustering of newspaper article titles from MLSUM dataset. Clustering of 10 sets on the newpaper article topics."
-            ),
-            "reference": "https://huggingface.co/datasets/mlsum",
-            "type": "Clustering",
-            "category": "s2s",
-            "eval_splits": ["test"],
-            "eval_langs": ["fr"],
-            "main_score": "v_measure",
-            "revision": "b5d54f8f3b61ae17845046286940f03c6bc79bc7",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
 
     def load_data(self, **kwargs):
         """
@@ -29,10 +43,10 @@ def load_data(self, **kwargs):
         if self.data_loaded:
             return
         self.dataset = datasets.load_dataset(
-            self.description["hf_hub_name"],
+            self.metadata_dict["hf_hub_name"],
             "fr",
-            split=self.description["eval_splits"][0],
-            revision=self.description.get("revision", None),
+            split=self.metadata_dict["eval_splits"][0],
+            revision=self.metadata_dict.get("revision", None),
         )
         self.dataset_transform()
         self.data_loaded = True
@@ -43,7 +57,13 @@ def dataset_transform(self):
         """
         self.dataset = self.dataset.remove_columns(["summary", "text", "url", "date"])
         new_format = {
-            "sentences": [split.tolist() for split in np.array_split(self.dataset["title"], 10)],
-            "labels": [split.tolist() for split in np.array_split(self.dataset["topic"], 10)],
+            "sentences": [
+                split.tolist() for split in np.array_split(self.dataset["title"], 10)
+            ],
+            "labels": [
+                split.tolist() for split in np.array_split(self.dataset["topic"], 10)
+            ],
+        }
+        self.dataset = {
+            self.metadata_dict["eval_splits"][0]: datasets.Dataset.from_dict(new_format)
         }
-        self.dataset = {self.description["eval_splits"][0]: datasets.Dataset.from_dict(new_format)}
diff --git a/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringP2P.py b/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringP2P.py
index 81d2591838..7155b6e899 100644
--- a/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringP2P.py
+++ b/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringP2P.py
@@ -1,9 +1,13 @@
+from __future__ import annotations
+
 import datasets
 import numpy as np
 
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks import AbsTaskClustering, MultilingualTask
 
-_LANGUAGES =[
+_LANGUAGES = [
     "amh",
     "eng",
     "fra",
@@ -24,22 +28,32 @@
 
 
 class MasakhaNEWSClusteringP2P(AbsTaskClustering, MultilingualTask):
+    metadata = TaskMetadata(
+        name="MasakhaNEWSClusteringP2P",
+        description="Clustering of news article headlines and texts from MasakhaNEWS dataset. Clustering of 10 sets on the news article label.",
+        reference="https://huggingface.co/datasets/masakhane/masakhanews",
+        hf_hub_name="masakhane/masakhanews",
+        type="Clustering",
+        category="p2p",
+        eval_splits=["test"],
+        eval_langs=_LANGUAGES,
+        main_score="v_measure",
+        revision="8ccc72e69e65f40c70e117d8b3c08306bb788b60",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
    @property
-    def description(self):
-        return {
-            "name": "MasakhaNEWSClusteringP2P",
-            "hf_hub_name": "masakhane/masakhanews",
-            "description": (
-                "Clustering of news article headlines and texts from MasakhaNEWS dataset. Clustering of 10 sets on the news article label."
- ), - "reference": "https://huggingface.co/datasets/masakhane/masakhanews", - "type": "Clustering", - "category": "p2p", - "eval_splits": ["test"], - "eval_langs": _LANGUAGES, - "main_score": "v_measure", - "revision": "8ccc72e69e65f40c70e117d8b3c08306bb788b60", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): """ @@ -50,9 +64,9 @@ def load_data(self, **kwargs): self.dataset = {} for lang in self.langs: self.dataset[lang] = datasets.load_dataset( - self.description["hf_hub_name"], + self.metadata_dict["hf_hub_name"], lang, - revision=self.description.get("revision", None), + revision=self.metadata_dict.get("revision", None), ) self.dataset_transform(lang) self.data_loaded = True @@ -64,7 +78,9 @@ def dataset_transform(self, lang): self.dataset[lang].pop("train") self.dataset[lang].pop("validation") - self.dataset[lang] = self.dataset[lang].remove_columns(["url", "text", "headline"]) + self.dataset[lang] = self.dataset[lang].remove_columns( + ["url", "text", "headline"] + ) texts = self.dataset[lang]["test"]["headline_text"] labels = self.dataset[lang]["test"]["label"] new_format = { diff --git a/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringS2S.py b/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringS2S.py index 2459e3c21b..39fa6966d5 100644 --- a/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringS2S.py +++ b/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringS2S.py @@ -1,9 +1,13 @@ +from __future__ import annotations + import datasets import numpy as np +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskClustering, MultilingualTask -_LANGUAGES =[ +_LANGUAGES = [ "amh", "eng", "fra", @@ -24,22 +28,34 @@ class MasakhaNEWSClusteringS2S(AbsTaskClustering, MultilingualTask): + metadata = TaskMetadata( + name="MasakhaNEWSClusteringS2S", + hf_hub_name="masakhane/masakhanews", + description=( + "Clustering of news article headlines from MasakhaNEWS dataset. Clustering of 10 sets on the news article label." + ), + reference="https://huggingface.co/datasets/masakhane/masakhanews", + type="Clustering", + category="s2s", + eval_splits=["test"], + eval_langs=_LANGUAGES, + main_score="v_measure", + revision="8ccc72e69e65f40c70e117d8b3c08306bb788b60", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "MasakhaNEWSClusteringS2S", - "hf_hub_name": "masakhane/masakhanews", - "description": ( - "Clustering of news article headlines from MasakhaNEWS dataset. Clustering of 10 sets on the news article label." 
-            ),
-            "reference": "https://huggingface.co/datasets/masakhane/masakhanews",
-            "type": "Clustering",
-            "category": "s2s",
-            "eval_splits": ["test"],
-            "eval_langs": _LANGUAGES,
-            "main_score": "v_measure",
-            "revision": "8ccc72e69e65f40c70e117d8b3c08306bb788b60",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
 
     def load_data(self, **kwargs):
         """
@@ -50,9 +66,9 @@ def load_data(self, **kwargs):
         self.dataset = {}
         for lang in self.langs:
             self.dataset[lang] = datasets.load_dataset(
-                self.description["hf_hub_name"],
+                self.metadata_dict["hf_hub_name"],
                 lang,
-                revision=self.description.get("revision", None),
+                revision=self.metadata_dict.get("revision", None),
             )
             self.dataset_transform(lang)
         self.data_loaded = True
@@ -64,7 +80,9 @@ def dataset_transform(self, lang):
         self.dataset[lang].pop("train")
         self.dataset[lang].pop("validation")
 
-        self.dataset[lang] = self.dataset[lang].remove_columns(["url", "text", "headline_text"])
+        self.dataset[lang] = self.dataset[lang].remove_columns(
+            ["url", "text", "headline_text"]
+        )
         texts = self.dataset[lang]["test"]["headline"]
         labels = self.dataset[lang]["test"]["label"]
         new_format = {
diff --git a/mteb/tasks/Clustering/pl/PolishClustering.py b/mteb/tasks/Clustering/pl/PolishClustering.py
index 214cbe3c5f..35e185da63 100644
--- a/mteb/tasks/Clustering/pl/PolishClustering.py
+++ b/mteb/tasks/Clustering/pl/PolishClustering.py
@@ -1,18 +1,35 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskClustering import AbsTaskClustering
 
 
 class EightTagsClustering(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="8TagsClustering",
+        description="Clustering of headlines from social media posts in Polish belonging to 8 categories: film, history, "
+        "food, medicine, motorization, work, sport and technology.",
+        reference="https://aclanthology.org/2020.lrec-1.207.pdf",
+        hf_hub_name="PL-MTEB/8tags-clustering",
+        type="Clustering",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["pl"],
+        main_score="v_measure",
+        revision="e7a26af6f3ae46b30dde8737f02c07b1505bcc73",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "8TagsClustering",
-            "hf_hub_name": "PL-MTEB/8tags-clustering",
-            "description": "Clustering of headlines from social media posts in Polish belonging to 8 categories: film, history, "
-            "food, medicine, motorization, work, sport and technology.",
-            "reference": "https://aclanthology.org/2020.lrec-1.207.pdf",
-            "type": "Clustering",
-            "category": "s2s",
-            "eval_splits": ["test"],
-            "eval_langs": ["pl"],
-            "main_score": "v_measure"
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Clustering/zh/CMTEBClustering.py b/mteb/tasks/Clustering/zh/CMTEBClustering.py
index 94862ff0ea..91504b871e 100644
--- a/mteb/tasks/Clustering/zh/CMTEBClustering.py
+++ b/mteb/tasks/Clustering/zh/CMTEBClustering.py
@@ -1,75 +1,121 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskClustering import AbsTaskClustering
 
 
 class CLSClusteringS2S(AbsTaskClustering):
-    @property
-    def description(self):
-        return {
-            "name": "CLSClusteringS2S",
-            "hf_hub_name": "C-MTEB/CLSClusteringS2S",
-            "description": (
-                "Clustering of titles from CLS dataset. 
Clustering of 13 sets, based on the main category."
-            ),
-            "reference": "https://arxiv.org/abs/2209.05034",
-            "type": "Clustering",
-            "category": "s2s",
-            "eval_splits": ["test"],
-            "eval_langs": ["zh"],
-            "main_score": "v_measure",
-            'revision': 'e458b3f5414b62b7f9f83499ac1f5497ae2e869f',
-        }
+    metadata = TaskMetadata(
+        name="CLSClusteringS2S",
+        description="Clustering of titles from CLS dataset. Clustering of 13 sets, based on the main category.",
+        reference="https://arxiv.org/abs/2209.05034",
+        hf_hub_name="C-MTEB/CLSClusteringS2S",
+        type="Clustering",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["zh"],
+        main_score="v_measure",
+        revision="e458b3f5414b62b7f9f83499ac1f5497ae2e869f",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
 
+    @property
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
 
 
 class CLSClusteringP2P(AbsTaskClustering):
-    @property
-    def description(self):
-        return {
-            "name": "CLSClusteringP2P",
-            "hf_hub_name": "C-MTEB/CLSClusteringP2P",
-            "description": (
-                "Clustering of titles + abstract from CLS dataset. Clustering of 13 sets, based on the main category."
-            ),
-            "reference": "https://arxiv.org/abs/2209.05034",
-            "type": "Clustering",
-            "category": "p2p",
-            "eval_splits": ["test"],
-            "eval_langs": ["zh"],
-            "main_score": "v_measure",
-            'revision': '4b6227591c6c1a73bc76b1055f3b7f3588e72476',
-        }
+    metadata = TaskMetadata(
+        name="CLSClusteringP2P",
+        description="Clustering of titles + abstract from CLS dataset. Clustering of 13 sets, based on the main category.",
+        reference="https://arxiv.org/abs/2209.05034",
+        hf_hub_name="C-MTEB/CLSClusteringP2P",
+        type="Clustering",
+        category="p2p",
+        eval_splits=["test"],
+        eval_langs=["zh"],
+        main_score="v_measure",
+        revision="4b6227591c6c1a73bc76b1055f3b7f3588e72476",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
 
+    @property
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
 
 
 class ThuNewsClusteringS2S(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="ThuNewsClusteringS2S",
+        hf_hub_name="C-MTEB/ThuNewsClusteringS2S",
+        description="Clustering of titles from the THUCNews dataset",
+        reference="http://thuctc.thunlp.org/",
+        type="Clustering",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["zh"],
+        main_score="v_measure",
+        revision="8a8b2caeda43f39e13c4bc5bea0f8a667896e10d",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            'name': 'ThuNewsClusteringS2S',
-            'hf_hub_name': 'C-MTEB/ThuNewsClusteringS2S',
-            'description': 'Clustering of titles from the THUCNews dataset',
-            "reference": "http://thuctc.thunlp.org/",
-            "type": "Clustering",
-            "category": "s2s",
-            "eval_splits": ["test"],
-            "eval_langs": ["zh"],
-            "main_score": "v_measure",
-            'revision': '8a8b2caeda43f39e13c4bc5bea0f8a667896e10d',
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
 
 
 class ThuNewsClusteringP2P(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="ThuNewsClusteringP2P",
+        hf_hub_name="C-MTEB/ThuNewsClusteringP2P",
+        description="Clustering of titles + abstracts from the THUCNews 
dataset", + reference="http://thuctc.thunlp.org/", + type="Clustering", + category="p2p", + eval_splits=["test"], + eval_langs=["zh"], + main_score="v_measure", + revision="5798586b105c0434e4f0fe5e767abe619442cf93", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - 'name': 'ThuNewsClusteringP2P', - 'hf_hub_name': 'C-MTEB/ThuNewsClusteringP2P', - 'description': 'Clustering of titles + abstracts from the THUCNews dataset', - "reference": "http://thuctc.thunlp.org/", - "type": "Clustering", - "category": "p2p", - "eval_splits": ["test"], - "eval_langs": ["zh"], - "main_score": "v_measure", - 'revision': '5798586b105c0434e4f0fe5e767abe619442cf93', - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index 2f103e435d..dc7343c55e 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -1,7 +1,9 @@ -from .zh.CMTEBPairClassification import * -from .multilingual.OpusparcusPC import * -from .pl.PolishPC import * -from .multilingual.PawsX import * +from __future__ import annotations + from .en.SprintDuplicateQuestionsPC import * from .en.TwitterSemEval2015PC import * from .en.TwitterURLCorpusPC import * +from .multilingual.OpusparcusPC import * +from .multilingual.PawsX import * +from .pl.PolishPC import * +from .zh.CMTEBPairClassification import * diff --git a/mteb/tasks/PairClassification/en/SprintDuplicateQuestionsPC.py b/mteb/tasks/PairClassification/en/SprintDuplicateQuestionsPC.py index 8f41026039..a1d70c18f5 100644 --- a/mteb/tasks/PairClassification/en/SprintDuplicateQuestionsPC.py +++ b/mteb/tasks/PairClassification/en/SprintDuplicateQuestionsPC.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskPairClassification import AbsTaskPairClassification class SprintDuplicateQuestionsPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="SprintDuplicateQuestions", + description="Duplicate questions from the Sprint community.", + reference="https://www.aclweb.org/anthology/D18-1131/", + hf_hub_name="mteb/sprintduplicatequestions-pairclassification", + type="PairClassification", + category="s2s", + eval_splits=["validation", "test"], + eval_langs=["en"], + main_score="ap", + revision="d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "SprintDuplicateQuestions", - "hf_hub_name": "mteb/sprintduplicatequestions-pairclassification", - "description": "Duplicate questions from the Sprint community.", - "reference": "https://www.aclweb.org/anthology/D18-1131/", - "category": "s2s", - "type": "PairClassification", - "eval_splits": ["validation", "test"], - "eval_langs": ["en"], - "main_score": "ap", - "revision": "d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/PairClassification/en/TwitterSemEval2015PC.py b/mteb/tasks/PairClassification/en/TwitterSemEval2015PC.py index 8f5941f965..d57b4ca79e 
100644 --- a/mteb/tasks/PairClassification/en/TwitterSemEval2015PC.py +++ b/mteb/tasks/PairClassification/en/TwitterSemEval2015PC.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskPairClassification import AbsTaskPairClassification class TwitterSemEval2015PC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="TwitterSemEval2015", + hf_hub_name="mteb/twittersemeval2015-pairclassification", + description="Paraphrase-Pairs of Tweets from the SemEval 2015 workshop.", + reference="https://alt.qcri.org/semeval2015/task1/", + category="s2s", + type="PairClassification", + eval_splits=["test"], + eval_langs=["en"], + main_score="ap", + revision="70970daeab8776df92f5ea462b6173c0b46fd2d1", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "TwitterSemEval2015", - "hf_hub_name": "mteb/twittersemeval2015-pairclassification", - "description": "Paraphrase-Pairs of Tweets from the SemEval 2015 workshop.", - "reference": "https://alt.qcri.org/semeval2015/task1/", - "category": "s2s", - "type": "PairClassification", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ap", - "revision": "70970daeab8776df92f5ea462b6173c0b46fd2d1", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/PairClassification/en/TwitterURLCorpusPC.py b/mteb/tasks/PairClassification/en/TwitterURLCorpusPC.py index a9ee79a3eb..e4b1425e55 100644 --- a/mteb/tasks/PairClassification/en/TwitterURLCorpusPC.py +++ b/mteb/tasks/PairClassification/en/TwitterURLCorpusPC.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskPairClassification import AbsTaskPairClassification class TwitterURLCorpusPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="TwitterURLCorpus", + hf_hub_name="mteb/twitterurlcorpus-pairclassification", + description="Paraphrase-Pairs of Tweets.", + reference="https://languagenet.github.io/", + category="s2s", + type="PairClassification", + eval_splits=["test"], + eval_langs=["en"], + main_score="ap", + revision="8b6510b0b1fa4e4c4f879467980e9be563ec1cdf", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "TwitterURLCorpus", - "hf_hub_name": "mteb/twitterurlcorpus-pairclassification", - "description": "Paraphrase-Pairs of Tweets.", - "reference": "https://languagenet.github.io/", - "category": "s2s", - "type": "PairClassification", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ap", - "revision": "8b6510b0b1fa4e4c4f879467980e9be563ec1cdf", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/PairClassification/multilingual/OpusparcusPC.py b/mteb/tasks/PairClassification/multilingual/OpusparcusPC.py index 44980105f7..35b878a363 100644 --- a/mteb/tasks/PairClassification/multilingual/OpusparcusPC.py +++ b/mteb/tasks/PairClassification/multilingual/OpusparcusPC.py @@ -1,24 +1,41 @@ -from ....abstasks import AbsTaskPairClassification, MultilingualTask +from __future__ import annotations + 
import datasets +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks import AbsTaskPairClassification, MultilingualTask + _LANGUAGES = ["de", "en", "fi", "fr", "ru", "sv"] class OpusparcusPC(AbsTaskPairClassification, MultilingualTask): + metadata = TaskMetadata( + name="OpusparcusPC", + hf_hub_name="GEM/opusparcus", + description="Opusparcus is a paraphrase corpus for six European languages: German, English, Finnish, French, Russian, and Swedish. The paraphrases consist of subtitles from movies and TV shows.", + reference="https://gem-benchmark.com/data_cards/opusparcus", + category="s2s", + type="PairClassification", + eval_splits=["test.full", "validation.full"], + eval_langs=_LANGUAGES, + main_score="ap", + revision="9e9b1f8ef51616073f47f306f7f47dd91663f86a", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "OpusparcusPC", - "hf_hub_name": "GEM/opusparcus", - "description": "Opusparcus is a paraphrase corpus for six European language: German, English, Finnish, French, Russian, and Swedish. The paraphrases consist of subtitles from movies and TV shows.", - "reference": "https://gem-benchmark.com/data_cards/opusparcus", - "category": "s2s", - "type": "PairClassification", - "eval_splits": ["test.full", "validation.full"], - "eval_langs": _LANGUAGES, - "main_score": "ap", - "revision": "9e9b1f8ef51616073f47f306f7f47dd91663f86a", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): """ @@ -29,10 +46,10 @@ def load_data(self, **kwargs): self.dataset = {} for lang in self.langs: self.dataset[lang] = datasets.load_dataset( - self.description["hf_hub_name"], + self.metadata_dict["hf_hub_name"], lang=lang, quality=100, - revision=self.description.get("revision", None), + revision=self.metadata_dict.get("revision", None), ) self.dataset_transform(lang) self.data_loaded = True @@ -45,7 +62,9 @@ def dataset_transform(self, lang): sent2 = self.dataset[lang][split]["target"] new_dict = {} # Labels are a score between 1.0 and 4.0, and we need binary classification - labels = [0 if label < 2.5 else 1 if label > 2.5 else 2.5 for label in labels] + labels = [ + 0 if label < 2.5 else 1 if label > 2.5 else 2.5 for label in labels + ] # Get neutral label to delete them neutral = [i for i, val in enumerate(labels) if val == 2.5] for i in sorted(neutral, reverse=True): diff --git a/mteb/tasks/PairClassification/multilingual/PawsX.py b/mteb/tasks/PairClassification/multilingual/PawsX.py index cb77d1f6d1..f7259d50e3 100644 --- a/mteb/tasks/PairClassification/multilingual/PawsX.py +++ b/mteb/tasks/PairClassification/multilingual/PawsX.py @@ -1,24 +1,40 @@ +from __future__ import annotations + import datasets +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import MultilingualTask from ....abstasks.AbsTaskPairClassification import AbsTaskPairClassification class PawsX(MultilingualTask, AbsTaskPairClassification): + metadata = TaskMetadata( + name="PawsX", + hf_hub_name="paws-x", + description="PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification", + reference="https://arxiv.org/abs/1908.11828", + category="s2s", + type="PairClassification", + eval_splits=["test"], + eval_langs=["de", "en", "es", "fr", "ja", "ko", "zh"], + main_score="ap", + revision="8a04d940a42cd40658986fdd8e3da561533a3646", + date=None, + form=None, + 
domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "PawsX", - "hf_hub_name": "paws-x", - "description": "", - "reference": "", - "category": "s2s", - "type": "PairClassification", - "eval_splits": ["test"], - "eval_langs": ["de", "en", "es", "fr", "ja", "ko", "zh"], - "main_score": "ap", - "revision": "8a04d940a42cd40658986fdd8e3da561533a3646", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: @@ -27,9 +43,9 @@ def load_data(self, **kwargs): self.dataset = dict() for lang in self.langs: hf_dataset = datasets.load_dataset( - self.description["hf_hub_name"], + self.metadata_dict["hf_hub_name"], lang, - revision=self.description.get("revision", None), + revision=self.metadata_dict.get("revision", None), ) sent1 = [] diff --git a/mteb/tasks/PairClassification/pl/PolishPC.py b/mteb/tasks/PairClassification/pl/PolishPC.py index ac888e09d2..0a985aff9f 100644 --- a/mteb/tasks/PairClassification/pl/PolishPC.py +++ b/mteb/tasks/PairClassification/pl/PolishPC.py @@ -1,65 +1,121 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskPairClassification import AbsTaskPairClassification class SickePLPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="SICK-E-PL", + hf_hub_name="PL-MTEB/sicke-pl-pairclassification", + description="Polish version of SICK dataset for textual entailment.", + reference="https://aclanthology.org/2020.lrec-1.207", + category="s2s", + type="PairClassification", + eval_splits=["test"], + eval_langs=["pl"], + main_score="ap", + revision="5c59e41555244b7e45c9a6be2d720ab4bafae558", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "SICK-E-PL", - "hf_hub_name": "PL-MTEB/sicke-pl-pairclassification", - "description": "Polish version of SICK dataset for textual entailment.", - "reference": "https://aclanthology.org/2020.lrec-1.207.pdf", - "category": "s2s", - "type": "PairClassification", - "eval_splits": ["test"], - "eval_langs": ["pl"], - "main_score": "ap", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) class PpcPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PpcPC", + hf_hub_name="PL-MTEB/ppc-pairclassification", + description="Polish Paraphrase Corpus", + reference="https://arxiv.org/pdf/2207.12759.pdf", + category="s2s", + type="PairClassification", + eval_splits=["test"], + eval_langs=["pl"], + main_score="ap", + revision="1.0", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "PPC", - "hf_hub_name": "PL-MTEB/ppc-pairclassification", - "description": "Polish Paraphrase Corpus", - "reference": "https://arxiv.org/pdf/2207.12759.pdf", - "category": "s2s", - "type": "PairClassification", - "eval_splits": ["test"], - "eval_langs": ["pl"], - "main_score": "ap" - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) class CdscePC(AbsTaskPairClassification): + 
metadata = TaskMetadata( + name="CDSC-E", + hf_hub_name="PL-MTEB/cdsce-pairclassification", + description="Compositional Distributional Semantics Corpus for textual entailment.", + reference="https://aclanthology.org/P17-1073.pdf", + category="s2s", + type="PairClassification", + eval_splits=["test"], + eval_langs=["pl"], + main_score="ap", + revision="1.0", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "CDSC-E", - "hf_hub_name": "PL-MTEB/cdsce-pairclassification", - "description": "Compositional Distributional Semantics Corpus for textual entailment.", - "reference": "https://aclanthology.org/P17-1073.pdf", - "category": "s2s", - "type": "PairClassification", - "eval_splits": ["test"], - "eval_langs": ["pl"], - "main_score": "ap" - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) class PscPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PSC", + hf_hub_name="PL-MTEB/psc-pairclassification", + description="Polish Summaries Corpus", + reference="http://www.lrec-conf.org/proceedings/lrec2014/pdf/1211_Paper.pdf", + category="s2s", + type="PairClassification", + eval_splits=["test"], + eval_langs=["pl"], + main_score="ap", + revision="1.0", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "PSC", - "hf_hub_name": "PL-MTEB/psc-pairclassification", - "description": "Polish Summaries Corpus", - "reference": "http://www.lrec-conf.org/proceedings/lrec2014/pdf/1211_Paper.pdf", - "category": "s2s", - "type": "PairClassification", - "eval_splits": ["test"], - "eval_langs": ["pl"], - "main_score": "ap" - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/PairClassification/zh/CMTEBPairClassification.py b/mteb/tasks/PairClassification/zh/CMTEBPairClassification.py index 5a434da2b7..b6d7a6de2e 100644 --- a/mteb/tasks/PairClassification/zh/CMTEBPairClassification.py +++ b/mteb/tasks/PairClassification/zh/CMTEBPairClassification.py @@ -1,35 +1,62 @@ +from __future__ import annotations + from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class Ocnli(AbsTaskPairClassification): + metadata = TaskMetadata( + name="Ocnli", + description="Original Chinese Natural Language Inference dataset", + reference="https://arxiv.org/abs/2010.05444", + hf_hub_name="C-MTEB/OCNLI", + type="PairClassification", + category="s2s", + eval_splits=["validation"], + eval_langs=["zh"], + main_score="ap", + revision="66e76a618a34d6d565d5538088562851e6daa7ec", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "Ocnli", - "hf_hub_name": "C-MTEB/OCNLI", - "description": "Original Chinese Natural Language Inference dataset", - "reference": "https://arxiv.org/abs/2010.05444", - "category": "s2s", - "type": "PairClassification", - "eval_splits": ["validation"], - "eval_langs": ["zh"], - "main_score": "ap", - "revision": 
"66e76a618a34d6d565d5538088562851e6daa7ec", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) class Cmnli(AbsTaskPairClassification): + metadata = TaskMetadata( + name="Cmnli", + description="Chinese Multi-Genre NLI", + reference="https://huggingface.co/datasets/clue/viewer/cmnli", + hf_hub_name="C-MTEB/CMNLI", + type="PairClassification", + category="s2s", + eval_splits=["validation", "test"], + eval_langs=["zh"], + main_score="accuracy", + revision="41bc36f332156f7adc9e38f53777c959b2ae9766", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "Cmnli", - "hf_hub_name": "C-MTEB/CMNLI", - "description": "Chinese Multi-Genre NLI", - "reference": "https://huggingface.co/datasets/clue/viewer/cmnli", - "category": "s2s", - "type": "PairClassification", - "eval_splits": ["validation"], - "eval_langs": ["zh"], - "main_score": "ap", - "revision": "41bc36f332156f7adc9e38f53777c959b2ae9766", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Reranking/__init__.py b/mteb/tasks/Reranking/__init__.py index ef7fcb7455..a214bdf07b 100644 --- a/mteb/tasks/Reranking/__init__.py +++ b/mteb/tasks/Reranking/__init__.py @@ -1,9 +1,10 @@ -from .fr.AlloprofReranking import * +from __future__ import annotations + from .en.AskUbuntuDupQuestions import * -from .zh.CMTEBReranking import * from .en.MindSmallReranking import * -from .zh.CMTEBReranking import * -from .multilingual.MIRACLReranking import * from .en.SciDocsReranking import * from .en.StackOverflowDupQuestions import * +from .fr.AlloprofReranking import * from .fr.SyntecReranking import * +from .multilingual.MIRACLReranking import * +from .zh.CMTEBReranking import * diff --git a/mteb/tasks/Reranking/en/AskUbuntuDupQuestions.py b/mteb/tasks/Reranking/en/AskUbuntuDupQuestions.py index 1279510f9d..70978a1028 100644 --- a/mteb/tasks/Reranking/en/AskUbuntuDupQuestions.py +++ b/mteb/tasks/Reranking/en/AskUbuntuDupQuestions.py @@ -1,21 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskReranking import AbsTaskReranking class AskUbuntuDupQuestions(AbsTaskReranking): + metadata = TaskMetadata( + name="AskUbuntuDupQuestions", + description="AskUbuntu Question Dataset - Questions from AskUbuntu with manual annotations marking pairs of questions as similar or non-similar", + reference="https://github.com/taolei87/askubuntu", + hf_hub_name="mteb/askubuntudupquestions-reranking", + type="Reranking", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="map", + revision="2000358ca161889fa9c082cb41daa8dcfb161a54", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "AskUbuntuDupQuestions", - "hf_hub_name": "mteb/askubuntudupquestions-reranking", - "description": ( - "AskUbuntu Question Dataset - Questions from AskUbuntu with manual annotations marking pairs of" - " questions as similar or non-similar" - ), - "reference": "https://github.com/taolei87/askubuntu", - "type": "Reranking", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "map", - 
"revision": "2000358ca161889fa9c082cb41daa8dcfb161a54", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Reranking/en/MindSmallReranking.py b/mteb/tasks/Reranking/en/MindSmallReranking.py index 8da64c1159..25476b6ffa 100644 --- a/mteb/tasks/Reranking/en/MindSmallReranking.py +++ b/mteb/tasks/Reranking/en/MindSmallReranking.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskReranking import AbsTaskReranking class MindSmallReranking(AbsTaskReranking): + metadata = TaskMetadata( + name="MindSmallReranking", + description="Microsoft News Dataset: A Large-Scale English Dataset for News Recommendation Research", + reference="https://msnews.github.io/assets/doc/ACL2020_MIND.pdf", + hf_hub_name="msnews/mind-small-reranking", + type="Reranking", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="map", + revision="3bdac13927fdc888b903db93b2ffdbd90b295a69", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "MindSmallReranking", - "hf_hub_name": "mteb/mind_small", - "description": "Microsoft News Dataset: A Large-Scale English Dataset for News Recommendation Research", - "reference": "https://msnews.github.io/assets/doc/ACL2020_MIND.pdf", - "type": "Reranking", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "map", - "revision": "3bdac13927fdc888b903db93b2ffdbd90b295a69", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Reranking/en/SciDocsReranking.py b/mteb/tasks/Reranking/en/SciDocsReranking.py index b7d3991ec9..e5e06e7d50 100644 --- a/mteb/tasks/Reranking/en/SciDocsReranking.py +++ b/mteb/tasks/Reranking/en/SciDocsReranking.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskReranking import AbsTaskReranking class SciDocsReranking(AbsTaskReranking): + metadata = TaskMetadata( + name="SciDocsRR", + description="Ranking of related scientific papers based on their title.", + reference="https://allenai.org/data/scidocs", + hf_hub_name="mteb/scidocs-reranking", + type="Reranking", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="map", + revision="d3c5e1fc0b855ab6097bf1cda04dd73947d7caab", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "SciDocsRR", - "hf_hub_name": "mteb/scidocs-reranking", - "description": "Ranking of related scientific papers based on their title.", - "reference": "https://allenai.org/data/scidocs", - "type": "Reranking", - "category": "s2s", - "eval_splits": ["test", "validation"], - "eval_langs": ["en"], - "main_score": "map", - "revision": "d3c5e1fc0b855ab6097bf1cda04dd73947d7caab", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Reranking/en/StackOverflowDupQuestions.py b/mteb/tasks/Reranking/en/StackOverflowDupQuestions.py index c2721773e1..130f0600c5 100644 --- a/mteb/tasks/Reranking/en/StackOverflowDupQuestions.py +++ 
b/mteb/tasks/Reranking/en/StackOverflowDupQuestions.py @@ -1,20 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskReranking import AbsTaskReranking class StackOverflowDupQuestions(AbsTaskReranking): + metadata = TaskMetadata( + name="StackOverflowDupQuestions", + description="Stack Overflow Duplicate Questions Task for questions with the tags Java, JavaScript and Python", + reference="https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf", + hf_hub_name="mteb/stackoverflowdupquestions-reranking", + type="Reranking", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="map", + revision="e185fbe320c72810689fc5848eb6114e1ef5ec69", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "StackOverflowDupQuestions", - "hf_hub_name": "mteb/stackoverflowdupquestions-reranking", - "description": ( - "Stack Overflow Duplicate Questions Task for questions with the tags Java, JavaScript and Python" - ), - "reference": "https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf", - "type": "Reranking", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "map", - "revision": "e185fbe320c72810689fc5848eb6114e1ef5ec69", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Reranking/fr/AlloprofReranking.py b/mteb/tasks/Reranking/fr/AlloprofReranking.py index bfd6d1d63e..6093487c9f 100644 --- a/mteb/tasks/Reranking/fr/AlloprofReranking.py +++ b/mteb/tasks/Reranking/fr/AlloprofReranking.py @@ -1,21 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskReranking import AbsTaskReranking class AlloprofReranking(AbsTaskReranking): + metadata = TaskMetadata( + name="AlloprofReranking", + description="This dataset was provided by AlloProf, an organisation in Quebec, Canada offering resources and a help forum curated by a large number of teachers to students on all subjects taught in primary and secondary school", + reference="https://huggingface.co/datasets/antoinelb7/alloprof", + hf_hub_name="lyon-nlp/mteb-fr-reranking-alloprof-s2p", + type="Reranking", + category="s2s", + eval_splits=["test"], + eval_langs=["fr"], + main_score="map", + revision="666fdacebe0291776e86f29345663dfaf80a0db9", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "AlloprofReranking", - "hf_hub_name": "lyon-nlp/mteb-fr-reranking-alloprof-s2p", - "description": ( - "This dataset was provided by AlloProf, an organisation in Quebec, Canada offering resources and a help forum" - "curated by a large number of teachers to students on all subjects taught from in primary and secondary school" - ), - "reference": "https://huggingface.co/datasets/antoinelb7/alloprof", - "type": "Reranking", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["fr"], - "main_score": "map", - "revision": "666fdacebe0291776e86f29345663dfaf80a0db9", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git 
a/mteb/tasks/Reranking/fr/SyntecReranking.py b/mteb/tasks/Reranking/fr/SyntecReranking.py index 7a2d350641..b4e25f90fe 100644 --- a/mteb/tasks/Reranking/fr/SyntecReranking.py +++ b/mteb/tasks/Reranking/fr/SyntecReranking.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskReranking import AbsTaskReranking class SyntecReranking(AbsTaskReranking): + metadata = TaskMetadata( + name="SyntecReranking", + description="This dataset has been built from the Syntec Collective bargaining agreement.", + reference="https://huggingface.co/datasets/lyon-nlp/mteb-fr-reranking-syntec-s2p", + hf_hub_name="lyon-nlp/mteb-fr-reranking-syntec-s2p", + type="Reranking", + category="s2p", + eval_splits=["test"], + eval_langs=["fr"], + main_score="map", + revision="b205c5084a0934ce8af14338bf03feb19499c84d", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "SyntecReranking", - "hf_hub_name": "lyon-nlp/mteb-fr-reranking-syntec-s2p", - "description": "This dataset has been built from the Syntec Collective bargaining agreement.", - "reference": "https://huggingface.co/datasets/lyon-nlp/mteb-fr-reranking-syntec-s2p", - "type": "Reranking", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["fr"], - "main_score": "map", - "revision": "b205c5084a0934ce8af14338bf03feb19499c84d", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py index 9629ad3061..fee0c71e45 100644 --- a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py +++ b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py @@ -1,23 +1,35 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import MultilingualTask from ....abstasks.AbsTaskReranking import AbsTaskReranking class MIRACLReranking(MultilingualTask, AbsTaskReranking): + metadata = TaskMetadata( + name="MIRACLReranking", + description="MIRACL (Multilingual Information Retrieval Across a Continuum of Languages) is a multilingual retrieval dataset that focuses on search across 18 different languages. This task focuses on the German and Spanish subset.", + reference="https://project-miracl.github.io/", + hf_hub_name="jinaai/miracl", + type="Reranking", + category="s2s", + eval_splits=["test"], + eval_langs=["de", "es"], + main_score="map", + revision="d28a029f35c4ff7f616df47b0edf54e6882395e6", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "MIRACLReranking", - "hf_hub_name": "jinaai/miracl", - "reference": "https://project-miracl.github.io/", - "description": ( - "MIRACL (Multilingual Information Retrieval Across a Continuum of Languages) is a multilingual " - "retrieval dataset that focuses on search across 18 different languages. This task focuses on " - "the German and Spanish subset." 
- ), - "type": "Reranking", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["de", "es"], - "main_score": "map", - "revision": "d28a029f35c4ff7f616df47b0edf54e6882395e6", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Reranking/zh/CMTEBReranking.py b/mteb/tasks/Reranking/zh/CMTEBReranking.py index 0eeace7f8b..e9bb6ae056 100644 --- a/mteb/tasks/Reranking/zh/CMTEBReranking.py +++ b/mteb/tasks/Reranking/zh/CMTEBReranking.py @@ -1,9 +1,36 @@ +from __future__ import annotations + from mteb.abstasks.AbsTaskReranking import AbsTaskReranking +from mteb.abstasks.TaskMetadata import TaskMetadata class T2Reranking(AbsTaskReranking): + metadata = TaskMetadata( + name="T2Reranking", + description="T2Ranking: A large-scale Chinese Benchmark for Passage Ranking", + reference="https://arxiv.org/abs/2304.03679", + hf_hub_name="C-MTEB/T2Reranking", + type="Reranking", + category="s2s", + eval_splits=["dev"], + eval_langs=["zh"], + main_score="map", + revision="76631901a18387f85eaa53e5450019b87ad58ef9", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) - return { - "name": "T2Reranking", - "hf_hub_name": "C-MTEB/T2Reranking", @@ -19,51 +46,87 @@ def description(self): class MMarcoReranking(AbsTaskReranking): + metadata = TaskMetadata( + name="MMarcoReranking", + description="mMARCO is a multilingual version of the MS MARCO passage ranking dataset", + reference="https://github.com/unicamp-dl/mMARCO", + hf_hub_name="C-MTEB/Mmarco-reranking", + type="Reranking", + category="s2s", + eval_splits=["dev"], + eval_langs=["zh"], + main_score="map", + revision="8e0c766dbe9e16e1d221116a3f36795fbade07f6", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "MMarcoReranking", - "hf_hub_name": "C-MTEB/Mmarco-reranking", - "description": "mMARCO is a multilingual version of the MS MARCO passage ranking dataset", - "reference": "https://github.com/unicamp-dl/mMARCO", - "type": "Reranking", - "category": "s2p", - "eval_splits": ["dev"], - "eval_langs": ["zh"], - "main_score": "map", - "revision": "8e0c766dbe9e16e1d221116a3f36795fbade07f6", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) class CMedQAv1(AbsTaskReranking): + metadata = TaskMetadata( + name="CMedQAv1-reranking", + description="Chinese community medical question answering", + reference="https://github.com/zhangsheng93/cMedQA", + hf_hub_name="C-MTEB/CMedQAv1-reranking", + type="Reranking", + category="s2s", + eval_splits=["test"], + eval_langs=["zh"], + main_score="map", + revision="8d7f1e942507dac42dc58017c1a001c3717da7df", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "CMedQAv1", - "hf_hub_name": "C-MTEB/CMedQAv1-reranking", - "description": "Chinese community medical question answering", - "reference": "https://github.com/zhangsheng93/cMedQA", - "type": "Reranking", - "category": "s2p", - 
"eval_splits": ["test"], - "eval_langs": ["zh"], - "main_score": "map", - "revision": "8d7f1e942507dac42dc58017c1a001c3717da7df", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) class CMedQAv2(AbsTaskReranking): + metadata = TaskMetadata( + name="CMedQAv2-reranking", + description="Chinese community medical question answering", + reference="https://github.com/zhangsheng93/cMedQA2", + hf_hub_name="C-MTEB/CMedQAv2-reranking", + type="Reranking", + category="s2s", + eval_splits=["test"], + eval_langs=["zh"], + main_score="map", + revision="23d186750531a14a0357ca22cd92d712fd512ea0", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "CMedQAv2", - "hf_hub_name": "C-MTEB/CMedQAv2-reranking", - "description": "Chinese community medical question answering", - "reference": "https://github.com/zhangsheng93/cMedQA2", - "type": "Reranking", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["zh"], - "main_score": "map", - "revision": "23d186750531a14a0357ca22cd92d712fd512ea0", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index 84542d75a0..4ca7fb0ff3 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -1,9 +1,10 @@ -from .fr.AlloprofRetrieval import * -from .pl.ArguAnaPLRetrieval import * +from __future__ import annotations + +from .de.GerDaLIRRetrieval import * +from .de.GermanDPRRetrieval import * +from .de.GermanQuADRetrieval import * from .en.ArguAnaRetrieval import * -from .fr.BSARDRetrieval import * from .en.ClimateFEVERRetrieval import * -from .zh.CMTEBRetrieval import * from .en.CQADupstackAndroidRetrieval import * from .en.CQADupstackEnglishRetrieval import * from .en.CQADupstackGamingRetrieval import * @@ -16,45 +17,43 @@ from .en.CQADupstackUnixRetrieval import * from .en.CQADupstackWebmastersRetrieval import * from .en.CQADupstackWordpressRetrieval import * -from .pl.DBPediaPLRetrieval import * from .en.DBPediaRetrieval import * from .en.FEVERRetrieval import * from .en.FiQA2018Retrieval import * -from .pl.FiQAPLRetrieval import * -from .de.GermanQuADRetrieval import * from .en.HagridRetrieval import * -from .pl.HotpotQAPLRetrieval import * from .en.HotpotQARetrieval import * -from .pl.MSMARCOPLRetrieval import * from .en.MSMARCORetrieval import * from .en.MSMARCOv2Retrieval import * from .en.NarrativeQARetrieval import * -from .pl.NFCorpusPLRetrieval import * from .en.NFCorpusRetrieval import * -from .pl.NQPLRetrieval import * from .en.NQRetrieval import * -from .pl.QuoraPLRetrieval import * from .en.QuoraRetrieval import * -from .pl.SCIDOCSPLRetrieval import * from .en.SCIDOCSRetrieval import * -from .pl.SciFactPLRetrieval import * from .en.SciFactRetrieval import * -from .fr.SyntecRetrieval import * from .en.Touche2020Retrieval import * -from .pl.TRECCOVIDPLRetrieval import * from .en.TRECCOVIDRetrieval import * -from .en.NarrativeQARetrieval import * -from .de.GermanQuADRetrieval import * -from .de.GerDaLIRRetrieval import * -from .de.GermanDPRRetrieval import * -from .de.GerDaLIRRetrieval import * -from .multilingual.MultiLongDocRetrieval import * -from .ko.KoStrategyQA import * -from .ko.KoMrtydi import * -from .ko.KoMiracl import * -from .es.SpanishPassageRetrievalS2S import * from 
.es.SpanishPassageRetrievalS2P import * -from .multilingual.XPQARetrieval import * +from .es.SpanishPassageRetrievalS2S import * +from .fr.AlloprofRetrieval import * +from .fr.BSARDRetrieval import * +from .fr.SyntecRetrieval import * +from .ko.KoMiracl import * +from .ko.KoMrtydi import * +from .ko.KoStrategyQA import * from .multilingual.MintakaRetrieval import * from .multilingual.MIRACLRetrieval import * +from .multilingual.MultiLongDocRetrieval import * from .multilingual.XMarketRetrieval import * +from .multilingual.XPQARetrieval import * +from .pl.ArguAnaPLRetrieval import * +from .pl.DBPediaPLRetrieval import * +from .pl.FiQAPLRetrieval import * +from .pl.HotpotQAPLRetrieval import * +from .pl.MSMARCOPLRetrieval import * +from .pl.NFCorpusPLRetrieval import * +from .pl.NQPLRetrieval import * +from .pl.QuoraPLRetrieval import * +from .pl.SCIDOCSPLRetrieval import * +from .pl.SciFactPLRetrieval import * +from .pl.TRECCOVIDPLRetrieval import * +from .zh.CMTEBRetrieval import * diff --git a/mteb/tasks/Retrieval/de/GerDaLIRRetrieval.py b/mteb/tasks/Retrieval/de/GerDaLIRRetrieval.py index 01aa944c08..30d51a416d 100644 --- a/mteb/tasks/Retrieval/de/GerDaLIRRetrieval.py +++ b/mteb/tasks/Retrieval/de/GerDaLIRRetrieval.py @@ -1,48 +1,61 @@ +from __future__ import annotations + import datasets from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata class GerDaLIR(AbsTaskRetrieval): _EVAL_SPLIT = "test" + metadata = TaskMetadata( + name="GerDaLIR", + description="GerDaLIR is a legal information retrieval dataset created from the Open Legal Data platform.", + reference="https://github.com/lavis-nlp/GerDaLIR", + hf_hub_name="jinaai/ger_da_lir", + type="Retrieval", + category="s2p", + eval_splits=[_EVAL_SPLIT], + eval_langs=["de"], + main_score="ndcg_at_10", + revision="0bb47f1d73827e96964edb84dfe552f62f4fd5eb", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "GerDaLIR", - "hf_hub_name": "jinaai/ger_da_lir", - "description": ( - "GerDaLIR is a legal information retrieval dataset created from the Open Legal Data platform." 
- ), - "reference": "https://github.com/lavis-nlp/GerDaLIR", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["de"], - "main_score": "ndcg_at_10", - "revision": "0bb47f1d73827e96964edb84dfe552f62f4fd5eb", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return query_rows = datasets.load_dataset( - self.description["hf_hub_name"], + self.metadata_dict["hf_hub_name"], "queries", - revision=self.description.get("revision", None), + revision=self.metadata_dict.get("revision", None), split=self._EVAL_SPLIT, ) corpus_rows = datasets.load_dataset( - self.description["hf_hub_name"], + self.metadata_dict["hf_hub_name"], "corpus", - revision=self.description.get("revision", None), + revision=self.metadata_dict.get("revision", None), split=self._EVAL_SPLIT, ) qrels_rows = datasets.load_dataset( - self.description["hf_hub_name"], + self.metadata_dict["hf_hub_name"], "qrels", - revision=self.description.get("revision", None), + revision=self.metadata_dict.get("revision", None), split=self._EVAL_SPLIT, ) diff --git a/mteb/tasks/Retrieval/de/GermanDPRRetrieval.py b/mteb/tasks/Retrieval/de/GermanDPRRetrieval.py index 3a15bcbaa3..24b1c02abb 100644 --- a/mteb/tasks/Retrieval/de/GermanDPRRetrieval.py +++ b/mteb/tasks/Retrieval/de/GermanDPRRetrieval.py @@ -1,5 +1,9 @@ +from __future__ import annotations + import datasets +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval @@ -7,29 +11,40 @@ class GermanDPR(AbsTaskRetrieval): _EVAL_SPLIT = "test" _LANGUAGE = "de" + metadata = TaskMetadata( + name="GermanDPR", + description="GermanDPR is a German Question Answering dataset for open-domain QA. It associates questions with a textual context containing the answer", + reference="https://www.deepset.ai/germanquad", + hf_hub_name="deepset/germandpr", + type="Retrieval", + category="s2p", + eval_splits=[_EVAL_SPLIT], + eval_langs=[_LANGUAGE], + main_score="ndcg_at_10", + revision="5129d02422a66be600ac89cd3e8531b4f97d347d", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "GermanDPR", - "hf_hub_name": "deepset/germandpr", - "description": "GermanDPR is a German Question Answering dataset for open-domain QA. 
It associates " - "questions with a textual context containing the answer", - "reference": "https://www.deepset.ai/germanquad", - "type": "Retrieval", - "category": "s2p", - "eval_splits": [self._EVAL_SPLIT], - "eval_langs": [self._LANGUAGE], - "main_score": "ndcg_at_10", - "revision": "5129d02422a66be600ac89cd3e8531b4f97d347d", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) @staticmethod def _format_documents(docs, id_prefix="", existing_docs=None): if existing_docs is None: existing_docs = dict() result = {} - for i, (title, content) in enumerate(zip(docs['title'], docs['text'])): - formatted_content = content.split('==\n')[-1].replace('\n', ' ').lstrip() + for i, (title, content) in enumerate(zip(docs["title"], docs["text"])): + formatted_content = content.split("==\n")[-1].replace("\n", " ").lstrip() if formatted_content in existing_docs: id_value = existing_docs[formatted_content] else: @@ -43,19 +58,25 @@ def load_data(self, **kwargs): return data = datasets.load_dataset( - self.description["hf_hub_name"], revision=self.description.get("revision", None), split=self._EVAL_SPLIT + self.metadata_dict["hf_hub_name"], + revision=self.metadata_dict.get("revision", None), + split=self._EVAL_SPLIT, ) corpus = dict() queries = dict() relevant_docs = dict() all_docs = dict() for i, row in enumerate(data): - q_id = f'q_{i}' - queries[q_id] = row['question'] - pos_docs = self._format_documents(row['positive_ctxs'], id_prefix=f"doc_{i}_p_", existing_docs=all_docs) + q_id = f"q_{i}" + queries[q_id] = row["question"] + pos_docs = self._format_documents( + row["positive_ctxs"], id_prefix=f"doc_{i}_p_", existing_docs=all_docs + ) corpus.update(pos_docs) neg_docs = self._format_documents( - row['hard_negative_ctxs'], id_prefix=f"doc_{i}_n_", existing_docs=all_docs + row["hard_negative_ctxs"], + id_prefix=f"doc_{i}_n_", + existing_docs=all_docs, ) corpus.update(neg_docs) relevant_docs[q_id] = {k: 1 for k in pos_docs} diff --git a/mteb/tasks/Retrieval/de/GermanQuADRetrieval.py b/mteb/tasks/Retrieval/de/GermanQuADRetrieval.py index 64b7379b73..c0d0fcb315 100644 --- a/mteb/tasks/Retrieval/de/GermanQuADRetrieval.py +++ b/mteb/tasks/Retrieval/de/GermanQuADRetrieval.py @@ -1,47 +1,65 @@ +from __future__ import annotations + from collections import defaultdict -from datasets import load_dataset, DatasetDict + +from datasets import DatasetDict, load_dataset + +from mteb.abstasks.TaskMetadata import TaskMetadata from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval def load_retrieval_data(hf_hub_name, eval_splits): eval_split = eval_splits[0] - corpus_dataset = load_dataset(hf_hub_name, 'corpus') - queries_dataset = load_dataset(hf_hub_name, 'queries') - qrels = load_dataset(hf_hub_name + '-qrels')[eval_split] + corpus_dataset = load_dataset(hf_hub_name, "corpus") + queries_dataset = load_dataset(hf_hub_name, "queries") + qrels = load_dataset(hf_hub_name + "-qrels")[eval_split] - corpus = {e['_id']: {'text': e['text']} for e in corpus_dataset['corpus']} - queries = {e['_id']: e['text'] for e in queries_dataset['queries']} + corpus = {e["_id"]: {"text": e["text"]} for e in corpus_dataset["corpus"]} + queries = {e["_id"]: e["text"] for e in queries_dataset["queries"]} relevant_docs = defaultdict(dict) for e in qrels: - relevant_docs[e['query-id']][e['corpus-id']] = e['score'] + relevant_docs[e["query-id"]][e["corpus-id"]] = e["score"] - corpus = DatasetDict({eval_split:corpus}) - queries = DatasetDict({eval_split:queries}) - relevant_docs = 
DatasetDict({eval_split:relevant_docs}) + corpus = DatasetDict({eval_split: corpus}) + queries = DatasetDict({eval_split: queries}) + relevant_docs = DatasetDict({eval_split: relevant_docs}) return corpus, queries, relevant_docs + class GermanQuADRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="GermanQuAD-Retrieval", + description="Context Retrieval for German Question Answering", + reference="https://www.deepset.ai/germanquad", + hf_hub_name="mteb/germanquad-retrieval", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["de"], + main_score="mrr_at_5", + revision="f5c87ae5a2e7a5106606314eef45255f03151bb3", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) @property - def description(self): - return { - "name": "GermanQuAD-Retrieval", - "hf_hub_name": "mteb/germanquad-retrieval", - "description": "Context Retrieval for German Question Answering", - "reference": "https://www.deepset.ai/germanquad", - "type": "Retrieval", - "category": "p2p", - "eval_splits": ["test"], - "eval_langs": ["de"], - "main_score": "mrr_at_10", - "revision": "f5c87ae5a2e7a5106606314eef45255f03151bb3", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return - self.corpus, self.queries, self.relevant_docs = load_retrieval_data(self.description['hf_hub_name'], - self.description['eval_splits']) + self.corpus, self.queries, self.relevant_docs = load_retrieval_data( + self.metadata_dict["hf_hub_name"], self.metadata_dict["eval_splits"] + ) self.data_loaded = True diff --git a/mteb/tasks/Retrieval/en/ArguAnaRetrieval.py b/mteb/tasks/Retrieval/en/ArguAnaRetrieval.py index 6e8e0f9114..45fab75d30 100644 --- a/mteb/tasks/Retrieval/en/ArguAnaRetrieval.py +++ b/mteb/tasks/Retrieval/en/ArguAnaRetrieval.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class ArguAna(AbsTaskRetrieval): + metadata = TaskMetadata( + name="ArguAna", + description="ArguAna: Retrieval of the best counterargument to a given argument", + reference="http://argumentation.bplaced.net/arguana/data", + hf_hub_name="mteb/arguana", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="c22ab2a51041ffd869aaddef7af8d8215647e41a", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "ArguAna", - "hf_hub_name": "mteb/arguana", - "description": "NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval", - "reference": "http://argumentation.bplaced.net/arguana/data", - "type": "Retrieval", - "category": "p2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "c22ab2a51041ffd869aaddef7af8d8215647e41a", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/CQADupstackAndroidRetrieval.py b/mteb/tasks/Retrieval/en/CQADupstackAndroidRetrieval.py index e8c31fc21c..3df82c7000 100644 --- a/mteb/tasks/Retrieval/en/CQADupstackAndroidRetrieval.py +++ 
b/mteb/tasks/Retrieval/en/CQADupstackAndroidRetrieval.py @@ -1,9 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class CQADupstackAndroidRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackAndroidRetrieval", + description="CQADupStack: A Benchmark Data Set for Community Question-Answering Research", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + hf_hub_name="mteb/cqadupstack-android", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="f46a197baaae43b4f621051089b82a364682dfeb", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) - return { - "name": "CQADupstackAndroidRetrieval", - "hf_hub_name": "mteb/cqadupstack-android", @@ -14,5 +42,5 @@ def description(self): - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "f46a197baaae43b4f621051089b82a364682dfeb" - } diff --git a/mteb/tasks/Retrieval/en/CQADupstackEnglishRetrieval.py b/mteb/tasks/Retrieval/en/CQADupstackEnglishRetrieval.py index c70c2777d4..34e7d885af 100644 --- a/mteb/tasks/Retrieval/en/CQADupstackEnglishRetrieval.py +++ b/mteb/tasks/Retrieval/en/CQADupstackEnglishRetrieval.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class CQADupstackEnglishRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackEnglishRetrieval", + description="CQADupStack: A Benchmark Data Set for Community Question-Answering Research", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + hf_hub_name="mteb/cqadupstack-english", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="ad9991cb51e31e31e430383c75ffb2885547b5f0", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "CQADupstackEnglishRetrieval", - "hf_hub_name": "mteb/cqadupstack-english", - "description": "CQADupStack: A Benchmark Data Set for Community Question-Answering Research", - "reference": "http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "ad9991cb51e31e31e430383c75ffb2885547b5f0", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/CQADupstackGamingRetrieval.py b/mteb/tasks/Retrieval/en/CQADupstackGamingRetrieval.py index 31cba6ca3f..c5e0086f11 100644 --- a/mteb/tasks/Retrieval/en/CQADupstackGamingRetrieval.py +++ b/mteb/tasks/Retrieval/en/CQADupstackGamingRetrieval.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class 
CQADupstackGamingRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackGamingRetrieval", + description="CQADupStack: A Benchmark Data Set for Community Question-Answering Research", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + hf_hub_name="mteb/cqadupstack-gaming", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="4885aa143210c98657558c04aaf3dc47cfb54340", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "CQADupstackGamingRetrieval", - "hf_hub_name": "mteb/cqadupstack-gaming", - "description": "CQADupStack: A Benchmark Data Set for Community Question-Answering Research", - "reference": "http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "4885aa143210c98657558c04aaf3dc47cfb54340", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/CQADupstackGisRetrieval.py b/mteb/tasks/Retrieval/en/CQADupstackGisRetrieval.py index 6f7a0b5297..5a689ba195 100644 --- a/mteb/tasks/Retrieval/en/CQADupstackGisRetrieval.py +++ b/mteb/tasks/Retrieval/en/CQADupstackGisRetrieval.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class CQADupstackGisRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackGisRetrieval", + description="CQADupStack: A Benchmark Data Set for Community Question-Answering Research", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + hf_hub_name="mteb/cqadupstack-gis", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="5003b3064772da1887988e05400cf3806fe491f2", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "CQADupstackGisRetrieval", - "hf_hub_name": "mteb/cqadupstack-gis", - "description": "CQADupStack: A Benchmark Data Set for Community Question-Answering Research", - "reference": "http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "5003b3064772da1887988e05400cf3806fe491f2", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/CQADupstackMathematicaRetrieval.py b/mteb/tasks/Retrieval/en/CQADupstackMathematicaRetrieval.py index 4a174dc646..09fee8d394 100644 --- a/mteb/tasks/Retrieval/en/CQADupstackMathematicaRetrieval.py +++ b/mteb/tasks/Retrieval/en/CQADupstackMathematicaRetrieval.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class CQADupstackMathematicaRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackMathematicaRetrieval", + description="CQADupStack: A Benchmark Data Set 
for Community Question-Answering Research", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + hf_hub_name="mteb/cqadupstack-mathematica", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="90fceea13679c63fe563ded68f3b6f06e50061de", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "CQADupstackMathematicaRetrieval", - "hf_hub_name": "mteb/cqadupstack-mathematica", - "description": "CQADupStack: A Benchmark Data Set for Community Question-Answering Research", - "reference": "http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "90fceea13679c63fe563ded68f3b6f06e50061de", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/CQADupstackPhysicsRetrieval.py b/mteb/tasks/Retrieval/en/CQADupstackPhysicsRetrieval.py index 6439c8bf2a..d85ccef17d 100644 --- a/mteb/tasks/Retrieval/en/CQADupstackPhysicsRetrieval.py +++ b/mteb/tasks/Retrieval/en/CQADupstackPhysicsRetrieval.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class CQADupstackPhysicsRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackPhysicsRetrieval", + description="CQADupStack: A Benchmark Data Set for Community Question-Answering Research", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + hf_hub_name="mteb/cqadupstack-physics", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="79531abbd1fb92d06c6d6315a0cbbbf5bb247ea4", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "CQADupstackPhysicsRetrieval", - "hf_hub_name": "mteb/cqadupstack-physics", - "description": "CQADupStack: A Benchmark Data Set for Community Question-Answering Research", - "reference": "http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "79531abbd1fb92d06c6d6315a0cbbbf5bb247ea4", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/CQADupstackProgrammersRetrieval.py b/mteb/tasks/Retrieval/en/CQADupstackProgrammersRetrieval.py index b6d3ba164c..9a058bb18e 100644 --- a/mteb/tasks/Retrieval/en/CQADupstackProgrammersRetrieval.py +++ b/mteb/tasks/Retrieval/en/CQADupstackProgrammersRetrieval.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class CQADupstackProgrammersRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackProgrammersRetrieval", + description="CQADupStack: A Benchmark Data Set for Community Question-Answering Research", + 
reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + hf_hub_name="mteb/cqadupstack-programmers", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="6184bc1440d2dbc7612be22b50686b8826d22b32", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "CQADupstackProgrammersRetrieval", - "hf_hub_name": "mteb/cqadupstack-programmers", - "description": "CQADupStack: A Benchmark Data Set for Community Question-Answering Research", - "reference": "http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "6184bc1440d2dbc7612be22b50686b8826d22b32", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/CQADupstackStatsRetrieval.py b/mteb/tasks/Retrieval/en/CQADupstackStatsRetrieval.py index 43bf2a0e0d..ef72e65345 100644 --- a/mteb/tasks/Retrieval/en/CQADupstackStatsRetrieval.py +++ b/mteb/tasks/Retrieval/en/CQADupstackStatsRetrieval.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class CQADupstackStatsRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackStatsRetrieval", + description="CQADupStack: A Benchmark Data Set for Community Question-Answering Research", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + hf_hub_name="mteb/cqadupstack-stats", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="65ac3a16b8e91f9cee4c9828cc7c335575432a2a", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "CQADupstackStatsRetrieval", - "hf_hub_name": "mteb/cqadupstack-stats", - "description": "CQADupStack: A Benchmark Data Set for Community Question-Answering Research", - "reference": "http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "65ac3a16b8e91f9cee4c9828cc7c335575432a2a", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/CQADupstackTexRetrieval.py b/mteb/tasks/Retrieval/en/CQADupstackTexRetrieval.py index 543f08fa55..e3887ea231 100644 --- a/mteb/tasks/Retrieval/en/CQADupstackTexRetrieval.py +++ b/mteb/tasks/Retrieval/en/CQADupstackTexRetrieval.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class CQADupstackTexRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackTexRetrieval", + description="CQADupStack: A Benchmark Data Set for Community Question-Answering Research", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + hf_hub_name="mteb/cqadupstack-tex", + type="Retrieval", + category="s2p", + eval_splits=["test"], + 
eval_langs=["en"], + main_score="ndcg_at_10", + revision="6c6430d3a6d36f8d2a829195bc5dc94d7e063e53", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "CQADupstackTexRetrieval", - "hf_hub_name": "mteb/cqadupstack-tex", - "description": "CQADupStack: A Benchmark Data Set for Community Question-Answering Research", - "reference": "http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "46989137a86843e03a6195de44b09deda022eec7", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/CQADupstackUnixRetrieval.py b/mteb/tasks/Retrieval/en/CQADupstackUnixRetrieval.py index a808b4c4eb..9765909dc0 100644 --- a/mteb/tasks/Retrieval/en/CQADupstackUnixRetrieval.py +++ b/mteb/tasks/Retrieval/en/CQADupstackUnixRetrieval.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class CQADupstackUnixRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackUnixRetrieval", + description="CQADupStack: A Benchmark Data Set for Community Question-Answering Research", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + hf_hub_name="mteb/cqadupstack-unix", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="6c6430d3a6d36f8d2a829195bc5dc94d7e063e53", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "CQADupstackUnixRetrieval", - "hf_hub_name": "mteb/cqadupstack-unix", - "description": "CQADupStack: A Benchmark Data Set for Community Question-Answering Research", - "reference": "http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "6c6430d3a6d36f8d2a829195bc5dc94d7e063e53", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/CQADupstackWebmastersRetrieval.py b/mteb/tasks/Retrieval/en/CQADupstackWebmastersRetrieval.py index f931022740..c8966d06a5 100644 --- a/mteb/tasks/Retrieval/en/CQADupstackWebmastersRetrieval.py +++ b/mteb/tasks/Retrieval/en/CQADupstackWebmastersRetrieval.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class CQADupstackWebmastersRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackWebmastersRetrieval", + description="CQADupStack: A Benchmark Data Set for Community Question-Answering Research", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + hf_hub_name="mteb/cqadupstack-webmasters", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="160c094312a0e1facb97e55eeddb698c0abe3571", + date=None, + form=None, + domains=None, + 
task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "CQADupstackWebmastersRetrieval", - "hf_hub_name": "mteb/cqadupstack-webmasters", - "description": "CQADupStack: A Benchmark Data Set for Community Question-Answering Research", - "reference": "http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "160c094312a0e1facb97e55eeddb698c0abe3571", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/CQADupstackWordpressRetrieval.py b/mteb/tasks/Retrieval/en/CQADupstackWordpressRetrieval.py index 071817971c..2db5d7abd7 100644 --- a/mteb/tasks/Retrieval/en/CQADupstackWordpressRetrieval.py +++ b/mteb/tasks/Retrieval/en/CQADupstackWordpressRetrieval.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class CQADupstackWordpressRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackWordpressRetrieval", + hf_hub_name="mteb/cqadupstack-wordpress", + description="CQADupStack: A Benchmark Data Set for Community Question-Answering Research", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="4ffe81d471b1924886b33c7567bfb200e9eec5c4", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "CQADupstackWordpressRetrieval", - "hf_hub_name": "mteb/cqadupstack-wordpress", - "description": "CQADupStack: A Benchmark Data Set for Community Question-Answering Research", - "reference": "http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "4ffe81d471b1924886b33c7567bfb200e9eec5c4", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/ClimateFEVERRetrieval.py b/mteb/tasks/Retrieval/en/ClimateFEVERRetrieval.py index b5e907f784..e67e467c7e 100644 --- a/mteb/tasks/Retrieval/en/ClimateFEVERRetrieval.py +++ b/mteb/tasks/Retrieval/en/ClimateFEVERRetrieval.py @@ -1,21 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class ClimateFEVER(AbsTaskRetrieval): + metadata = TaskMetadata( + name="ClimateFEVER", + description="CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate-change. 
", + reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html", + hf_hub_name="mteb/climate-fever", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="47f2ac6acb640fc46020b02a5b59fdda04d39380", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "ClimateFEVER", - "hf_hub_name": "mteb/climate-fever", - "description": ( - "CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims" - " regarding climate-change." - ), - "reference": "https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "47f2ac6acb640fc46020b02a5b59fdda04d39380", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/DBPediaRetrieval.py b/mteb/tasks/Retrieval/en/DBPediaRetrieval.py index 4fbd98afa7..ca4a39e34a 100644 --- a/mteb/tasks/Retrieval/en/DBPediaRetrieval.py +++ b/mteb/tasks/Retrieval/en/DBPediaRetrieval.py @@ -1,20 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class DBPedia(AbsTaskRetrieval): + metadata = TaskMetadata( + name="DBPedia", + description="DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base", + reference="https://github.com/iai-group/DBpedia-Entity/", + hf_hub_name="mteb/dbpedia", + type="Retrieval", + category="s2p", + eval_splits=["dev", "test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="c0f706b76e590d620bd6618b3ca8efdd34e2d659", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "DBPedia", - "hf_hub_name": "mteb/dbpedia", - "description": ( - "DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base" - ), - "reference": "https://github.com/iai-group/DBpedia-Entity/", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["dev", "test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "c0f706b76e590d620bd6618b3ca8efdd34e2d659", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/FEVERRetrieval.py b/mteb/tasks/Retrieval/en/FEVERRetrieval.py index 865c4a376c..9157d8f26b 100644 --- a/mteb/tasks/Retrieval/en/FEVERRetrieval.py +++ b/mteb/tasks/Retrieval/en/FEVERRetrieval.py @@ -1,22 +1,38 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class FEVER(AbsTaskRetrieval): + metadata = TaskMetadata( + name="FEVER", + hf_hub_name="mteb/fever", + description=( + "FEVER (Fact Extraction and VERification) consists of 185,445 claims generated by altering sentences" + " extracted from Wikipedia and subsequently verified without knowledge of the sentence they were" + " derived from." 
+        ),
+        reference="https://fever.ai/",
+        type="Retrieval",
+        category="s2p",
+        eval_splits=["train", "dev", "test"],
+        eval_langs=["en"],
+        main_score="ndcg_at_10",
+        revision="bea83ef9e8fb933d90a2f1d5515737465d613e12",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "FEVER",
-            "hf_hub_name": "mteb/fever",
-            "description": (
-                "FEVER (Fact Extraction and VERification) consists of 185,445 claims generated by altering sentences"
-                " extracted from Wikipedia and subsequently verified without knowledge of the sentence they were"
-                " derived from."
-            ),
-            "reference": "https://fever.ai/",
-            "type": "Retrieval",
-            "category": "s2p",
-            "eval_splits": ["train", "dev", "test"],
-            "eval_langs": ["en"],
-            "main_score": "ndcg_at_10",
-            "revision": "bea83ef9e8fb933d90a2f1d5515737465d613e12",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Retrieval/en/FiQA2018Retrieval.py b/mteb/tasks/Retrieval/en/FiQA2018Retrieval.py
index 0d33415127..570df54079 100644
--- a/mteb/tasks/Retrieval/en/FiQA2018Retrieval.py
+++ b/mteb/tasks/Retrieval/en/FiQA2018Retrieval.py
@@ -1,18 +1,34 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval
 
 
 class FiQA2018(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="FiQA2018",
+        description="Financial Opinion Mining and Question Answering",
+        reference="https://sites.google.com/view/fiqa/",
+        hf_hub_name="mteb/fiqa",
+        type="Retrieval",
+        category="s2p",
+        eval_splits=["train", "dev", "test"],
+        eval_langs=["en"],
+        main_score="ndcg_at_10",
+        revision="27a168819829fe9bcd655c2df245fb19452e8e06",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "FiQA2018",
-            "hf_hub_name": "mteb/fiqa",
-            "description": "Financial Opinion Mining and Question Answering",
-            "reference": "https://sites.google.com/view/fiqa/",
-            "type": "Retrieval",
-            "category": "s2p",
-            "eval_splits": ["train", "dev", "test"],
-            "eval_langs": ["en"],
-            "main_score": "ndcg_at_10",
-            "revision": "27a168819829fe9bcd655c2df245fb19452e8e06",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Retrieval/en/HagridRetrieval.py b/mteb/tasks/Retrieval/en/HagridRetrieval.py
index fe717a9eff..87d7dd5894 100644
--- a/mteb/tasks/Retrieval/en/HagridRetrieval.py
+++ b/mteb/tasks/Retrieval/en/HagridRetrieval.py
@@ -1,31 +1,47 @@
+from __future__ import annotations
+
 import uuid
 from typing import Dict, List
 
 import datasets
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval
 
 
 class HagridRetrieval(AbsTaskRetrieval):
-    _EVAL_SPLITS = ["dev"]
+    metadata = TaskMetadata(
+        name="HagridRetrieval",
+        hf_hub_name="miracl/hagrid",
+        reference="https://github.com/project-miracl/hagrid",
+        description=(
+            "HAGRID (Human-in-the-loop Attributable Generative Retrieval for Information-seeking Dataset)"
+            " is a dataset for generative information-seeking scenarios. It consists of queries"
+            " along with a set of manually labelled relevant passages"
+        ),
+        type="Retrieval",
+        category="s2p",
+        eval_splits=["dev"],
+        eval_langs=["en"],
+        main_score="ndcg_at_10",
+        revision="b2a085913606be3c4f2f1a8bff1810e38bade8fa",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
 
     @property
-    def description(self):
-        return {
-            "name": "HagridRetrieval",
-            "hf_hub_name": "miracl/hagrid",
-            "reference": "https://github.com/project-miracl/hagrid",
-            "description": (
-                "HAGRID (Human-in-the-loop Attributable Generative Retrieval for Information-seeking Dataset)"
-                "is a dataset for generative information-seeking scenarios. It consists of queries"
-                "along with a set of manually labelled relevant passages"
-            ),
-            "type": "Retrieval",
-            "category": "s2p",
-            "eval_splits": self._EVAL_SPLITS,
-            "eval_langs": ["en"],
-            "main_score": "ndcg_at_10",
-            "revision": "b2a085913606be3c4f2f1a8bff1810e38bade8fa",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
 
     def load_data(self, **kwargs):
         """
@@ -35,13 +51,27 @@ def load_data(self, **kwargs):
             return
 
         data = datasets.load_dataset(
-            "miracl/hagrid", split=self._EVAL_SPLITS[0], revision=self.description.get("revision", None)
+            "miracl/hagrid",
+            split=self.metadata.eval_splits[0],
+            revision=self.metadata_dict.get("revision", None),
         )
         proc_data = self.preprocess_data(data)
 
-        self.queries = {self._EVAL_SPLITS[0]: {d["query_id"]: d["query_text"] for d in proc_data}}
-        self.corpus = {self._EVAL_SPLITS[0]: {d["answer_id"]: {"text": d["answer_text"]} for d in proc_data}}
-        self.relevant_docs = {self._EVAL_SPLITS[0]: {d["query_id"]: {d["answer_id"]: 1} for d in proc_data}}
+        self.queries = {
+            self.metadata.eval_splits[0]: {
+                d["query_id"]: d["query_text"] for d in proc_data
+            }
+        }
+        self.corpus = {
+            self.metadata.eval_splits[0]: {
+                d["answer_id"]: {"text": d["answer_text"]} for d in proc_data
+            }
+        }
+        self.relevant_docs = {
+            self.metadata.eval_splits[0]: {
+                d["query_id"]: {d["answer_id"]: 1} for d in proc_data
+            }
+        }
 
         self.data_loaded = True
 
@@ -80,6 +110,10 @@ def get_best_answer(self, data: Dict) -> str:
         PARAMS: data: a dict representing one element of the dataset
         """
-        good_answers = [a["answer"] for a in data["answers"] if a["informative"] == 1 and a["attributable"] == 1]
+        good_answers = [
+            a["answer"]
+            for a in data["answers"]
+            if a["informative"] == 1 and a["attributable"] == 1
+        ]
         # Return 1st one if >=1 good answers else None
         return good_answers[0] if len(good_answers) > 0 else None
diff --git a/mteb/tasks/Retrieval/en/HotpotQARetrieval.py b/mteb/tasks/Retrieval/en/HotpotQARetrieval.py
index 105852e351..380c6f7dbf 100644
--- a/mteb/tasks/Retrieval/en/HotpotQARetrieval.py
+++ b/mteb/tasks/Retrieval/en/HotpotQARetrieval.py
@@ -1,21 +1,37 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval
 
 
 class HotpotQA(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="HotpotQA",
+        hf_hub_name="mteb/hotpotqa",
+        description=(
+            "HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong"
+            " supervision for supporting facts to enable more explainable question answering systems."
+ ), + reference="https://hotpotqa.github.io/", + type="Retrieval", + category="s2p", + eval_splits=["train", "dev", "test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="ab518f4d6fcca38d87c25209f94beba119d02014", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "HotpotQA", - "hf_hub_name": "mteb/hotpotqa", - "description": ( - "HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong" - " supervision for supporting facts to enable more explainable question answering systems." - ), - "reference": "https://hotpotqa.github.io/", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["train", "dev", "test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "ab518f4d6fcca38d87c25209f94beba119d02014", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/MSMARCORetrieval.py b/mteb/tasks/Retrieval/en/MSMARCORetrieval.py index cd5d96e3ec..99b30bdfed 100644 --- a/mteb/tasks/Retrieval/en/MSMARCORetrieval.py +++ b/mteb/tasks/Retrieval/en/MSMARCORetrieval.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class MSMARCO(AbsTaskRetrieval): + metadata = TaskMetadata( + name="MSMARCO", + hf_hub_name="mteb/msmarco", + description="MS MARCO is a collection of datasets focused on deep learning in search", + reference="https://microsoft.github.io/msmarco/", + type="Retrieval", + category="s2p", + eval_splits=["train", "dev", "test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="c5a29a104738b98a9e76336939199e264163d4a0", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "MSMARCO", - "hf_hub_name": "mteb/msmarco", - "description": "MS MARCO is a collection of datasets focused on deep learning in search", - "reference": "https://microsoft.github.io/msmarco/", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["train", "dev", "test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "c5a29a104738b98a9e76336939199e264163d4a0", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/MSMARCOv2Retrieval.py b/mteb/tasks/Retrieval/en/MSMARCOv2Retrieval.py index d800358c52..205e94327b 100644 --- a/mteb/tasks/Retrieval/en/MSMARCOv2Retrieval.py +++ b/mteb/tasks/Retrieval/en/MSMARCOv2Retrieval.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class MSMARCOv2(AbsTaskRetrieval): + metadata = TaskMetadata( + name="MSMARCOv2", + hf_hub_name="mteb/msmarco-v2", + description="MS MARCO is a collection of datasets focused on deep learning in search", + reference="https://microsoft.github.io/msmarco/TREC-Deep-Learning.html", + type="Retrieval", + category="s2p", + eval_splits=["train", "dev", "dev2"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="b1663124850d305ab7c470bb0548acf8e2e7ea43", + date=None, + form=None, + 
domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "MSMARCOv2",
-            "hf_hub_name": "mteb/msmarco-v2",
-            "description": "MS MARCO is a collection of datasets focused on deep learning in search",
-            "reference": "https://microsoft.github.io/msmarco/TREC-Deep-Learning.html",
-            "type": "Retrieval",
-            "category": "s2p",
-            "eval_splits": ["train", "dev", "dev2"],
-            "eval_langs": ["en"],
-            "main_score": "ndcg_at_10",
-            "revision": "b1663124850d305ab7c470bb0548acf8e2e7ea43",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Retrieval/en/NFCorpusRetrieval.py b/mteb/tasks/Retrieval/en/NFCorpusRetrieval.py
index 2567513259..7f1dbe1a80 100644
--- a/mteb/tasks/Retrieval/en/NFCorpusRetrieval.py
+++ b/mteb/tasks/Retrieval/en/NFCorpusRetrieval.py
@@ -1,18 +1,34 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval
 
 
 class NFCorpus(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="NFCorpus",
+        hf_hub_name="mteb/nfcorpus",
+        description="NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval",
+        reference="https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/",
+        type="Retrieval",
+        category="s2p",
+        eval_splits=["test"],
+        eval_langs=["en"],
+        main_score="ndcg_at_10",
+        revision="ec0fa4fe99da2ff19ca1214b7966684033a58814",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "NFCorpus",
-            "hf_hub_name": "mteb/nfcorpus",
-            "description": "NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval",
-            "reference": "https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/",
-            "type": "Retrieval",
-            "category": "s2p",
-            "eval_splits": ["test"],
-            "eval_langs": ["en"],
-            "main_score": "ndcg_at_10",
-            "revision": "ec0fa4fe99da2ff19ca1214b7966684033a58814",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Retrieval/en/NQRetrieval.py b/mteb/tasks/Retrieval/en/NQRetrieval.py
index dc9b81f565..5336d0dfa6 100644
--- a/mteb/tasks/Retrieval/en/NQRetrieval.py
+++ b/mteb/tasks/Retrieval/en/NQRetrieval.py
@@ -1,18 +1,34 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval
 
 
 class NQ(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="NQ",
+        hf_hub_name="mteb/nq",
+        description="Natural Questions: A Benchmark for Question Answering Research",
+        reference="https://ai.google.com/research/NaturalQuestions/",
+        type="Retrieval",
+        category="s2p",
+        eval_splits=["test"],
+        eval_langs=["en"],
+        main_score="ndcg_at_10",
+        revision="b774495ed302d8c44a3a7ea25c90dbce03968f31",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "NQ",
-            "hf_hub_name": "mteb/nq",
-            "description": "NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval",
-            "reference": 
"https://ai.google.com/research/NaturalQuestions/", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "b774495ed302d8c44a3a7ea25c90dbce03968f31", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/NarrativeQARetrieval.py b/mteb/tasks/Retrieval/en/NarrativeQARetrieval.py index c046f6cb1f..9a29847c83 100644 --- a/mteb/tasks/Retrieval/en/NarrativeQARetrieval.py +++ b/mteb/tasks/Retrieval/en/NarrativeQARetrieval.py @@ -1,35 +1,68 @@ +from __future__ import annotations + import datasets + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class NarrativeQARetrieval(AbsTaskRetrieval): + _EVAL_SPLIT = "test" - _EVAL_SPLIT = 'test' + metadata = TaskMetadata( + name="NarrativeQARetrieval", + hf_hub_name="narrativeqa", + reference="https://metatext.io/datasets/narrativeqa", + description=( + "NarrativeQA is a dataset for the task of question answering on long narratives. It consists of " + "realistic QA instances collected from literature (fiction and non-fiction) and movie scripts. " + ), + type="Retrieval", + category="s2p", + eval_splits=[_EVAL_SPLIT], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="2e643e7363944af1c33a652d1c87320d0871c4e4", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) @property - def description(self): - return { - 'name': 'NarrativeQARetrieval', - 'hf_hub_name': 'narrativeqa', - 'reference': 'https://metatext.io/datasets/narrativeqa', - "description": ( - "NarrativeQA is a dataset for the task of question answering on long narratives. It consists of " - "realistic QA instances collected from literature (fiction and non-fiction) and movie scripts. 
" - ), - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) + return {} def load_data(self, **kwargs): if self.data_loaded: return - data = datasets.load_dataset(self.description['hf_hub_name'], split=self._EVAL_SPLIT) - self.queries = {self._EVAL_SPLIT: {str(i): row['question']['text'] for i, row in enumerate(data)}} - self.corpus = {self._EVAL_SPLIT: {str(row['document']['id']): {'text': row['document']['text']} for row in data}} - self.relevant_docs = {self._EVAL_SPLIT: {str(i): {row['document']['id']: 1} for i, row in enumerate(data)}} + data = datasets.load_dataset( + self.metadata_dict["hf_hub_name"], split=self._EVAL_SPLIT + ) + self.queries = { + self._EVAL_SPLIT: { + str(i): row["question"]["text"] for i, row in enumerate(data) + } + } + self.corpus = { + self._EVAL_SPLIT: { + str(row["document"]["id"]): {"text": row["document"]["text"]} + for row in data + } + } + self.relevant_docs = { + self._EVAL_SPLIT: { + str(i): {row["document"]["id"]: 1} for i, row in enumerate(data) + } + } self.data_loaded = True diff --git a/mteb/tasks/Retrieval/en/QuoraRetrieval.py b/mteb/tasks/Retrieval/en/QuoraRetrieval.py index 1f5d19c2d6..e1065f9646 100644 --- a/mteb/tasks/Retrieval/en/QuoraRetrieval.py +++ b/mteb/tasks/Retrieval/en/QuoraRetrieval.py @@ -1,20 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class QuoraRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="QuoraRetrieval", + hf_hub_name="mteb/quora", + description=( + "QuoraRetrieval is based on questions that are marked as duplicates on the Quora platform. Given a" + " question, find other (duplicate) questions." + ), + reference="https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs", + type="Retrieval", + category="s2s", + eval_splits=["dev", "test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="0be27e93455051e531182b85e85e425aba12e9d4", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "QuoraRetrieval", - "hf_hub_name": "mteb/quora", - "description": ( - "QuoraRetrieval is based on questions that are marked as duplicates on the Quora platform. Given a" - " question, find other (duplicate) questions." 
- ), - "reference": "https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs", - "type": "Retrieval", - "category": "s2s", - "eval_splits": ["dev", "test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/SCIDOCSRetrieval.py b/mteb/tasks/Retrieval/en/SCIDOCSRetrieval.py index ad52cfd366..2babf0a088 100644 --- a/mteb/tasks/Retrieval/en/SCIDOCSRetrieval.py +++ b/mteb/tasks/Retrieval/en/SCIDOCSRetrieval.py @@ -1,20 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class SCIDOCS(AbsTaskRetrieval): + metadata = TaskMetadata( + name="SCIDOCS", + hf_hub_name="mteb/scidocs", + description=( + "SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation" + " prediction, to document classification and recommendation." + ), + reference="https://allenai.org/data/scidocs", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="56a6d0140cf6356659e2a7c1413286a774468d44", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "SCIDOCS", - "hf_hub_name": "mteb/scidocs", - "description": ( - "SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation" - " prediction, to document classification and recommendation." - ), - "reference": "https://allenai.org/data/scidocs", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/en/SciFactRetrieval.py b/mteb/tasks/Retrieval/en/SciFactRetrieval.py index fc2428d5fc..be2778e269 100644 --- a/mteb/tasks/Retrieval/en/SciFactRetrieval.py +++ b/mteb/tasks/Retrieval/en/SciFactRetrieval.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class SciFact(AbsTaskRetrieval): + metadata = TaskMetadata( + name="SciFact", + hf_hub_name="mteb/scifact", + description="SciFact verifies scientific claims using evidence from the research literature containing scientific paper abstracts.", + reference="https://github.com/allenai/scifact", + type="Retrieval", + category="s2p", + eval_splits=["train", "test"], + eval_langs=["en"], + main_score="ndcg_at_10", + revision="0228b52cf27578f30900b9e5271d331663a030d7", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "SciFact", - "hf_hub_name": "mteb/scifact", - "description": "SciFact verifies scientific claims using evidence from the research literature containing scientific paper abstracts.", - "reference": "https://github.com/allenai/scifact", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["train", "test"], - "eval_langs": ["en"], - "main_score": "ndcg_at_10", - "revision": "0228b52cf27578f30900b9e5271d331663a030d7" - } + def 
metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Retrieval/en/TRECCOVIDRetrieval.py b/mteb/tasks/Retrieval/en/TRECCOVIDRetrieval.py
index 8c6040c894..05e44217d2 100644
--- a/mteb/tasks/Retrieval/en/TRECCOVIDRetrieval.py
+++ b/mteb/tasks/Retrieval/en/TRECCOVIDRetrieval.py
@@ -1,18 +1,34 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval
 
 
 class TRECCOVID(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="TRECCOVID",
+        description="TRECCOVID is an ad-hoc search challenge based on the CORD-19 dataset containing scientific articles related to the COVID-19 pandemic.",
+        reference="https://ir.nist.gov/covidSubmit/index.html",
+        hf_hub_name="mteb/trec-covid",
+        type="Retrieval",
+        category="s2p",
+        eval_splits=["test"],
+        eval_langs=["en"],
+        main_score="ndcg_at_10",
+        revision="1271c7809071a13532e05f25fb53511ffce77117",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "TRECCOVID",
-            "hf_hub_name": "mteb/trec-covid",
-            "description": "TRECCOVID is an ad-hoc search challenge based on the CORD-19 dataset containing scientific articles related to the COVID-19 pandemic",
-            "reference": "https://ir.nist.gov/covidSubmit/index.html",
-            "description": "TRECCOVID is an ad-hoc search challenge based on the CORD-19 dataset containing scientific articles related to the COVID-19 pandemic.",
-            "type": "Retrieval",
-            "category": "s2p",
-            "eval_splits": ["test"],
-            "eval_langs": ["en"],
-            "main_score": "ndcg_at_10",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Retrieval/en/Touche2020Retrieval.py b/mteb/tasks/Retrieval/en/Touche2020Retrieval.py
index 87b7e123d4..43c9c11223 100644
--- a/mteb/tasks/Retrieval/en/Touche2020Retrieval.py
+++ b/mteb/tasks/Retrieval/en/Touche2020Retrieval.py
@@ -1,18 +1,34 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval
 
 
 class Touche2020(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="Touche2020",
+        description="Touché Task 1: Argument Retrieval for Controversial Questions",
+        reference="https://webis.de/events/touche-20/shared-task-1.html",
+        hf_hub_name="mteb/touche2020",
+        type="Retrieval",
+        category="s2p",
+        eval_splits=["test"],
+        eval_langs=["en"],
+        main_score="ndcg_at_10",
+        revision="a34f9a33db75fa0cbb21bb5cfc3dae8dc8bec93f",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "Touche2020",
-            "hf_hub_name": "mteb/touche2020",
-            "description": "Touché Task 1: Argument Retrieval for Controversial Questions",
-            "reference": "https://webis.de/events/touche-20/shared-task-1.html",
-            "type": "Retrieval",
-            "category": "s2p",
-            "eval_splits": ["test"],
-            "eval_langs": ["en"],
-            "main_score": "ndcg_at_10",
-            "revision": "a34f9a33db75fa0cbb21bb5cfc3dae8dc8bec93f",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
diff --git a/mteb/tasks/Retrieval/es/SpanishPassageRetrievalS2P.py
b/mteb/tasks/Retrieval/es/SpanishPassageRetrievalS2P.py index 0de1b289e8..df13958c73 100644 --- a/mteb/tasks/Retrieval/es/SpanishPassageRetrievalS2P.py +++ b/mteb/tasks/Retrieval/es/SpanishPassageRetrievalS2P.py @@ -1,36 +1,69 @@ -from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from __future__ import annotations import datasets +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + class SpanishPassageRetrievalS2P(AbsTaskRetrieval): + metadata = TaskMetadata( + name="SpanishPassageRetrievalS2P", + description="Test collection for passage retrieval from health-related Web resources in Spanish.", + reference="https://mklab.iti.gr/results/spanish-passage-retrieval-dataset/", + hf_hub_name="jinaai/spanish_passage_retrieval", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["es"], + main_score="ndcg_at_10", + revision="9cddf2ce5209ade52c2115ccfa00eb22c6d3a837", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "SpanishPassageRetrievalS2P", - "hf_hub_name": "jinaai/spanish_passage_retrieval", - "description": "Test collection for passage retrieval from health-related Web resources in Spanish.", - "reference": "https://mklab.iti.gr/results/spanish-passage-retrieval-dataset/", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["es"], - "main_score": "ndcg_at_10", - "revision": "9cddf2ce5209ade52c2115ccfa00eb22c6d3a837", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return - query_rows = datasets.load_dataset(self.description["hf_hub_name"], "queries", split='test', trust_remote_code=True) - corpus_rows = datasets.load_dataset(self.description["hf_hub_name"], "corpus.documents", split='test', trust_remote_code=True) - qrels_rows = datasets.load_dataset(self.description["hf_hub_name"], "qrels.s2p", split='test', trust_remote_code=True) + query_rows = datasets.load_dataset( + self.metadata_dict["hf_hub_name"], + "queries", + split="test", + trust_remote_code=True, + ) + corpus_rows = datasets.load_dataset( + self.metadata_dict["hf_hub_name"], + "corpus.documents", + split="test", + trust_remote_code=True, + ) + qrels_rows = datasets.load_dataset( + self.metadata_dict["hf_hub_name"], + "qrels.s2p", + split="test", + trust_remote_code=True, + ) - self.queries = {'test': {row["_id"]: row["text"] for row in query_rows}} - self.corpus = {'test': {row["_id"]: row for row in corpus_rows}} + self.queries = {"test": {row["_id"]: row["text"] for row in query_rows}} + self.corpus = {"test": {row["_id"]: row for row in corpus_rows}} self.relevant_docs = { - 'test': {row["_id"]: {v: 1 for v in row["text"].split(" ")} for row in qrels_rows} + "test": { + row["_id"]: {v: 1 for v in row["text"].split(" ")} for row in qrels_rows + } } self.data_loaded = True diff --git a/mteb/tasks/Retrieval/es/SpanishPassageRetrievalS2S.py b/mteb/tasks/Retrieval/es/SpanishPassageRetrievalS2S.py index fdded5e7d3..70fac5f6b5 100644 --- a/mteb/tasks/Retrieval/es/SpanishPassageRetrievalS2S.py +++ b/mteb/tasks/Retrieval/es/SpanishPassageRetrievalS2S.py @@ -1,36 +1,69 @@ -from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from __future__ import annotations import datasets +from mteb.abstasks.TaskMetadata 
import TaskMetadata
+
+from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval
+
 
 class SpanishPassageRetrievalS2S(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="SpanishPassageRetrievalS2S",
+        description="Test collection for passage retrieval from health-related Web resources in Spanish.",
+        reference="https://mklab.iti.gr/results/spanish-passage-retrieval-dataset/",
+        hf_hub_name="jinaai/spanish_passage_retrieval",
+        type="Retrieval",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["es"],
+        main_score="ndcg_at_10",
+        revision="9cddf2ce5209ade52c2115ccfa00eb22c6d3a837",
+        date=None,
+        form=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        socioeconomic_status=None,
+        annotations_creators=None,
+        dialect=None,
+        text_creation=None,
+        bibtex_citation=None,
+    )
+
     @property
-    def description(self):
-        return {
-            "name": "SpanishPassageRetrievalS2S",
-            "hf_hub_name": "jinaai/spanish_passage_retrieval",
-            "description": "Test collection for passage retrieval from health-related Web resources in Spanish.",
-            "reference": "https://mklab.iti.gr/results/spanish-passage-retrieval-dataset/",
-            "type": "Retrieval",
-            "category": "s2s",
-            "eval_splits": ["test"],
-            "eval_langs": ["es"],
-            "main_score": "ndcg_at_10",
-            "revision": "9cddf2ce5209ade52c2115ccfa00eb22c6d3a837",
-        }
+    def metadata_dict(self) -> dict[str, str]:
+        return dict(self.metadata)
 
     def load_data(self, **kwargs):
         if self.data_loaded:
             return
 
-        query_rows = datasets.load_dataset(self.description["hf_hub_name"], "queries", split='test', trust_remote_code=True)
-        corpus_rows = datasets.load_dataset(self.description["hf_hub_name"], "corpus.sentences", split='test', trust_remote_code=True)
-        qrels_rows = datasets.load_dataset(self.description["hf_hub_name"], "qrels.s2s", split='test', trust_remote_code=True)
+        query_rows = datasets.load_dataset(
+            self.metadata_dict["hf_hub_name"],
+            "queries",
+            split="test",
+            trust_remote_code=True,
+        )
+        corpus_rows = datasets.load_dataset(
+            self.metadata_dict["hf_hub_name"],
+            "corpus.sentences",
+            split="test",
+            trust_remote_code=True,
+        )
+        qrels_rows = datasets.load_dataset(
+            self.metadata_dict["hf_hub_name"],
+            "qrels.s2s",
+            split="test",
+            trust_remote_code=True,
+        )
 
-        self.queries = {'test': {row["_id"]: row["text"] for row in query_rows}}
-        self.corpus = {'test': {row["_id"]: row for row in corpus_rows}}
+        self.queries = {"test": {row["_id"]: row["text"] for row in query_rows}}
+        self.corpus = {"test": {row["_id"]: row for row in corpus_rows}}
         self.relevant_docs = {
-            'test': {row["_id"]: {v: 1 for v in row["text"].split(" ")} for row in qrels_rows}
+            "test": {
+                row["_id"]: {v: 1 for v in row["text"].split(" ")} for row in qrels_rows
+            }
         }
 
         self.data_loaded = True
diff --git a/mteb/tasks/Retrieval/fr/AlloprofRetrieval.py b/mteb/tasks/Retrieval/fr/AlloprofRetrieval.py
index 717c59c04a..027c5c39dd 100644
--- a/mteb/tasks/Retrieval/fr/AlloprofRetrieval.py
+++ b/mteb/tasks/Retrieval/fr/AlloprofRetrieval.py
@@ -1,37 +1,59 @@
+from __future__ import annotations
+
 import datasets
 
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval
 
 
 class AlloprofRetrieval(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="AlloprofRetrieval",
+        description="This dataset was provided by AlloProf, an organisation in Quebec, Canada offering resources and a help forum curated by a large number of teachers to students on all subjects taught in primary and secondary school",
+
reference="https://huggingface.co/datasets/antoinelb7/alloprof", + hf_hub_name="mteb/alloprof", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["fr"], + main_score="ndcg_at_10", + revision="392ba3f5bcc8c51f578786c1fc3dae648662cb9b", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) @property - def description(self): - return { - "name": "AlloprofRetrieval", - "hf_hub_name": "lyon-nlp/alloprof", - "reference": "https://huggingface.co/datasets/antoinelb7/alloprof", - "description": ( - "This dataset was provided by AlloProf, an organisation in Quebec, Canada offering resources and a help forum" - "curated by a large number of teachers to students on all subjects taught from in primary and secondary school" - ), - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["fr"], - "main_score": "ndcg_at_10", - "revision": "392ba3f5bcc8c51f578786c1fc3dae648662cb9b", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return # fetch both subsets of the dataset - corpus_raw = datasets.load_dataset(self.description["hf_hub_name"], "documents") - queries_raw = datasets.load_dataset(self.description["hf_hub_name"], "queries") - eval_split = self.description["eval_splits"][0] - self.queries = {eval_split: {str(q["id"]): q["text"] for q in queries_raw[eval_split]}} - self.corpus = {eval_split: {str(d["uuid"]): {"text": d["text"]} for d in corpus_raw["documents"]}} + corpus_raw = datasets.load_dataset( + self.metadata_dict["hf_hub_name"], "documents" + ) + queries_raw = datasets.load_dataset( + self.metadata_dict["hf_hub_name"], "queries" + ) + eval_split = self.metadata_dict["eval_splits"][0] + self.queries = { + eval_split: {str(q["id"]): q["text"] for q in queries_raw[eval_split]} + } + self.corpus = { + eval_split: { + str(d["uuid"]): {"text": d["text"]} for d in corpus_raw["documents"] + } + } self.relevant_docs = {eval_split: {}} for q in queries_raw[eval_split]: diff --git a/mteb/tasks/Retrieval/fr/BSARDRetrieval.py b/mteb/tasks/Retrieval/fr/BSARDRetrieval.py index 7a136f0ac4..b6ccc411eb 100644 --- a/mteb/tasks/Retrieval/fr/BSARDRetrieval.py +++ b/mteb/tasks/Retrieval/fr/BSARDRetrieval.py @@ -1,59 +1,75 @@ +from __future__ import annotations + import datasets +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class BSARDRetrieval(AbsTaskRetrieval): - _EVAL_SPLITS = ["test"] + metadata = TaskMetadata( + name="BSARDRetrieval", + description="The Belgian Statutory Article Retrieval Dataset (BSARD) is a French native dataset for studying legal information retrieval. 
BSARD consists of more than 22,600 statutory articles from Belgian law and about 1,100 legal questions posed by Belgian citizens and labeled by experienced jurists with relevant articles from the corpus.", + reference="https://huggingface.co/datasets/maastrichtlawtech/bsard", + hf_hub_name="mteb/bsard", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["fr"], + main_score="ndcg_at_100", + revision="5effa1b9b5fa3b0f9e12523e6e43e5f86a6e6d59", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) @property - def description(self): - return { - "name": "BSARDRetrieval", - "hf_hub_name": "maastrichtlawtech/bsard", - "reference": "https://huggingface.co/datasets/maastrichtlawtech/bsard", - "description": ( - "The Belgian Statutory Article Retrieval Dataset (BSARD)" - "is a French native dataset for studying legal information retrieval." - "BSARD consists of more than 22,600 statutory articles from Belgian law" - "and about 1,100 legal questions posed by Belgian citizens and labeled" - "by experienced jurists with relevant articles from the corpus." - ), - "type": "Retrieval", - "category": "s2p", - "eval_splits": self._EVAL_SPLITS, - "eval_langs": ["fr"], - "main_score": "ndcg_at_100", - "revision": "5effa1b9b5fa3b0f9e12523e6e43e5f86a6e6d59", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return # fetch both subsets of the dataset, only test split corpus_raw = datasets.load_dataset( - self.description["hf_hub_name"], - "corpus", split="corpus", - revision=self.description.get("revision", None), + self.metadata_dict["hf_hub_name"], + "corpus", + split="corpus", + revision=self.metadata_dict.get("revision", None), ) queries_raw = datasets.load_dataset( - self.description["hf_hub_name"], + self.metadata_dict["hf_hub_name"], "questions", - split=self._EVAL_SPLITS[0], - revision=self.description.get("revision", None), + split=self.metadata.eval_splits[0], + revision=self.metadata_dict.get("revision", None), ) self.queries = { - self._EVAL_SPLITS[0]: { - str(q["id"]): " ".join((q["question"] + q["extra_description"])) for q in queries_raw + self.metadata.eval_splits[0]: { + str(q["id"]): " ".join((q["question"] + q["extra_description"])) + for q in queries_raw } } - self.corpus = {self._EVAL_SPLITS[0]: {str(d["id"]): {"text": d["article"]} for d in corpus_raw}} + self.corpus = { + self.metadata.eval_splits[0]: { + str(d["id"]): {"text": d["article"]} for d in corpus_raw + } + } - self.relevant_docs = {self._EVAL_SPLITS[0]: {}} + self.relevant_docs = {self.metadata.eval_splits[0]: {}} for q in queries_raw: for doc_id in q["article_ids"]: - self.relevant_docs[self._EVAL_SPLITS[0]][str(q["id"])] = {str(doc_id): 1} + self.relevant_docs[self.metadata.eval_splits[0]][str(q["id"])] = { + str(doc_id): 1 + } self.data_loaded = True diff --git a/mteb/tasks/Retrieval/fr/SyntecRetrieval.py b/mteb/tasks/Retrieval/fr/SyntecRetrieval.py index aa0ff769aa..e41679185e 100644 --- a/mteb/tasks/Retrieval/fr/SyntecRetrieval.py +++ b/mteb/tasks/Retrieval/fr/SyntecRetrieval.py @@ -1,52 +1,70 @@ +from __future__ import annotations + import datasets +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class SyntecRetrieval(AbsTaskRetrieval): _EVAL_SPLITS = ["test"] + metadata = TaskMetadata( + 
name="SyntecRetrieval", + description="This dataset has been built from the Syntec Collective bargaining agreement.", + reference="https://huggingface.co/datasets/lyon-nlp/mteb-fr-retrieval-syntec-s2p", + hf_hub_name="lyon-nlp/mteb-fr-retrieval-syntec-s2p", + type="Retrieval", + category="s2p", + eval_splits=_EVAL_SPLITS, + eval_langs=["fr"], + main_score="map", + revision="b205c5084a0934ce8af14338bf03feb19499c84d", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) @property - def description(self): - return { - "name": "SyntecRetrieval", - "hf_hub_name": "lyon-nlp/mteb-fr-retrieval-syntec-s2p", - "reference": "https://huggingface.co/datasets/lyon-nlp/mteb-fr-retrieval-syntec-s2p", - "description": ( - "This dataset has been built from the Syntec Collective bargaining agreement." - "It maps a question to an article from the agreement" - ), - "type": "Retrieval", - "category": "s2p", - "eval_splits": self._EVAL_SPLITS, - "eval_langs": ["fr"], - "main_score": "ndcg_at_5", - "revision": "77f7e271bf4a92b24fce5119f3486b583ca016ff", - } - + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return # fetch both subsets of the dataset - corpus_raw = datasets.load_dataset(self.description["hf_hub_name"], "documents") - queries_raw = datasets.load_dataset(self.description["hf_hub_name"], "queries") + corpus_raw = datasets.load_dataset( + self.metadata_dict["hf_hub_name"], "documents" + ) + queries_raw = datasets.load_dataset( + self.metadata_dict["hf_hub_name"], "queries" + ) self.queries = { self._EVAL_SPLITS[0]: { - str(i): q["Question"] - for i, q in enumerate(queries_raw["queries"])} + str(i): q["Question"] for i, q in enumerate(queries_raw["queries"]) } + } corpus_raw = corpus_raw["documents"] corpus_raw = corpus_raw.rename_column("content", "text") - self.corpus = {self._EVAL_SPLITS[0]: {str(row["id"]): row for row in corpus_raw}} + self.corpus = { + self._EVAL_SPLITS[0]: {str(row["id"]): row for row in corpus_raw} + } self.relevant_docs = { self._EVAL_SPLITS[0]: { - str(i) : {str(q["Article"]): 1} + str(i): {str(q["Article"]): 1} for i, q in enumerate(queries_raw["queries"]) - }} + } + } self.data_loaded = True diff --git a/mteb/tasks/Retrieval/ko/KoMiracl.py b/mteb/tasks/Retrieval/ko/KoMiracl.py index 420ea68c20..901af78d43 100644 --- a/mteb/tasks/Retrieval/ko/KoMiracl.py +++ b/mteb/tasks/Retrieval/ko/KoMiracl.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class KoMiracl(AbsTaskRetrieval): + metadata = TaskMetadata( + name="Ko-miracl", + description="Ko-miracl", + reference=None, + hf_hub_name="taeminlee/Ko-miracl", + type="Retrieval", + category="s2p", + eval_splits=["dev"], + eval_langs=["ko"], + main_score="ndcg_at_10", + revision="5c7690518e481375551916f24241048cf7b017d0", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "Ko-miracl", - "hf_hub_name": "taeminlee/Ko-miracl", - "description": "Ko-miracl", - "reference": "", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["dev"], - "eval_langs": ["ko"], - "main_score": 
"ndcg_at_10", - "revision": "5c7690518e481375551916f24241048cf7b017d0", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/ko/KoMrtydi.py b/mteb/tasks/Retrieval/ko/KoMrtydi.py index 03e28621d7..36d9a8d859 100644 --- a/mteb/tasks/Retrieval/ko/KoMrtydi.py +++ b/mteb/tasks/Retrieval/ko/KoMrtydi.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class KoMrtydi(AbsTaskRetrieval): + metadata = TaskMetadata( + name="Ko-mrtydi", + description="Ko-mrtydi", + reference=None, + hf_hub_name="taeminlee/Ko-mrtydi", + type="Retrieval", + category="s2p", + eval_splits=["dev"], + eval_langs=["ko"], + main_score="ndcg_at_10", + revision="71a2e011a42823051a2b4eb303a3366bdbe048d3", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "Ko-mrtydi", - "hf_hub_name": "taeminlee/Ko-mrtydi", - "description": "Ko-mrtydi", - "reference": "", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["dev"], - "eval_langs": ["ko"], - "main_score": "ndcg_at_10", - "revision": "71a2e011a42823051a2b4eb303a3366bdbe048d3", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/ko/KoStrategyQA.py b/mteb/tasks/Retrieval/ko/KoStrategyQA.py index a1a2aaaf4c..7d26281173 100644 --- a/mteb/tasks/Retrieval/ko/KoStrategyQA.py +++ b/mteb/tasks/Retrieval/ko/KoStrategyQA.py @@ -1,18 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class KoStrategyQA(AbsTaskRetrieval): + metadata = TaskMetadata( + name="Ko-StrategyQA", + description="Ko-StrategyQA", + reference=None, + hf_hub_name="taeminlee/Ko-StrategyQA", + type="Retrieval", + category="s2p", + eval_splits=["dev"], + eval_langs=["ko"], + main_score="ndcg_at_10", + revision="d243889a3eb6654029dbd7e7f9319ae31d58f97c", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "Ko-StrategyQA", - "hf_hub_name": "taeminlee/Ko-StrategyQA", - "description": "Ko-StrategyQA", - "reference": "", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["dev"], - "eval_langs": ["ko"], - "main_score": "ndcg_at_10", - "revision": "d243889a3eb6654029dbd7e7f9319ae31d58f97c" - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py b/mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py index 66756557d1..9814dcd72c 100644 --- a/mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py @@ -1,5 +1,9 @@ +from __future__ import annotations + import datasets +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import MultilingualTask from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval @@ -7,7 +11,9 @@ _LANGS = ["de", "es"] -def _load_miracl_data(path: str, langs: list, split: str, cache_dir: str = None, revision: str = None): +def _load_miracl_data( + path: str, langs: list, split: 
str, cache_dir: str = None, revision: str = None +): queries = {lang: {split: {}} for lang in langs} corpus = {lang: {split: {}} for lang in langs} relevant_docs = {lang: {split: {}} for lang in langs} @@ -54,35 +60,43 @@ def _load_miracl_data(path: str, langs: list, split: str, cache_dir: str = None, class MIRACLRetrieval(MultilingualTask, AbsTaskRetrieval): + metadata = TaskMetadata( + name="MIRACLRetrieval", + description="MIRACLRetrieval", + reference=None, + hf_hub_name="jinaai/miracl", + type="Retrieval", + category="s2p", + eval_splits=[_EVAL_SPLIT], + eval_langs=_LANGS, + main_score="ndcg_at_10", + revision="d28a029f35c4ff7f616df47b0edf54e6882395e6", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "MIRACLRetrieval", - "hf_hub_name": "jinaai/miracl", - "reference": "https://project-miracl.github.io/", - "description": ( - "MIRACL (Multilingual Information Retrieval Across a Continuum of Languages) is a multilingual " - "retrieval dataset that focuses on search across 18 different languages. This task focuses on " - "the Spanish subset, using the test set containing 648 queries and 6443 passages." - ), - "type": "Retrieval", - "category": "s2p", - "eval_splits": [_EVAL_SPLIT], - "eval_langs": _LANGS, - "main_score": "ndcg_at_10", - "revision": "d28a029f35c4ff7f616df47b0edf54e6882395e6", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return self.corpus, self.queries, self.relevant_docs = _load_miracl_data( - path=self.description["hf_hub_name"], + path=self.metadata_dict["hf_hub_name"], langs=self.langs, - split=self.description["eval_splits"][0], + split=self.metadata_dict["eval_splits"][0], cache_dir=kwargs.get("cache_dir", None), - revision=self.description["revision"], + revision=self.metadata_dict["revision"], ) self.data_loaded = True diff --git a/mteb/tasks/Retrieval/multilingual/MintakaRetrieval.py b/mteb/tasks/Retrieval/multilingual/MintakaRetrieval.py index acc05ca9ec..9ae739ae09 100644 --- a/mteb/tasks/Retrieval/multilingual/MintakaRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/MintakaRetrieval.py @@ -1,5 +1,9 @@ +from __future__ import annotations + import datasets +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import MultilingualTask from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval @@ -7,7 +11,9 @@ _LANGS = ["ar", "de", "es", "fr", "hi", "it", "ja", "pt"] -def _load_mintaka_data(path: str, langs: list, split: str, cache_dir: str = None, revision: str = None): +def _load_mintaka_data( + path: str, langs: list, split: str, cache_dir: str = None, revision: str = None +): queries = {lang: {split: {}} for lang in langs} corpus = {lang: {split: {}} for lang in langs} relevant_docs = {lang: {split: {}} for lang in langs} @@ -20,7 +26,9 @@ def _load_mintaka_data(path: str, langs: list, split: str, cache_dir: str = None cache_dir=cache_dir, revision=revision, ) - question_ids = {question: _id for _id, question in enumerate(set(data["question"]))} + question_ids = { + question: _id for _id, question in enumerate(set(data["question"])) + } answer_ids = {answer: _id for _id, answer in enumerate(set(data["answer"]))} for row in data: @@ -40,34 +48,45 @@ def _load_mintaka_data(path: str, langs: list, split: str, cache_dir: str = None return corpus, 
queries, relevant_docs + class MintakaRetrieval(MultilingualTask, AbsTaskRetrieval): + metadata = TaskMetadata( + name="MintakaRetrieval", + description="MintakaRetrieval", + reference=None, + hf_hub_name="jinaai/mintakaqa", + type="Retrieval", + category="s2p", + eval_splits=[_EVAL_SPLIT], + eval_langs=_LANGS, + main_score="ndcg_at_10", + revision="efa78cc2f74bbcd21eff2261f9e13aebe40b814e", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "MintakaRetrieval", - "hf_hub_name": "jinaai/mintakaqa", - "reference": "https://github.com/amazon-science/mintaka", - "description": ( - "Mintaka: A Complex, Natural, and Multilingual Dataset for End-to-End Question Answering." - ), - "type": "Retrieval", - "category": "s2s", - "eval_splits": [_EVAL_SPLIT], - "eval_langs": _LANGS, - "main_score": "ndcg_at_10", - "revision": "efa78cc2f74bbcd21eff2261f9e13aebe40b814e" - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return self.corpus, self.queries, self.relevant_docs = _load_mintaka_data( - path=self.description["hf_hub_name"], - langs=self.langs, - split=self.description["eval_splits"][0], + path=self.metadata_dict["hf_hub_name"], + langs=self.metadata.eval_langs, + split=self.metadata_dict["eval_splits"][0], cache_dir=kwargs.get("cache_dir", None), - revision=self.description["revision"], + revision=self.metadata_dict["revision"], ) self.data_loaded = True diff --git a/mteb/tasks/Retrieval/multilingual/MultiLongDocRetrieval.py b/mteb/tasks/Retrieval/multilingual/MultiLongDocRetrieval.py index fd723d7aab..fbdc04b664 100644 --- a/mteb/tasks/Retrieval/multilingual/MultiLongDocRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/MultiLongDocRetrieval.py @@ -1,25 +1,48 @@ +from __future__ import annotations + import datasets -from ....abstasks import MultilingualTask, AbsTaskRetrieval -from ....abstasks.AbsTaskRetrieval import * +from mteb.abstasks.TaskMetadata import TaskMetadata -_LANGUAGES = ['ar', 'de', 'en', 'es', 'fr', 'hi', 'it', 'ja', 'ko', 'pt', 'ru', 'th', 'zh'] +from ....abstasks import AbsTaskRetrieval, MultilingualTask +from ....abstasks.AbsTaskRetrieval import * +_LANGUAGES = [ + "ar", + "de", + "en", + "es", + "fr", + "hi", + "it", + "ja", + "ko", + "pt", + "ru", + "th", + "zh", +] -def load_mldr_data(path: str, langs: list, eval_splits: list, cache_dir: str=None): + +def load_mldr_data(path: str, langs: list, eval_splits: list, cache_dir: str = None): corpus = {lang: {split: None for split in eval_splits} for lang in langs} queries = {lang: {split: None for split in eval_splits} for lang in langs} relevant_docs = {lang: {split: None for split in eval_splits} for lang in langs} - + for lang in langs: - lang_corpus = datasets.load_dataset(path, f'corpus-{lang}', cache_dir=cache_dir)['corpus'] - lang_corpus = {e['docid']: {'text': e['text']} for e in lang_corpus} + lang_corpus = datasets.load_dataset( + path, f"corpus-{lang}", cache_dir=cache_dir + )["corpus"] + lang_corpus = {e["docid"]: {"text": e["text"]} for e in lang_corpus} lang_data = datasets.load_dataset(path, lang, cache_dir=cache_dir) for split in eval_splits: corpus[lang][split] = lang_corpus - queries[lang][split] = {e['query_id']: e['query'] for e in lang_data[split]} - relevant_docs[lang][split] = {e['query_id']: 
{e['positive_passages'][0]['docid']: 1} for e in lang_data[split]} - + queries[lang][split] = {e["query_id"]: e["query"] for e in lang_data[split]} + relevant_docs[lang][split] = { + e["query_id"]: {e["positive_passages"][0]["docid"]: 1} + for e in lang_data[split] + } + corpus = datasets.DatasetDict(corpus) queries = datasets.DatasetDict(queries) relevant_docs = datasets.DatasetDict(relevant_docs) @@ -27,28 +50,49 @@ def load_mldr_data(path: str, langs: list, eval_splits: list, cache_dir: str=Non class MultiLongDocRetrieval(MultilingualTask, AbsTaskRetrieval): + metadata = TaskMetadata( + name="MultiLongDocRetrieval", + description="MultiLongDocRetrieval", + reference="https://arxiv.org/abs/2402.03216", + hf_hub_name="Shitao/MLDR", + type="Retrieval", + category="s2p", + eval_splits=["dev", "test"], + eval_langs=_LANGUAGES, + main_score="ndcg_at_10", + revision="d79af07e969a6678fcbbe819956840425816468f", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation="""@misc{bge-m3, + title={BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation}, + author={Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu}, + year={2024}, + eprint={2402.03216}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +""", + ) + @property - def description(self): - return { - 'name': 'MultiLongDocRetrieval', - 'hf_hub_name': 'Shitao/MLDR', - 'reference': 'https://arxiv.org/abs/2402.03216', - 'description': 'MultiLongDocRetrieval: A Multilingual Long-Document Retrieval Dataset', - 'type': 'Retrieval', - 'category': 's2p', - 'eval_splits': ['dev', 'test'], - 'eval_langs': _LANGUAGES, - 'main_score': 'ndcg_at_10', - } - + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) + def load_data(self, **kwargs): if self.data_loaded: return - + self.corpus, self.queries, self.relevant_docs = load_mldr_data( - path=self.description['hf_hub_name'], - langs=self.langs, - eval_splits=self.description['eval_splits'], - cache_dir=kwargs.get('cache_dir', None) + path=self.metadata_dict["hf_hub_name"], + langs=self.metadata.eval_langs, + eval_splits=self.metadata_dict["eval_splits"], + cache_dir=kwargs.get("cache_dir", None), ) self.data_loaded = True diff --git a/mteb/tasks/Retrieval/multilingual/XMarketRetrieval.py b/mteb/tasks/Retrieval/multilingual/XMarketRetrieval.py index c2366ce66d..b0af6dd89e 100644 --- a/mteb/tasks/Retrieval/multilingual/XMarketRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/XMarketRetrieval.py @@ -1,13 +1,19 @@ -from ....abstasks import MultilingualTask -from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from __future__ import annotations import datasets +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks import MultilingualTask +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + _EVAL_SPLIT = "test" _EVAL_LANGS = ["es", "de", "en"] -def _load_xmarket_data(path: str, langs: list, split: str, cache_dir: str=None, revision: str=None): +def _load_xmarket_data( + path: str, langs: list, split: str, cache_dir: str = None, revision: str = None +): corpus = {lang: {split: None} for lang in langs} queries = {lang: {split: None} for lang in langs} relevant_docs = {lang: {split: None} for lang in langs} @@ -39,7 +45,9 @@ def _load_xmarket_data(path: str, langs: list, split: str, cache_dir: str=None, 
corpus[lang][split] = {row["_id"]: row for row in corpus_rows} queries[lang][split] = {row["_id"]: row["text"] for row in query_rows} - relevant_docs[lang][split] = {row["_id"]: {v: 1 for v in row["text"].split(" ")} for row in qrels_rows} + relevant_docs[lang][split] = { + row["_id"]: {v: 1 for v in row["text"].split(" ")} for row in qrels_rows + } corpus = datasets.DatasetDict(corpus) queries = datasets.DatasetDict(queries) @@ -49,32 +57,43 @@ def _load_xmarket_data(path: str, langs: list, split: str, cache_dir: str=None, class XMarket(MultilingualTask, AbsTaskRetrieval): + metadata = TaskMetadata( + name="XMarket", + description="XMarket", + reference=None, + hf_hub_name="jinaai/xmarket_ml", + type="Retrieval", + category="s2p", + eval_splits=[_EVAL_SPLIT], + eval_langs=_EVAL_LANGS, + main_score="ndcg_at_10", + revision="dfe57acff5b62c23732a7b7d3e3fb84ff501708b", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) @property - def description(self): - return { - "name": "XMarket", - "hf_hub_name": "jinaai/xmarket_ml", - "description": "XMarket is an ecommerce category to product retrieval dataset in German.", - "reference": "https://xmrec.github.io/", - "type": "Retrieval", - "category": "s2p", - "eval_splits": [_EVAL_SPLIT], - "eval_langs": _EVAL_LANGS, - "main_score": "ndcg_at_10", - "revision": "dfe57acff5b62c23732a7b7d3e3fb84ff501708b", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return self.corpus, self.queries, self.relevant_docs = _load_xmarket_data( - path=self.description['hf_hub_name'], - langs=self.langs, - split=self.description['eval_splits'][0], - cache_dir=kwargs.get('cache_dir', None), - revision=self.description['revision'], + path=self.metadata_dict["hf_hub_name"], + langs=self.metadata.eval_langs, + split=self.metadata_dict["eval_splits"][0], + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata_dict["revision"], ) self.data_loaded = True diff --git a/mteb/tasks/Retrieval/multilingual/XPQARetrieval.py b/mteb/tasks/Retrieval/multilingual/XPQARetrieval.py index c05f43ee1c..eea0b179fd 100644 --- a/mteb/tasks/Retrieval/multilingual/XPQARetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/XPQARetrieval.py @@ -1,5 +1,9 @@ +from __future__ import annotations + import datasets +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import MultilingualTask from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval @@ -7,7 +11,9 @@ _LANGS = ["ar", "de", "es", "fr", "hi", "it", "ja", "ko", "pl", "pt", "ta", "zh"] -def _load_xpqa_data(path: str, langs: list, split: str, cache_dir: str = None, revision: str = None): +def _load_xpqa_data( + path: str, langs: list, split: str, cache_dir: str = None, revision: str = None +): queries = {lang: {split: {}} for lang in langs} corpus = {lang: {split: {}} for lang in langs} relevant_docs = {lang: {split: {}} for lang in langs} @@ -20,7 +26,9 @@ def _load_xpqa_data(path: str, langs: list, split: str, cache_dir: str = None, r cache_dir=cache_dir, revision=revision, ) - question_ids = {question: _id for _id, question in enumerate(set(data["question"]))} + question_ids = { + question: _id for _id, question in enumerate(set(data["question"])) + } answer_ids = {answer: _id for _id, answer in enumerate(set(data["answer"]))} for row in data: @@ -42,31 +50,43 @@ def 
_load_xpqa_data(path: str, langs: list, split: str, cache_dir: str = None, r class XPQARetrieval(MultilingualTask, AbsTaskRetrieval): + metadata = TaskMetadata( + name="XPQARetrieval", + description="XPQARetrieval", + reference="https://arxiv.org/abs/2305.09249", + hf_hub_name="jinaai/xpqa", + type="Retrieval", + category="s2p", + eval_splits=[_EVAL_SPLIT], + eval_langs=_LANGS, + main_score="ndcg_at_10", + revision="c99d599f0a6ab9b85b065da6f9d94f9cf731679f", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "XPQARetrieval", - "hf_hub_name": "jinaai/xpqa", - "reference": "https://arxiv.org/abs/2305.09249", - "description": "xPQA is a large-scale annotated cross-lingual Product QA dataset.", - "type": "Retrieval", - "category": "s2s", - "eval_splits": [_EVAL_SPLIT], - "eval_langs": _LANGS, - "main_score": "ndcg_at_10", - "revision": "c99d599f0a6ab9b85b065da6f9d94f9cf731679f", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return self.corpus, self.queries, self.relevant_docs = _load_xpqa_data( - path=self.description["hf_hub_name"], + path=self.metadata_dict["hf_hub_name"], langs=self.langs, - split=self.description["eval_splits"][0], + split=self.metadata_dict["eval_splits"][0], cache_dir=kwargs.get("cache_dir", None), - revision=self.description["revision"], + revision=self.metadata_dict["revision"], ) self.data_loaded = True diff --git a/mteb/tasks/Retrieval/pl/ArguAnaPLRetrieval.py b/mteb/tasks/Retrieval/pl/ArguAnaPLRetrieval.py index f1f003c2a5..1d45db9d88 100644 --- a/mteb/tasks/Retrieval/pl/ArguAnaPLRetrieval.py +++ b/mteb/tasks/Retrieval/pl/ArguAnaPLRetrieval.py @@ -1,19 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class ArguAnaPL(AbsTaskRetrieval): + metadata = TaskMetadata( + name="ArguAna-PL", + description="ArguAna-PL", + reference="https://huggingface.co/datasets/clarin-knext/arguana-pl", + hf_hub_name="clarin-knext/arguana-pl", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["pl"], + main_score="ndcg_at_10", + revision="63fc86750af76253e8c760fc9e534bbf24d260a2", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "ArguAna-PL", - "hf_hub_name": "clarin-knext/arguana-pl", - "description": "NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval", - "reference": "http://argumentation.bplaced.net/arguana/data", - "benchmark": "BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language", - "type": "Retrieval", - "category": "p2p", - "eval_splits": ["test"], - "eval_langs": ["pl"], - "main_score": "ndcg_at_10", - "revision": "63fc86750af76253e8c760fc9e534bbf24d260a2", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/pl/DBPediaPLRetrieval.py b/mteb/tasks/Retrieval/pl/DBPediaPLRetrieval.py index 124676db52..cc2af21a1d 100644 --- a/mteb/tasks/Retrieval/pl/DBPediaPLRetrieval.py +++ b/mteb/tasks/Retrieval/pl/DBPediaPLRetrieval.py @@ -1,21 +1,34 @@ 
+from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class DBPediaPL(AbsTaskRetrieval): + metadata = TaskMetadata( + name="DBPedia-PL", + description="DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base", + reference="https://github.com/iai-group/DBpedia-Entity/", + hf_hub_name="clarin-knext/dbpedia-pl", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["pl"], + main_score="ndcg_at_10", + revision="76afe41d9af165cc40999fcaa92312b8b012064a", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "DBPedia-PL", - "hf_hub_name": "clarin-knext/dbpedia-pl", - "description": ( - "DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base" - ), - "reference": "https://github.com/iai-group/DBpedia-Entity/", - "benchmark": "BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["pl"], - "main_score": "ndcg_at_10", - "revision": "76afe41d9af165cc40999fcaa92312b8b012064a", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/pl/FiQAPLRetrieval.py b/mteb/tasks/Retrieval/pl/FiQAPLRetrieval.py index b1870e6ec3..a3c44e2433 100644 --- a/mteb/tasks/Retrieval/pl/FiQAPLRetrieval.py +++ b/mteb/tasks/Retrieval/pl/FiQAPLRetrieval.py @@ -1,19 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class FiQAPLRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="FiQA-PL", + description="Financial Opinion Mining and Question Answering", + reference="https://sites.google.com/view/fiqa/", + hf_hub_name="clarin-knext/fiqa-pl", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["pl"], + main_score="ndcg_at_10", + revision="2e535829717f8bf9dc829b7f911cc5bbd4e6608e", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "FiQA-PL", - "hf_hub_name": "clarin-knext/fiqa-pl", - "description": "Financial Opinion Mining and Question Answering", - "reference": "https://sites.google.com/view/fiqa/", - "benchmark": "BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["pl"], - "main_score": "ndcg_at_10", - "revision": "2e535829717f8bf9dc829b7f911cc5bbd4e6608e", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/pl/HotpotQAPLRetrieval.py b/mteb/tasks/Retrieval/pl/HotpotQAPLRetrieval.py index 8bdf729bc8..33e7e82b98 100644 --- a/mteb/tasks/Retrieval/pl/HotpotQAPLRetrieval.py +++ b/mteb/tasks/Retrieval/pl/HotpotQAPLRetrieval.py @@ -1,22 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class HotpotQAPL(AbsTaskRetrieval): + metadata = 
TaskMetadata( + name="HotpotQA-PL", + description="HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong supervision for supporting facts to enable more explainable question answering systems.", + reference="https://hotpotqa.github.io/", + hf_hub_name="clarin-knext/hotpotqa-pl", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["pl"], + main_score="ndcg_at_10", + revision="a0bd479ac97b4ccb5bd6ce320c415d0bb4beb907", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "HotpotQA-PL", - "hf_hub_name": "clarin-knext/hotpotqa-pl", - "description": ( - "HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong" - " supervision for supporting facts to enable more explainable question answering systems." - ), - "reference": "https://hotpotqa.github.io/", - "benchmark": "BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["pl"], - "main_score": "ndcg_at_10", - "revision": "a0bd479ac97b4ccb5bd6ce320c415d0bb4beb907", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/pl/MSMARCOPLRetrieval.py b/mteb/tasks/Retrieval/pl/MSMARCOPLRetrieval.py index dd478bea4e..d2f738dff4 100644 --- a/mteb/tasks/Retrieval/pl/MSMARCOPLRetrieval.py +++ b/mteb/tasks/Retrieval/pl/MSMARCOPLRetrieval.py @@ -1,19 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class MSMARCOPL(AbsTaskRetrieval): + metadata = TaskMetadata( + name="MSMARCO-PL", + description="MS MARCO is a collection of datasets focused on deep learning in search", + reference="https://microsoft.github.io/msmarco/", + hf_hub_name="clarin-knext/msmarco-pl", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["pl"], + main_score="ndcg_at_10", + revision="8634c07806d5cce3a6138e260e59b81760a0a640", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "MSMARCO-PL", - "hf_hub_name": "clarin-knext/msmarco-pl", - "description": "MS MARCO is a collection of datasets focused on deep learning in search", - "reference": "https://microsoft.github.io/msmarco/", - "benchmark": "BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["validation", "test"], # "validation" if using latest BEIR i.e. 
HFDataLoader - "eval_langs": ["pl"], - "main_score": "ndcg_at_10", - "revision": "8634c07806d5cce3a6138e260e59b81760a0a640", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/pl/NFCorpusPLRetrieval.py b/mteb/tasks/Retrieval/pl/NFCorpusPLRetrieval.py index 58bda5a78a..7437f80a85 100644 --- a/mteb/tasks/Retrieval/pl/NFCorpusPLRetrieval.py +++ b/mteb/tasks/Retrieval/pl/NFCorpusPLRetrieval.py @@ -1,19 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class NFCorpusPL(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NFCorpus-PL", + description="NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval", + reference="https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/", + hf_hub_name="clarin-knext/nfcorpus-pl", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["pl"], + main_score="ndcg_at_10", + revision="9a6f9567fda928260afed2de480d79c98bf0bec0", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "NFCorpus-PL", - "hf_hub_name": "clarin-knext/nfcorpus-pl", - "description": "NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval", - "reference": "https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/", - "benchmark": "BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["pl"], - "main_score": "ndcg_at_10", - "revision": "9a6f9567fda928260afed2de480d79c98bf0bec0", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/pl/NQPLRetrieval.py b/mteb/tasks/Retrieval/pl/NQPLRetrieval.py index c18d05faf3..12d34e7297 100644 --- a/mteb/tasks/Retrieval/pl/NQPLRetrieval.py +++ b/mteb/tasks/Retrieval/pl/NQPLRetrieval.py @@ -1,19 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class NQPL(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NQ-PL", + description="Natural Questions: A Benchmark for Question Answering Research", + reference="https://ai.google.com/research/NaturalQuestions/", + hf_hub_name="clarin-knext/nq-pl", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["pl"], + main_score="ndcg_at_10", + revision="f171245712cf85dd4700b06bef18001578d0ca8d", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "NQ-PL", - "hf_hub_name": "clarin-knext/nq-pl", - "description": "Natural Questions: A Benchmark for Question Answering Research", - "reference": "https://ai.google.com/research/NaturalQuestions/", - "benchmark": "BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["pl"], - "main_score": "ndcg_at_10", - "revision": "f171245712cf85dd4700b06bef18001578d0ca8d", - } + def metadata_dict(self) -> dict[str, str]: + return 
dict(self.metadata) diff --git a/mteb/tasks/Retrieval/pl/QuoraPLRetrieval.py b/mteb/tasks/Retrieval/pl/QuoraPLRetrieval.py index 49f38883ac..b1ef313287 100644 --- a/mteb/tasks/Retrieval/pl/QuoraPLRetrieval.py +++ b/mteb/tasks/Retrieval/pl/QuoraPLRetrieval.py @@ -1,22 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class QuoraPLRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="Quora-PL", + description="QuoraRetrieval is based on questions that are marked as duplicates on the Quora platform. Given a question, find other (duplicate) questions.", + reference="https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs", + hf_hub_name="clarin-knext/quora-pl", + type="Retrieval", + category="s2s", + eval_splits=["validation", "test"], # validation for new DataLoader + eval_langs=["pl"], + main_score="ndcg_at_10", + revision="0be27e93455051e531182b85e85e425aba12e9d4", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "Quora-PL", - "hf_hub_name": "clarin-knext/quora-pl", - "description": ( - "QuoraRetrieval is based on questions that are marked as duplicates on the Quora platform. Given a" - " question, find other (duplicate) questions." - ), - "reference": "https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs", - "type": "Retrieval", - "benchmark": "BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language", - "category": "s2s", - "eval_splits": ["validation", "test"], # validation for new DataLoader - "eval_langs": ["pl"], - "main_score": "ndcg_at_10", - "revision": "0be27e93455051e531182b85e85e425aba12e9d4", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/pl/SCIDOCSPLRetrieval.py b/mteb/tasks/Retrieval/pl/SCIDOCSPLRetrieval.py index f89d306e1e..6e91fcf2c6 100644 --- a/mteb/tasks/Retrieval/pl/SCIDOCSPLRetrieval.py +++ b/mteb/tasks/Retrieval/pl/SCIDOCSPLRetrieval.py @@ -1,22 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class SCIDOCSPL(AbsTaskRetrieval): + metadata = TaskMetadata( + name="SCIDOCS-PL", + description="SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation prediction, to document classification and recommendation.", + reference="https://allenai.org/data/scidocs", + hf_hub_name="clarin-knext/scidocs-pl", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["pl"], + main_score="ndcg_at_10", + revision="45452b03f05560207ef19149545f168e596c9337", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "SCIDOCS-PL", - "hf_hub_name": "clarin-knext/scidocs-pl", - "description": ( - "SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation" - " prediction, to document classification and recommendation." 
- ), - "reference": "https://allenai.org/data/scidocs", - "benchmark": "BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["pl"], - "main_score": "ndcg_at_10", - "revision": "45452b03f05560207ef19149545f168e596c9337", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/pl/SciFactPLRetrieval.py b/mteb/tasks/Retrieval/pl/SciFactPLRetrieval.py index f22b57800b..cc9dd08efe 100644 --- a/mteb/tasks/Retrieval/pl/SciFactPLRetrieval.py +++ b/mteb/tasks/Retrieval/pl/SciFactPLRetrieval.py @@ -1,19 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class SciFactPL(AbsTaskRetrieval): + metadata = TaskMetadata( + name="SciFact-PL", + description="SciFact verifies scientific claims using evidence from the research literature containing scientific paper abstracts.", + reference="https://github.com/allenai/scifact", + hf_hub_name="clarin-knext/scifact-pl", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["pl"], + main_score="ndcg_at_10", + revision="47932a35f045ef8ed01ba82bf9ff67f6e109207e", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "SciFact-PL", - "hf_hub_name": "clarin-knext/scifact-pl", - "reference": "https://github.com/allenai/scifact", - "benchmark": "BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language", - "description": "SciFact verifies scientific claims using evidence from the research literature containing scientific paper abstracts.", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["pl"], - "main_score": "ndcg_at_10", - "revision": "47932a35f045ef8ed01ba82bf9ff67f6e109207e", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/pl/TRECCOVIDPLRetrieval.py b/mteb/tasks/Retrieval/pl/TRECCOVIDPLRetrieval.py index ab14e63760..9cf1a0062f 100644 --- a/mteb/tasks/Retrieval/pl/TRECCOVIDPLRetrieval.py +++ b/mteb/tasks/Retrieval/pl/TRECCOVIDPLRetrieval.py @@ -1,19 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval class TRECCOVIDPL(AbsTaskRetrieval): + metadata = TaskMetadata( + name="TRECCOVID-PL", + description="TRECCOVID is an ad-hoc search challenge based on the CORD-19 dataset containing scientific articles related to the COVID-19 pandemic.", + reference="https://ir.nist.gov/covidSubmit/index.html", + hf_hub_name="clarin-knext/trec-covid-pl", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["pl"], + main_score="ndcg_at_10", + revision="81bcb408f33366c2a20ac54adafad1ae7e877fdd", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "TRECCOVID-PL", - "hf_hub_name": "clarin-knext/trec-covid-pl", - "reference": "https://ir.nist.gov/covidSubmit/index.html", - "benchmark": "BEIR-PL: Zero Shot Information Retrieval Benchmark for 
the Polish Language", - "description": "TRECCOVID is an ad-hoc search challenge based on the CORD-19 dataset containing scientific articles related to the COVID-19 pandemic.", - "type": "Retrieval", - "category": "s2p", - "eval_splits": ["test"], - "eval_langs": ["pl"], - "main_score": "ndcg_at_10", - "revision": "81bcb408f33366c2a20ac54adafad1ae7e877fdd", - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Retrieval/zh/CMTEBRetrieval.py b/mteb/tasks/Retrieval/zh/CMTEBRetrieval.py index 94ce249554..312aa61089 100644 --- a/mteb/tasks/Retrieval/zh/CMTEBRetrieval.py +++ b/mteb/tasks/Retrieval/zh/CMTEBRetrieval.py @@ -1,5 +1,10 @@ +from __future__ import annotations + from collections import defaultdict -from datasets import load_dataset, DatasetDict + +from datasets import DatasetDict, load_dataset + +from mteb.abstasks.TaskMetadata import TaskMetadata from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval @@ -7,217 +12,319 @@ def load_retrieval_data(hf_hub_name, eval_splits): eval_split = eval_splits[0] dataset = load_dataset(hf_hub_name) - qrels = load_dataset(hf_hub_name + '-qrels')[eval_split] + qrels = load_dataset(hf_hub_name + "-qrels")[eval_split] - corpus = {e['id']: {'text': e['text']} for e in dataset['corpus']} - queries = {e['id']: e['text'] for e in dataset['queries']} + corpus = {e["id"]: {"text": e["text"]} for e in dataset["corpus"]} + queries = {e["id"]: e["text"] for e in dataset["queries"]} relevant_docs = defaultdict(dict) for e in qrels: - relevant_docs[e['qid']][e['pid']] = e['score'] + relevant_docs[e["qid"]][e["pid"]] = e["score"] - corpus = DatasetDict({eval_split:corpus}) - queries = DatasetDict({eval_split:queries}) - relevant_docs = DatasetDict({eval_split:relevant_docs}) + corpus = DatasetDict({eval_split: corpus}) + queries = DatasetDict({eval_split: queries}) + relevant_docs = DatasetDict({eval_split: relevant_docs}) return corpus, queries, relevant_docs class T2Retrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="T2Retrieval", + description="T2Ranking: A large-scale Chinese Benchmark for Passage Ranking", + reference="https://arxiv.org/abs/2304.03679", + hf_hub_name="C-MTEB/T2Retrieval", + type="Retrieval", + category="s2p", + eval_splits=["dev"], + eval_langs=["zh"], + main_score="ndcg_at_10", + revision="8731a845f1bf500a4f111cf1070785c793d10e64", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - 'name': 'T2Retrieval', - 'hf_hub_name': 'C-MTEB/T2Retrieval', - 'reference': 'https://arxiv.org/abs/2304.03679', - 'description': 'T2Ranking: A large-scale Chinese Benchmark for Passage Ranking', - 'type': 'Retrieval', - 'category': 's2p', - 'eval_splits': ['dev'], - 'eval_langs': ['zh'], - 'main_score': 'ndcg_at_10', - 'revision': '8731a845f1bf500a4f111cf1070785c793d10e64', - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return - self.corpus, self.queries, self.relevant_docs = load_retrieval_data(self.description['hf_hub_name'], - self.description['eval_splits']) + self.corpus, self.queries, self.relevant_docs = load_retrieval_data( + self.metadata_dict["hf_hub_name"], self.metadata_dict["eval_splits"] + ) self.data_loaded = True class MMarcoRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="MMarcoRetrieval", 
+ description="MMarcoRetrieval", + reference="https://arxiv.org/abs/2309.07597", + hf_hub_name="C-MTEB/MMarcoRetrieval", + type="Retrieval", + category="s2p", + eval_splits=["dev"], + eval_langs=["zh"], + main_score="ndcg_at_10", + revision="539bbde593d947e2a124ba72651aafc09eb33fc2", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - 'name': 'MMarcoRetrieval', - 'hf_hub_name': 'C-MTEB/MMarcoRetrieval', - 'reference': 'https://github.com/unicamp-dl/mMARCO', - 'description': 'mMARCO is a multilingual version of the MS MARCO passage ranking dataset', - 'type': 'Retrieval', - 'category': 's2p', - 'eval_splits': ['dev'], - 'eval_langs': ['zh'], - 'main_score': 'ndcg_at_10', - 'revision': '539bbde593d947e2a124ba72651aafc09eb33fc2', - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return - self.corpus, self.queries, self.relevant_docs = load_retrieval_data(self.description['hf_hub_name'], - self.description['eval_splits']) + self.corpus, self.queries, self.relevant_docs = load_retrieval_data( + self.metadata_dict["hf_hub_name"], self.metadata_dict["eval_splits"] + ) self.data_loaded = True class DuRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="DuRetrieval", + description="A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine", + reference="https://aclanthology.org/2022.emnlp-main.357.pdf", + hf_hub_name="C-MTEB/DuRetrieval", + type="Retrieval", + category="s2p", + eval_splits=["dev"], + eval_langs=["zh"], + main_score="ndcg_at_10", + revision="a1a333e290fe30b10f3f56498e3a0d911a693ced", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - 'name': 'DuRetrieval', - 'hf_hub_name': 'C-MTEB/DuRetrieval', - 'reference': 'https://aclanthology.org/2022.emnlp-main.357.pdf', - 'description': 'A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine', - 'type': 'Retrieval', - 'category': 's2p', - 'eval_splits': ['dev'], - 'eval_langs': ['zh'], - 'main_score': 'ndcg_at_10', - 'revision': 'a1a333e290fe30b10f3f56498e3a0d911a693ced', - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return - self.corpus, self.queries, self.relevant_docs = load_retrieval_data(self.description['hf_hub_name'], - self.description['eval_splits']) + self.corpus, self.queries, self.relevant_docs = load_retrieval_data( + self.metadata_dict["hf_hub_name"], self.metadata_dict["eval_splits"] + ) self.data_loaded = True class CovidRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CovidRetrieval", + description="COVID-19 news articles", + reference="https://aclanthology.org/2022.emnlp-main.357.pdf", + hf_hub_name="C-MTEB/CovidRetrieval", + type="Retrieval", + category="s2p", + eval_splits=["dev"], + eval_langs=["zh"], + main_score="ndcg_at_10", + revision="1271c7809071a13532e05f25fb53511ffce77117", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def 
description(self): - return { - 'name': 'CovidRetrieval', - 'hf_hub_name': 'C-MTEB/CovidRetrieval', - 'reference': 'https://aclanthology.org/2022.emnlp-main.357.pdf', - 'description': 'COVID-19 news articles', - 'type': 'Retrieval', - 'category': 's2p', - 'eval_splits': ['dev'], - 'eval_langs': ['zh'], - 'main_score': 'ndcg_at_10', - 'revision': '1271c7809071a13532e05f25fb53511ffce77117', - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return - self.corpus, self.queries, self.relevant_docs = load_retrieval_data(self.description['hf_hub_name'], - self.description['eval_splits']) + self.corpus, self.queries, self.relevant_docs = load_retrieval_data( + self.metadata_dict["hf_hub_name"], self.metadata_dict["eval_splits"] + ) self.data_loaded = True - class CmedqaRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CmedqaRetrieval", + description="Online medical consultation text", + reference="https://aclanthology.org/2022.emnlp-main.357.pdf", + hf_hub_name="C-MTEB/CmedqaRetrieval", + type="Retrieval", + category="s2p", + eval_splits=["dev"], + eval_langs=["zh"], + main_score="ndcg_at_10", + revision="cd540c506dae1cf9e9a59c3e06f42030d54e7301", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - 'name': 'CmedqaRetrieval', - 'hf_hub_name': 'C-MTEB/CmedqaRetrieval', - 'reference': 'https://aclanthology.org/2022.emnlp-main.357.pdf', - 'description': 'Online medical consultation text', - 'type': 'Retrieval', - 'category': 's2p', - 'eval_splits': ['dev'], - 'eval_langs': ['zh'], - 'main_score': 'ndcg_at_10', - 'revision': 'cd540c506dae1cf9e9a59c3e06f42030d54e7301', - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return - self.corpus, self.queries, self.relevant_docs = load_retrieval_data(self.description['hf_hub_name'], - self.description['eval_splits']) + self.corpus, self.queries, self.relevant_docs = load_retrieval_data( + self.metadata_dict["hf_hub_name"], self.metadata_dict["eval_splits"] + ) self.data_loaded = True class EcomRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="EcomRetrieval", + description="EcomRetrieval", + reference="https://arxiv.org/abs/2203.03367", + hf_hub_name="C-MTEB/EcomRetrieval", + type="Retrieval", + category="s2p", + eval_splits=["dev"], + eval_langs=["zh"], + main_score="ndcg_at_10", + revision="687de13dc7294d6fd9be10c6945f9e8fec8166b9", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - 'name': 'EcomRetrieval', - 'hf_hub_name': 'C-MTEB/EcomRetrieval', - 'reference': 'https://arxiv.org/abs/2203.03367', - 'description': 'Passage retrieval dataset collected from Alibaba search engine systems in ecom domain', - 'type': 'Retrieval', - 'category': 's2p', - 'eval_splits': ['dev'], - 'eval_langs': ['zh'], - 'main_score': 'ndcg_at_10', - 'revision': '687de13dc7294d6fd9be10c6945f9e8fec8166b9', - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return - self.corpus, self.queries, self.relevant_docs = 
load_retrieval_data(self.description['hf_hub_name'], - self.description['eval_splits']) + self.corpus, self.queries, self.relevant_docs = load_retrieval_data( + self.metadata_dict["hf_hub_name"], self.metadata_dict["eval_splits"] + ) self.data_loaded = True class MedicalRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="MedicalRetrieval", + description="MedicalRetrieval", + reference="https://arxiv.org/abs/2203.03367", + hf_hub_name="C-MTEB/MedicalRetrieval", + type="Retrieval", + category="s2p", + eval_splits=["dev"], + eval_langs=["zh"], + main_score="ndcg_at_10", + revision="2039188fb5800a9803ba5048df7b76e6fb151fc6", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - 'name': 'MedicalRetrieval', - 'hf_hub_name': 'C-MTEB/MedicalRetrieval', - 'reference': 'https://arxiv.org/abs/2203.03367', - 'description': 'Passage retrieval dataset collected from Alibaba search engine systems in medical domain', - 'type': 'Retrieval', - 'category': 's2p', - 'eval_splits': ['dev'], - 'eval_langs': ['zh'], - 'main_score': 'ndcg_at_10', - 'revision': '2039188fb5800a9803ba5048df7b76e6fb151fc6', - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return - self.corpus, self.queries, self.relevant_docs = load_retrieval_data(self.description['hf_hub_name'], - self.description['eval_splits']) + self.corpus, self.queries, self.relevant_docs = load_retrieval_data( + self.metadata_dict["hf_hub_name"], self.metadata_dict["eval_splits"] + ) self.data_loaded = True class VideoRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="VideoRetrieval", + description="VideoRetrieval", + reference="https://arxiv.org/abs/2203.03367", + hf_hub_name="C-MTEB/VideoRetrieval", + type="Retrieval", + category="s2p", + eval_splits=["dev"], + eval_langs=["zh"], + main_score="ndcg_at_10", + revision="58c2597a5943a2ba48f4668c3b90d796283c5639", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - 'name': 'VideoRetrieval', - 'hf_hub_name': 'C-MTEB/VideoRetrieval', - 'reference': 'https://arxiv.org/abs/2203.03367', - 'description': 'Passage retrieval dataset collected from Alibaba search engine systems in video domain', - 'type': 'Retrieval', - 'category': 's2p', - 'eval_splits': ['dev'], - 'eval_langs': ['zh'], - 'main_score': 'ndcg_at_10', - 'revision': '58c2597a5943a2ba48f4668c3b90d796283c5639', - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) def load_data(self, **kwargs): if self.data_loaded: return - self.corpus, self.queries, self.relevant_docs = load_retrieval_data(self.description['hf_hub_name'], - self.description['eval_splits']) + self.corpus, self.queries, self.relevant_docs = load_retrieval_data( + self.metadata_dict["hf_hub_name"], self.metadata_dict["eval_splits"] + ) self.data_loaded = True - diff --git a/mteb/tasks/STS/__init__.py b/mteb/tasks/STS/__init__.py index e3ed648aec..ef5f90612b 100644 --- a/mteb/tasks/STS/__init__.py +++ b/mteb/tasks/STS/__init__.py @@ -1,16 +1,18 @@ -from .en.BiossesSTS import * -from .zh.CMTEBSTS import * +from __future__ import annotations + from .de.GermanSTSBenchmarkSTS 
import * -from .pl.PolishSTS import * -from .fr.SickFrSTS import * +from .en.BiossesSTS import * from .en.SickrSTS import * from .en.STS12STS import * from .en.STS13STS import * from .en.STS14STS import * from .en.STS15STS import * from .en.STS16STS import * +from .en.STSBenchmarkSTS import * +from .es.STSES import * +from .fr.SickFrSTS import * from .multilingual.STS17CrosslingualSTS import * from .multilingual.STS22CrosslingualSTS import * from .multilingual.STSBenchmarkMultilingualSTS import * -from .en.STSBenchmarkSTS import * -from .es.STSES import * +from .pl.PolishSTS import * +from .zh.CMTEBSTS import * diff --git a/mteb/tasks/STS/de/GermanSTSBenchmarkSTS.py b/mteb/tasks/STS/de/GermanSTSBenchmarkSTS.py index 6c1194e036..9f0ef64e08 100644 --- a/mteb/tasks/STS/de/GermanSTSBenchmarkSTS.py +++ b/mteb/tasks/STS/de/GermanSTSBenchmarkSTS.py @@ -1,21 +1,38 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskSTS import AbsTaskSTS class GermanSTSBenchmarkSTS(AbsTaskSTS): + metadata = TaskMetadata( + name="GermanSTSBenchmark", + hf_hub_name="jinaai/german-STSbenchmark", + description="Semantic Textual Similarity Benchmark (STSbenchmark) dataset translated into German. " + "Translations were originally done by T-Systems on site services GmbH.", + reference="https://github.com/t-systems-on-site-services-gmbh/german-STSbenchmark", + type="STS", + category="s2s", + eval_splits=["validation", "test"], + eval_langs=["de"], + main_score="cosine_spearman", + revision="e36907544d44c3a247898ed81540310442329e20", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "GermanSTSBenchmark", - "hf_hub_name": "jinaai/german-STSbenchmark", - "description": "Semantic Textual Similarity Benchmark (STSbenchmark) dataset translated into German. 
" - "Translations were originally done by T-Systems on site services GmbH.", - "reference": "https://github.com/t-systems-on-site-services-gmbh/german-STSbenchmark", - "type": "STS", - "category": "s2s", - "eval_splits": ["validation", "test"], - "eval_langs": ["de"], - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 5, - "revision": "e36907544d44c3a247898ed81540310442329e20", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict diff --git a/mteb/tasks/STS/en/BiossesSTS.py b/mteb/tasks/STS/en/BiossesSTS.py index 582b971e4f..c1ec9ccf31 100644 --- a/mteb/tasks/STS/en/BiossesSTS.py +++ b/mteb/tasks/STS/en/BiossesSTS.py @@ -1,20 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskSTS import AbsTaskSTS class BiossesSTS(AbsTaskSTS): + metadata = TaskMetadata( + name="BIOSSES", + hf_hub_name="mteb/biosses-sts", + description="Biomedical Semantic Similarity Estimation.", + reference="https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html", + type="STS", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="cosine_spearman", + revision="d3fb88f8f02e40887cd149695127462bbcf29b4a", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "BIOSSES", - "hf_hub_name": "mteb/biosses-sts", - "description": "Biomedical Semantic Similarity Estimation.", - "reference": "https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html", - "type": "STS", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 4, - "revision": "d3fb88f8f02e40887cd149695127462bbcf29b4a", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict diff --git a/mteb/tasks/STS/en/STS12STS.py b/mteb/tasks/STS/en/STS12STS.py index aaca36ce88..0be28a21b6 100644 --- a/mteb/tasks/STS/en/STS12STS.py +++ b/mteb/tasks/STS/en/STS12STS.py @@ -1,20 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskSTS import AbsTaskSTS class STS12STS(AbsTaskSTS): + metadata = TaskMetadata( + name="STS12", + hf_hub_name="mteb/sts12-sts", + description="SemEval STS 2012 dataset.", + reference="https://www.aclweb.org/anthology/S12-1051.pdf", + type="STS", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="cosine_spearman", + revision="a0d554a64d88156834ff5ae9920b964011b16384", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "STS12", - "hf_hub_name": "mteb/sts12-sts", - "description": "SemEval STS 2012 dataset.", - "reference": "https://www.aclweb.org/anthology/S12-1051.pdf", - "type": "STS", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 5, - "revision": "a0d554a64d88156834ff5ae9920b964011b16384", - } + def metadata_dict(self) -> dict[str, 
str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict diff --git a/mteb/tasks/STS/en/STS13STS.py b/mteb/tasks/STS/en/STS13STS.py index 9dd95795de..09b79eedc5 100644 --- a/mteb/tasks/STS/en/STS13STS.py +++ b/mteb/tasks/STS/en/STS13STS.py @@ -1,20 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskSTS import AbsTaskSTS class STS13STS(AbsTaskSTS): + metadata = TaskMetadata( + name="STS13", + hf_hub_name="mteb/sts13-sts", + description="SemEval STS 2013 dataset.", + reference="https://www.aclweb.org/anthology/S13-1004/", + type="STS", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="cosine_spearman", + revision="7e90230a92c190f1bf69ae9002b8cea547a64cca", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "STS13", - "hf_hub_name": "mteb/sts13-sts", - "description": "SemEval STS 2013 dataset.", - "reference": "https://www.aclweb.org/anthology/S13-1004/", - "type": "STS", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 5, - "revision": "7e90230a92c190f1bf69ae9002b8cea547a64cca", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict diff --git a/mteb/tasks/STS/en/STS14STS.py b/mteb/tasks/STS/en/STS14STS.py index a53de1c120..b19f3c9ddc 100644 --- a/mteb/tasks/STS/en/STS14STS.py +++ b/mteb/tasks/STS/en/STS14STS.py @@ -1,20 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskSTS import AbsTaskSTS class STS14STS(AbsTaskSTS): + metadata = TaskMetadata( + name="STS14", + hf_hub_name="mteb/sts14-sts", + description="SemEval STS 2014 dataset. Currently only the English dataset", + reference="https://www.aclweb.org/anthology/S14-1002", + type="STS", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="cosine_spearman", + revision="6031580fec1f6af667f0bd2da0a551cf4f0b2375", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "STS14", - "hf_hub_name": "mteb/sts14-sts", - "description": "SemEval STS 2014 dataset. 
Currently only the English dataset", - "reference": "http://alt.qcri.org/semeval2014/task10/", - "type": "STS", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 5, - "revision": "6031580fec1f6af667f0bd2da0a551cf4f0b2375", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict diff --git a/mteb/tasks/STS/en/STS15STS.py b/mteb/tasks/STS/en/STS15STS.py index a989f042dd..1982da86ff 100644 --- a/mteb/tasks/STS/en/STS15STS.py +++ b/mteb/tasks/STS/en/STS15STS.py @@ -1,20 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskSTS import AbsTaskSTS class STS15STS(AbsTaskSTS): + metadata = TaskMetadata( + name="STS15", + hf_hub_name="mteb/sts15-sts", + description="SemEval STS 2015 dataset", + reference="https://www.aclweb.org/anthology/S15-2010", + type="STS", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="cosine_spearman", + revision="ae752c7c21bf194d8b67fd573edf7ae58183cbe3", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "STS15", - "hf_hub_name": "mteb/sts15-sts", - "description": "SemEval STS 2015 dataset", - "reference": "http://alt.qcri.org/semeval2015/task2/", - "type": "STS", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 5, - "revision": "ae752c7c21bf194d8b67fd573edf7ae58183cbe3", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict diff --git a/mteb/tasks/STS/en/STS16STS.py b/mteb/tasks/STS/en/STS16STS.py index 0c0e2bfae9..b9b0d3c9ef 100644 --- a/mteb/tasks/STS/en/STS16STS.py +++ b/mteb/tasks/STS/en/STS16STS.py @@ -1,20 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskSTS import AbsTaskSTS class STS16STS(AbsTaskSTS): + metadata = TaskMetadata( + name="STS16", + hf_hub_name="mteb/sts16-sts", + description="SemEval STS 2016 dataset", + reference="https://www.aclweb.org/anthology/S16-1001", + type="STS", + category="s2s", + eval_splits=["test"], + eval_langs=["en"], + main_score="cosine_spearman", + revision="4d8694f8f0e0100860b497b999b3dbed754a0513", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "STS16", - "hf_hub_name": "mteb/sts16-sts", - "description": "SemEval STS 2016 dataset", - "reference": "http://alt.qcri.org/semeval2016/task1/", - "type": "STS", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["en"], - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 5, - "revision": "4d8694f8f0e0100860b497b999b3dbed754a0513", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict diff --git 
a/mteb/tasks/STS/en/STSBenchmarkSTS.py b/mteb/tasks/STS/en/STSBenchmarkSTS.py
index 61580b230f..469ae8cdbf 100644
--- a/mteb/tasks/STS/en/STSBenchmarkSTS.py
+++ b/mteb/tasks/STS/en/STSBenchmarkSTS.py
@@ -1,20 +1,37 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskSTS import AbsTaskSTS


class STSBenchmarkSTS(AbsTaskSTS):
+ metadata = TaskMetadata(
+ name="STSBenchmark",
+ hf_hub_name="mteb/stsbenchmark-sts",
+ description="Semantic Textual Similarity Benchmark (STSbenchmark) dataset.",
+ reference="http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark",
+ type="STS",
+ category="s2s",
+ eval_splits=["test"],
+ eval_langs=["en"],
+ main_score="cosine_spearman",
+ revision="b0fddb56ed78048fa8b90373c8a3cfc37b684831",
+ date=None,
+ form=None,
+ domains=None,
+ task_subtypes=None,
+ license=None,
+ socioeconomic_status=None,
+ annotations_creators=None,
+ dialect=None,
+ text_creation=None,
+ bibtex_citation=None,
+ )
+
 @property
- def description(self):
- return {
- "name": "STSBenchmark",
- "hf_hub_name": "mteb/stsbenchmark-sts",
- "description": "Semantic Textual Similarity Benchmark (STSbenchmark) dataset.",
- "reference": "http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark",
- "type": "STS",
- "category": "s2s",
- "eval_splits": ["validation", "test"],
- "eval_langs": ["en"],
- "main_score": "cosine_spearman",
- "min_score": 0,
- "max_score": 5,
- "revision": "b0fddb56ed78048fa8b90373c8a3cfc37b684831",
- }
+ def metadata_dict(self) -> dict[str, str]:
+ metadata_dict = dict(self.metadata)
+ metadata_dict["min_score"] = 0
+ metadata_dict["max_score"] = 5
+ return metadata_dict
diff --git a/mteb/tasks/STS/en/SickrSTS.py b/mteb/tasks/STS/en/SickrSTS.py
index 089ff57831..501d70db39 100644
--- a/mteb/tasks/STS/en/SickrSTS.py
+++ b/mteb/tasks/STS/en/SickrSTS.py
@@ -1,20 +1,37 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskSTS import AbsTaskSTS


class SickrSTS(AbsTaskSTS):
+ metadata = TaskMetadata(
+ name="SICK-R",
+ hf_hub_name="mteb/sickr-sts",
+ description="Semantic Textual Similarity SICK-R dataset as described here:",
+ reference="https://www.aclweb.org/anthology/S14-2001.pdf",
+ type="STS",
+ category="s2s",
+ eval_splits=["test"],
+ eval_langs=["en"],
+ main_score="cosine_spearman",
+ revision="a6ea5a8cab320b040a23452cc28066d9beae2cee",
+ date=None,
+ form=None,
+ domains=None,
+ task_subtypes=None,
+ license=None,
+ socioeconomic_status=None,
+ annotations_creators=None,
+ dialect=None,
+ text_creation=None,
+ bibtex_citation=None,
+ )
+
 @property
- def description(self):
- return {
- "name": "SICK-R",
- "hf_hub_name": "mteb/sickr-sts",
- "description": "Semantic Textual Similarity SICK-R dataset as described here:",
- "reference": "https://www.aclweb.org/anthology/S14-2001.pdf",
- "type": "STS",
- "category": "s2s",
- "eval_splits": ["test"],
- "eval_langs": ["en"],
- "main_score": "cosine_spearman",
- "min_score": 1,
- "max_score": 5,
- "revision": "a6ea5a8cab320b040a23452cc28066d9beae2cee",
- }
+ def metadata_dict(self) -> dict[str, str]:
+ metadata_dict = dict(self.metadata)
+ metadata_dict["min_score"] = 1
+ metadata_dict["max_score"] = 5
+ return metadata_dict
diff --git a/mteb/tasks/STS/es/STSES.py b/mteb/tasks/STS/es/STSES.py
index aa18b873af..1fceff5044 100644
--- a/mteb/tasks/STS/es/STSES.py
+++ b/mteb/tasks/STS/es/STSES.py
@@ -1,38 +1,56 @@
+from __future__ import annotations
+
 from datasets import load_dataset

+from 
mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskSTS import AbsTaskSTS

-_EVAL_SPLIT = 'test'
+_EVAL_SPLIT = "test"


class STSES(AbsTaskSTS):
+ metadata = TaskMetadata(
+ name="STSES",
+ hf_hub_name="PlanTL-GOB-ES/sts-es",
+ description="Spanish test sets from SemEval-2014 (Agirre et al., 2014) and SemEval-2015 (Agirre et al., 2015)",
+ reference="https://huggingface.co/datasets/PlanTL-GOB-ES/sts-es",
+ type="STS",
+ category="s2s",
+ eval_splits=[_EVAL_SPLIT],
+ eval_langs=["es"],
+ main_score="cosine_spearman",
+ revision="0912bb6c9393c76d62a7c5ee81c4c817ff47c9f4",
+ date=None,
+ form=None,
+ domains=None,
+ task_subtypes=None,
+ license=None,
+ socioeconomic_status=None,
+ annotations_creators=None,
+ dialect=None,
+ text_creation=None,
+ bibtex_citation=None,
+ )
+
 @property
- def description(self):
- return {
- "name": "STSES",
- "hf_hub_name": "PlanTL-GOB-ES/sts-es",
- "description": "Spanish test sets from SemEval-2014 (Agirre et al., 2014) and SemEval-2015 (Agirre et al., 2015)",
- "reference": "https://huggingface.co/datasets/PlanTL-GOB-ES/sts-es",
- "type": "STS",
- "category": "s2s",
- "eval_splits": ["test"],
- "eval_langs": ["es"],
- "main_score": "cosine_spearman",
- "min_score": 0,
- "max_score": 5,
- "revision": "0912bb6c9393c76d62a7c5ee81c4c817ff47c9f4",
- }
+ def metadata_dict(self) -> dict[str, str]:
+ metadata_dict = dict(self.metadata)
+ metadata_dict["min_score"] = 0
+ metadata_dict["max_score"] = 5
+
+ return metadata_dict

 def load_data(self, **kwargs):
 if self.data_loaded:
 return
 data = load_dataset(
- self.description["hf_hub_name"],
+ self.metadata_dict["hf_hub_name"],
 trust_remote_code=True,
- revision=self.description.get("revision", None)
+ revision=self.metadata_dict.get("revision", None),
 )[_EVAL_SPLIT]
- data = data.add_column('score', [d['label'] for d in data])
+ data = data.add_column("score", [d["label"] for d in data])
 self.dataset = {_EVAL_SPLIT: data}
 self.data_loaded = True
diff --git a/mteb/tasks/STS/fr/SickFrSTS.py b/mteb/tasks/STS/fr/SickFrSTS.py
index b490b8da33..acd6e95b5a 100644
--- a/mteb/tasks/STS/fr/SickFrSTS.py
+++ b/mteb/tasks/STS/fr/SickFrSTS.py
@@ -1,25 +1,43 @@
-from ....abstasks.AbsTaskSTS import AbsTaskSTS
+from __future__ import annotations
+
 import datasets

+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+from ....abstasks.AbsTaskSTS import AbsTaskSTS
+

class SickFrSTS(AbsTaskSTS):
+ metadata = TaskMetadata(
+ name="SICKFr",
+ hf_hub_name="Lajavaness/SICK-fr",
+ description="SICK dataset french version",
+ reference="https://huggingface.co/datasets/Lajavaness/SICK-fr",
+ type="STS",
+ category="s2s",
+ eval_splits=["validation", "test"],
+ eval_langs=["fr"],
+ main_score="cosine_spearman",
+ revision="e077ab4cf4774a1e36d86d593b150422fafd8e8a",
+ date=None,
+ form=None,
+ domains=None,
+ task_subtypes=None,
+ license=None,
+ socioeconomic_status=None,
+ annotations_creators=None,
+ dialect=None,
+ text_creation=None,
+ bibtex_citation=None,
+ )
+
 @property
- def description(self):
- return {
- "name": "SICKFr",
- "hf_hub_name": "Lajavaness/SICK-fr",
- "description": "SICK dataset french version",
- "reference": "https://huggingface.co/datasets/Lajavaness/SICK-fr",
- "type": "STS",
- "category": "s2s",
- "eval_splits": ["validation", "test"],
- "eval_langs": ["fr"],
- "main_score": "cosine_spearman",
- "min_score": 1,
- "max_score": 5,
- "revision": "e077ab4cf4774a1e36d86d593b150422fafd8e8a",
- }
-
+ def metadata_dict(self) -> dict[str, str]:
+ metadata_dict = dict(self.metadata)
+ 
metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict + def load_data(self, **kwargs): """ Load dataset from HuggingFace hub and rename columns to the standard format. @@ -28,11 +46,16 @@ def load_data(self, **kwargs): return self.dataset = datasets.load_dataset( - self.description["hf_hub_name"], revision=self.description.get("revision", None) + self.metadata_dict["hf_hub_name"], + revision=self.metadata_dict.get("revision", None), ) - self.dataset = self.dataset.rename_columns({ - "sentence_A": "sentence1", "sentence_B": "sentence2", - "relatedness_score": "score", "Unnamed: 0": "id" - }) + self.dataset = self.dataset.rename_columns( + { + "sentence_A": "sentence1", + "sentence_B": "sentence2", + "relatedness_score": "score", + "Unnamed: 0": "id", + } + ) self.data_loaded = True diff --git a/mteb/tasks/STS/multilingual/STS17CrosslingualSTS.py b/mteb/tasks/STS/multilingual/STS17CrosslingualSTS.py index bd55b68a95..b03bfc293c 100644 --- a/mteb/tasks/STS/multilingual/STS17CrosslingualSTS.py +++ b/mteb/tasks/STS/multilingual/STS17CrosslingualSTS.py @@ -1,22 +1,51 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskSTS, CrosslingualTask -_LANGUAGES = ["ko-ko", "ar-ar", "en-ar", "en-de", "en-en", "en-tr", "es-en", "es-es", "fr-en", "it-en", "nl-en"] +_LANGUAGES = [ + "ko-ko", + "ar-ar", + "en-ar", + "en-de", + "en-en", + "en-tr", + "es-en", + "es-es", + "fr-en", + "it-en", + "nl-en", +] class STS17Crosslingual(AbsTaskSTS, CrosslingualTask): + metadata = TaskMetadata( + name="STS17", + hf_hub_name="mteb/sts17-crosslingual-sts", + description="STS 2017 dataset", + reference="http://alt.qcri.org/semeval2016/task1/", + type="STS", + category="s2s", + eval_splits=["test"], + eval_langs=_LANGUAGES, + main_score="cosine_spearman", + revision="af5e6fb845001ecf41f4c1e033ce921939a2a68d", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "STS17", - "hf_hub_name": "mteb/sts17-crosslingual-sts", - "description": "STS 2017 dataset", - "reference": "http://alt.qcri.org/semeval2016/task1/", - "type": "STS", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": _LANGUAGES, - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 5, - "revision": "af5e6fb845001ecf41f4c1e033ce921939a2a68d", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict diff --git a/mteb/tasks/STS/multilingual/STS22CrosslingualSTS.py b/mteb/tasks/STS/multilingual/STS22CrosslingualSTS.py index 070802fa2a..afe8080df5 100644 --- a/mteb/tasks/STS/multilingual/STS22CrosslingualSTS.py +++ b/mteb/tasks/STS/multilingual/STS22CrosslingualSTS.py @@ -1,41 +1,58 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskSTS, CrosslingualTask -_LANGUAGES = { - "en": "en", - "de": "de", - "es": "es", - "pl": "pl", - "tr": "tr", - "ar": "ar", - "ru": "ru", - "zh": "zh", - "fr": "fr", - "de-en": "de-en", - "es-en": "es-en", - "it": "it", - "pl-en": "pl-en", - "zh-en": "zh-en", - "es-it": "es-it", - "de-fr": "de-fr", - "de-pl": "de-pl", - "fr-pl": "fr-pl", -} +_LANGUAGES = [ + "en", + "de", + "es", + "pl", + "tr", + "ar", + "ru", + 
"zh", + "fr", + "de-en", + "es-en", + "it", + "pl-en", + "zh-en", + "es-it", + "de-fr", + "de-pl", + "fr-pl", +] class STS22CrosslingualSTS(AbsTaskSTS, CrosslingualTask): + metadata = TaskMetadata( + name="STS22", + hf_hub_name="mteb/sts22-crosslingual-sts", + description="SemEval 2022 Task 8: Multilingual News Article Similarity", + reference="https://competitions.codalab.org/competitions/33835", + type="STS", + category="p2p", + eval_splits=["test"], + eval_langs=_LANGUAGES, + main_score="cosine_spearman", + revision="eea2b4fe26a775864c896887d910b76a8098ad3f", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "STS22", - "hf_hub_name": "mteb/sts22-crosslingual-sts", - "description": "SemEval 2022 Task 8: Multilingual News Article Similarity", - "reference": "https://competitions.codalab.org/competitions/33835", - "type": "STS", - "category": "p2p", - "eval_splits": ["test"], - "eval_langs": _LANGUAGES, - "main_score": "cosine_spearman", - "min_score": 1, - "max_score": 4, - "revision": "eea2b4fe26a775864c896887d910b76a8098ad3f", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 1 + metadata_dict["max_score"] = 4 + return metadata_dict diff --git a/mteb/tasks/STS/multilingual/STSBenchmarkMultilingualSTS.py b/mteb/tasks/STS/multilingual/STSBenchmarkMultilingualSTS.py index 561a3f3524..1be043b9c9 100644 --- a/mteb/tasks/STS/multilingual/STSBenchmarkMultilingualSTS.py +++ b/mteb/tasks/STS/multilingual/STSBenchmarkMultilingualSTS.py @@ -1,35 +1,54 @@ +from __future__ import annotations + import datasets -from ....abstasks import AbsTaskSTS, MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks import AbsTaskSTS, MultilingualTask _LANGUAGES = ["en", "de", "es", "fr", "it", "nl", "pl", "pt", "ru", "zh"] _SPLITS = ["dev", "test"] + class STSBenchmarkMultilingualSTS(AbsTaskSTS, MultilingualTask): - @property - def description(self): - return { - "name": "STSBenchmarkMultilingualSTS", - "hf_hub_name": "stsb_multi_mt", - "description": ("Semantic Textual Similarity Benchmark (STSbenchmark) dataset," - "but translated using DeepL API."), - "reference": "https://github.com/PhilipMay/stsb-multi-mt/", - "type": "STS", - "category": "s2s", - "eval_splits": _SPLITS, - "eval_langs": _LANGUAGES, - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 5, - "revision": "93d57ef91790589e3ce9c365164337a8a78b7632", - } + metadata = TaskMetadata( + name="STSBenchmarkMultilingualSTS", + hf_hub_name="stsb_multi_mt", + description=( + "Semantic Textual Similarity Benchmark (STSbenchmark) dataset," + "but translated using DeepL API." 
+ ), + reference="https://github.com/PhilipMay/stsb-multi-mt/", + type="STS", + category="s2s", + eval_splits=_SPLITS, + eval_langs=_LANGUAGES, + main_score="cosine_spearman", + revision="93d57ef91790589e3ce9c365164337a8a78b7632", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict def load_data(self, **kwargs): if self.data_loaded: return - - def get_dataset_subset(lang:str): + + def get_dataset_subset(lang: str): """For a specified subset (=language) only get the splits listed in _SPLIT and rename column "score" @@ -40,19 +59,23 @@ def get_dataset_subset(lang:str): Returns: datasets.DatasetDict: the dataset of the specified language """ - subset = datasets.DatasetDict(**dict(zip( - _SPLITS, datasets.load_dataset( - self.description["hf_hub_name"], - lang, - split=_SPLITS, - revision=self.description.get("revision", None) - )))) + subset = datasets.DatasetDict( + **dict( + zip( + _SPLITS, + datasets.load_dataset( + self.metadata_dict["hf_hub_name"], + lang, + split=_SPLITS, + revision=self.metadata_dict.get("revision", None), + ), + ) + ) + ) return subset.rename_column("similarity_score", "score") - - self.dataset = datasets.DatasetDict(**dict( - zip(self.langs, [get_dataset_subset(lang) for lang in self.langs]) - )) - - self.data_loaded = True + self.dataset = datasets.DatasetDict( + **dict(zip(self.langs, [get_dataset_subset(lang) for lang in self.langs])) + ) + self.data_loaded = True diff --git a/mteb/tasks/STS/pl/PolishSTS.py b/mteb/tasks/STS/pl/PolishSTS.py index ef07d00e51..407c6ce1f4 100644 --- a/mteb/tasks/STS/pl/PolishSTS.py +++ b/mteb/tasks/STS/pl/PolishSTS.py @@ -1,37 +1,68 @@ +from __future__ import annotations + from mteb.abstasks.AbsTaskSTS import AbsTaskSTS +from mteb.abstasks.TaskMetadata import TaskMetadata class SickrPLSTS(AbsTaskSTS): + metadata = TaskMetadata( + name="SICK-R-PL", + hf_hub_name="PL-MTEB/sickr-pl-sts", + description="Polish version of SICK dataset for textual relatedness.", + reference="https://aclanthology.org/2020.lrec-1.207", + type="STS", + category="s2s", + eval_splits=["test"], + eval_langs=["pl"], + main_score="cosine_spearman", + revision="a6ea5a8cab320b040a23452cc28066d9beae2cee", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "SICK-R-PL", - "hf_hub_name": "PL-MTEB/sickr-pl-sts", - "description": "Polish version of SICK dataset for textual relatedness.", - "reference": "https://aclanthology.org/2020.lrec-1.207.pdf", - "type": "STS", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["pl"], - "main_score": "cosine_spearman", - "min_score": 1, - "max_score": 5, - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 1 + metadata_dict["max_score"] = 5 + return metadata_dict class CdscrSTS(AbsTaskSTS): + metadata = TaskMetadata( + name="CDSC-R", + hf_hub_name="PL-MTEB/cdscr-sts", + description="Compositional Distributional Semantics Corpus for textual relatedness.", + reference="https://aclanthology.org/P17-1073.pdf", + type="STS", 
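+ # TaskMetadata carries no score-bound fields; CDSC-R's 1-5 relatedness scale is attached via metadata_dict below.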
+ category="s2s", + eval_splits=["test"], + eval_langs=["pl"], + main_score="cosine_spearman", + revision="1de08520a7b361e92ffa2a2201ebd41942c54675", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "CDSC-R", - "hf_hub_name": "PL-MTEB/cdscr-sts", - "description": "Compositional Distributional Semantics Corpus for textual relatedness.", - "reference": "https://aclanthology.org/P17-1073.pdf", - "type": "STS", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["pl"], - "main_score": "cosine_spearman", - "min_score": 1, - "max_score": 5, - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 1 + metadata_dict["max_score"] = 5 + return metadata_dict diff --git a/mteb/tasks/STS/zh/CMTEBSTS.py b/mteb/tasks/STS/zh/CMTEBSTS.py index f7be6b1e3e..787628ca27 100644 --- a/mteb/tasks/STS/zh/CMTEBSTS.py +++ b/mteb/tasks/STS/zh/CMTEBSTS.py @@ -1,124 +1,226 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks.AbsTaskSTS import AbsTaskSTS class ATEC(AbsTaskSTS): - @property - def description(self): - return { - "name": "ATEC", - "hf_hub_name": "C-MTEB/ATEC", - "type": "STS", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["zh"], - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 1, - 'revision': '0f319b1142f28d00e055a6770f3f726ae9b7d865', - } + metadata = TaskMetadata( + name="ATEC", + hf_hub_name="C-MTEB/ATEC", + description="A Chinese dataset for textual relatedness", + reference="https://aclanthology.org/2021.emnlp-main.357", + type="STS", + category="s2s", + eval_splits=["validation", "test"], + eval_langs=["zh"], + main_score="cosine_spearman", + revision="0f319b1142f2ae3f7dc7be10c3c7f3598ec6c602", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 1 + return metadata_dict class BQ(AbsTaskSTS): + metadata = TaskMetadata( + name="BQ", + hf_hub_name="C-MTEB/BQ", + description="A Chinese dataset for textual relatedness", + reference="https://aclanthology.org/2021.emnlp-main.357", + type="STS", + category="s2s", + eval_splits=["validation", "test"], + eval_langs=["zh"], + main_score="cosine_spearman", + revision="e3dda5e115e487b39ec7e618c0c6a29137052a55", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "BQ", - "hf_hub_name": "C-MTEB/BQ", - "type": "STS", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["zh"], - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 1, - 'revision': 'e3dda5e115e487b39ec7e618c0c6a29137052a55', - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 1 + return metadata_dict class LCQMC(AbsTaskSTS): - @property - def description(self): - return { - "name": 
"LCQMC", - "hf_hub_name": "C-MTEB/LCQMC", - "type": "STS", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["zh"], - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 1, - 'revision': '17f9b096f80380fce5ed12a9be8be7784b337daf', - } + metadata = TaskMetadata( + name="LCQMC", + hf_hub_name="C-MTEB/LCQMC", + description="A Chinese dataset for textual relatedness", + reference="https://aclanthology.org/2021.emnlp-main.357", + type="STS", + category="s2s", + eval_splits=["test"], + eval_langs=["zh"], + main_score="cosine_spearman", + revision="17f9b096f80380fce5ed12a9be8be7784b337daf", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 1 + return metadata_dict class PAWSX(AbsTaskSTS): + metadata = TaskMetadata( + name="PAWSX", + hf_hub_name="C-MTEB/PAWSX", + description="A Chinese dataset for textual relatedness", + reference="https://aclanthology.org/2021.emnlp-main.357", + type="STS", + category="s2s", + eval_splits=["test"], + eval_langs=["zh"], + main_score="cosine_spearman", + revision="9c6a90e430ac22b5779fb019a23e820b11a8b5e1", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "PAWSX", - "hf_hub_name": "C-MTEB/PAWSX", - "type": "STS", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["zh"], - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 1, - 'revision': '9c6a90e430ac22b5779fb019a23e820b11a8b5e1', - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 1 + return metadata_dict class STSB(AbsTaskSTS): + metadata = TaskMetadata( + name="STSB", + hf_hub_name="C-MTEB/STSB", + description="A Chinese dataset for textual relatedness", + reference="https://aclanthology.org/2021.emnlp-main.357", + type="STS", + category="s2s", + eval_splits=["validation", "test"], + eval_langs=["zh"], + main_score="cosine_spearman", + revision="0cde68302b3541bb8b3c340dc0644b0b745b3dc0", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "STSB", - "hf_hub_name": "C-MTEB/STSB", - "type": "STS", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["zh"], - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 5, - 'revision': '0cde68302b3541bb8b3c340dc0644b0b745b3dc0', - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict class AFQMC(AbsTaskSTS): - @property - def description(self): - return { - "name": "AFQMC", - "hf_hub_name": "C-MTEB/AFQMC", - "type": "STS", - "category": "s2s", - "eval_splits": ["validation"], - "eval_langs": ["zh"], - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 1, - 'revision': 'b44c3b011063adb25877c13823db83bb193913c4', - } + metadata = TaskMetadata( 
+ name="AFQMC", + hf_hub_name="C-MTEB/AFQMC", + description="A Chinese dataset for textual relatedness", + reference="https://aclanthology.org/2021.emnlp-main.357", + type="STS", + category="s2s", + eval_splits=["validation", "test"], + eval_langs=["zh"], + main_score="cosine_spearman", + revision="b44c3b011063adb25877c13823db83bb193913c4", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 1 + return metadata_dict class QBQTC(AbsTaskSTS): + metadata = TaskMetadata( + name="QBQTC", + hf_hub_name="C-MTEB/QBQTC", + description="", + reference="https://github.com/CLUEbenchmark/QBQTC/tree/main/dataset", + type="STS", + category="s2s", + eval_splits=["test"], + eval_langs=["zh"], + main_score="cosine_spearman", + revision="790b0510dc52b1553e8c49f3d2afb48c0e5c48b7", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "QBQTC", - "hf_hub_name": "C-MTEB/QBQTC", - "reference": "https://github.com/CLUEbenchmark/QBQTC/tree/main/dataset", - "type": "STS", - "category": "s2s", - "eval_splits": ["test"], - "eval_langs": ["zh"], - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 1, - 'revision': '790b0510dc52b1553e8c49f3d2afb48c0e5c48b7', - } + def metadata_dict(self) -> dict[str, str]: + return dict(self.metadata) diff --git a/mteb/tasks/Summarization/__init__.py b/mteb/tasks/Summarization/__init__.py index a526a2e69e..970d2ef293 100644 --- a/mteb/tasks/Summarization/__init__.py +++ b/mteb/tasks/Summarization/__init__.py @@ -1,2 +1,4 @@ -from .fr.SummEvalFrSummarization import * +from __future__ import annotations + from .en.SummEvalSummarization import * +from .fr.SummEvalFrSummarization import * diff --git a/mteb/tasks/Summarization/en/SummEvalSummarization.py b/mteb/tasks/Summarization/en/SummEvalSummarization.py index c43a370fb4..bb36333352 100644 --- a/mteb/tasks/Summarization/en/SummEvalSummarization.py +++ b/mteb/tasks/Summarization/en/SummEvalSummarization.py @@ -1,20 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + from ....abstasks import AbsTaskSummarization class SummEvalSummarization(AbsTaskSummarization): + metadata = TaskMetadata( + name="SummEval", + description="News Article Summary Semantic Similarity Estimation.", + reference="https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html", + hf_hub_name="mteb/summeval", + type="Summarization", + category="p2p", + eval_splits=["test"], + eval_langs=["en"], + main_score="cosine_spearman", + revision="cda12ad7615edc362dbf25a00fdd61d3b1eaf93c", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "SummEval", - "hf_hub_name": "mteb/summeval", - "description": "News Article Summary Semantic Similarity Estimation.", - "reference": "https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html", - "type": "Summarization", - "category": "p2p", - "eval_splits": ["test"], - 
"eval_langs": ["en"], - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 5, - "revision": "cda12ad7615edc362dbf25a00fdd61d3b1eaf93c", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict diff --git a/mteb/tasks/Summarization/fr/SummEvalFrSummarization.py b/mteb/tasks/Summarization/fr/SummEvalFrSummarization.py index 52607bb147..e1f5e341ee 100644 --- a/mteb/tasks/Summarization/fr/SummEvalFrSummarization.py +++ b/mteb/tasks/Summarization/fr/SummEvalFrSummarization.py @@ -1,20 +1,37 @@ +from __future__ import annotations + from mteb.abstasks import AbsTaskSummarization +from mteb.abstasks.TaskMetadata import TaskMetadata class SummEvalFrSummarization(AbsTaskSummarization): + metadata = TaskMetadata( + name="SummEvalFr", + description="News Article Summary Semantic Similarity Estimation translated from english to french with DeepL.", + reference="https://github.com/Yale-LILY/SummEval", + hf_hub_name="lyon-nlp/summeval", + type="Summarization", + category="p2p", + eval_splits=["test"], + eval_langs=["fr"], + main_score="cosine_spearman", + revision="b385812de6a9577b6f4d0f88c6a6e35395a94054", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + ) + @property - def description(self): - return { - "name": "SummEvalFr", - "hf_hub_name": "lyon-nlp/summarization-summeval-fr-p2p", - "description": "News Article Summary Semantic Similarity Estimation translated from english to french with DeepL.", - "reference": "https://github.com/Yale-LILY/SummEval", - "type": "Summarization", - "category": "p2p", - "eval_splits": ["test"], - "eval_langs": ["fr"], - "main_score": "cosine_spearman", - "min_score": 0, - "max_score": 5, - "revision": "b385812de6a9577b6f4d0f88c6a6e35395a94054", - } + def metadata_dict(self) -> dict[str, str]: + metadata_dict = dict(self.metadata) + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + + return metadata_dict diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py index 650d5d65a7..6411aa3713 100644 --- a/mteb/tasks/__init__.py +++ b/mteb/tasks/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from .BitextMining import * from .Classification import * from .Clustering import * diff --git a/pyproject.toml b/pyproject.toml index d14c10a029..e7cef2da3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ readme = "README.md" authors = [ { name = "MTEB Contributors", email = "niklas@huggingface.co" }, { email = "nouamane@huggingface.co" }, - { email = "info@nils-reimers.de" } + { email = "info@nils-reimers.de" }, ] license = { file = "LICENSE" } keywords = ["deep learning", "text embeddings", "benchmark"] @@ -21,7 +21,7 @@ classifiers = [ "Intended Audience :: Information Technology", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", - "Programming Language :: Python" + "Programming Language :: Python", ] requires-python = ">=3.8" dependencies = [ @@ -35,7 +35,10 @@ dependencies = [ "torch", "tqdm", "rich", - "pytrec_eval" + "pytrec_eval", + "pydantic", + "typing_extensions", + "eval_type_backport", ] @@ -48,11 +51,7 @@ homepage = "https://github.com/embeddings-benchmark/mteb" mteb = "mteb.cmd:main" [project.optional-dependencies] -dev = [ - "ruff>=0.0.254", - "pytest", - "pytest-xdist" -] +dev = 
["ruff>=0.0.254", "pytest", "pytest-xdist"] [tool.setuptools.packages.find] @@ -62,12 +61,8 @@ exclude = ["tests", "results"] target-version = "py38" [tool.ruff.lint] -ignore = [ - "E203", - "E501", - "E741", - "F403" -] +select = ["E4", "E7", "E9", "F"] +ignore = ["E203", "E501", "E741", "F403"] ignore-init-module-imports = true [tool.ruff.lint.pydocstyle] diff --git a/scripts/data/amazon_polarity/create_data.py b/scripts/data/amazon_polarity/create_data.py index ded94a3b31..f01cb17221 100644 --- a/scripts/data/amazon_polarity/create_data.py +++ b/scripts/data/amazon_polarity/create_data.py @@ -1,20 +1,27 @@ +from __future__ import annotations + import os from datasets import load_dataset - from huggingface_hub import create_repo, upload_file - repo_name = "amazon_polarity" create_repo(repo_name, organization="mteb", repo_type="dataset") id2label = {0: "negative", 1: "positive"} raw_dset = load_dataset("amazon_polarity") -raw_dset = raw_dset.map(lambda x: {"text": x["title"] + "\n\n" + x["content"]}, num_proc=4) +raw_dset = raw_dset.map( + lambda x: {"text": x["title"] + "\n\n" + x["content"]}, num_proc=4 +) raw_dset = raw_dset.map(lambda x: {"label_text": id2label[x["label"]]}, num_proc=4) raw_dset = raw_dset.remove_columns(["title", "content"]) for split, dset in raw_dset.items(): save_path = f"{split}.jsonl" dset.to_json(save_path) - upload_file(path_or_fileobj=save_path, path_in_repo=save_path, repo_id="mteb/" + repo_name, repo_type="dataset") + upload_file( + path_or_fileobj=save_path, + path_in_repo=save_path, + repo_id="mteb/" + repo_name, + repo_type="dataset", + ) os.system(f"rm {save_path}") diff --git a/scripts/data/amazon_reviews_multi/create_data.py b/scripts/data/amazon_reviews_multi/create_data.py index a0f377be79..b533276a75 100644 --- a/scripts/data/amazon_reviews_multi/create_data.py +++ b/scripts/data/amazon_reviews_multi/create_data.py @@ -1,10 +1,10 @@ +from __future__ import annotations + import os from datasets import load_dataset - from huggingface_hub import create_repo, upload_file - _LANGUAGES = { "de": "German", "en": "English", @@ -18,19 +18,32 @@ for lang in _LANGUAGES: raw_dset = load_dataset("amazon_reviews_multi", lang) - raw_dset = raw_dset.map(lambda x: {"text": x["review_title"] + "\n\n" + x["review_body"]}, num_proc=4) + raw_dset = raw_dset.map( + lambda x: {"text": x["review_title"] + "\n\n" + x["review_body"]}, num_proc=4 + ) raw_dset = raw_dset.map(lambda x: {"label": x["stars"] - 1}, num_proc=4) raw_dset = raw_dset.map(lambda x: {"label_text": str(x["label"])}, num_proc=4) raw_dset = raw_dset.rename_column("review_id", "id") raw_dset = raw_dset.remove_columns( - ["product_id", "reviewer_id", "review_body", "review_title", "language", "product_category", "stars"] + [ + "product_id", + "reviewer_id", + "review_body", + "review_title", + "language", + "product_category", + "stars", + ] ) for split, dset in raw_dset.items(): save_path = f"{lang}/{split}.jsonl" dset.to_json(save_path) upload_file( - path_or_fileobj=save_path, path_in_repo=save_path, repo_id="mteb/" + repo_name, repo_type="dataset" + path_or_fileobj=save_path, + path_in_repo=save_path, + repo_id="mteb/" + repo_name, + repo_type="dataset", ) os.system(f"rm -r {lang}") diff --git a/scripts/data/arxiv/script_clustering.py b/scripts/data/arxiv/script_clustering.py index 3822fc5dac..8497ec417d 100644 --- a/scripts/data/arxiv/script_clustering.py +++ b/scripts/data/arxiv/script_clustering.py @@ -1,14 +1,14 @@ +from __future__ import annotations + import gzip import os from collections import 
Counter import datasets +import jsonlines import numpy as np from tqdm import tqdm -import jsonlines - - np.random.seed(28042000) d = datasets.load_dataset("mteb/raw_arxiv")["train"] diff --git a/scripts/data/arxiv/script_raw.py b/scripts/data/arxiv/script_raw.py index 69ea519bf3..e564a33488 100644 --- a/scripts/data/arxiv/script_raw.py +++ b/scripts/data/arxiv/script_raw.py @@ -3,13 +3,13 @@ only keep useful information """ +from __future__ import annotations + import gzip import json -from tqdm import tqdm - import jsonlines - +from tqdm import tqdm with open("archive/arxiv-metadata-oai-snapshot.json", "r") as file: old_lines = file.readlines() diff --git a/scripts/data/biorxiv/script_clustering.py b/scripts/data/biorxiv/script_clustering.py index 981369d767..9add4315d8 100644 --- a/scripts/data/biorxiv/script_clustering.py +++ b/scripts/data/biorxiv/script_clustering.py @@ -1,13 +1,13 @@ +from __future__ import annotations + import gzip import os import datasets +import jsonlines import numpy as np from tqdm import tqdm -import jsonlines - - np.random.seed(28042000) d = datasets.load_dataset("mteb/raw_biorxiv")["train"] diff --git a/scripts/data/biorxiv/script_raw.py b/scripts/data/biorxiv/script_raw.py index 0fa9a81bd9..fc432ed9c0 100644 --- a/scripts/data/biorxiv/script_raw.py +++ b/scripts/data/biorxiv/script_raw.py @@ -2,13 +2,13 @@ Fetch data from https://api.biorxiv.org/ and keep useful information """ -import gzip +from __future__ import annotations -from tqdm import tqdm +import gzip import jsonlines import requests - +from tqdm import tqdm api = "https://api.biorxiv.org/details/biorxiv/2021-01-01/2022-05-10/" diff --git a/scripts/data/bucc/create_data.py b/scripts/data/bucc/create_data.py index e02e05925b..c642140133 100644 --- a/scripts/data/bucc/create_data.py +++ b/scripts/data/bucc/create_data.py @@ -1,10 +1,11 @@ +from __future__ import annotations + import gzip import json import re from huggingface_hub import upload_file - repo_name = "bucc-bitext-mining" # create_repo(repo_name, organization="mteb", repo_type="dataset") diff --git a/scripts/data/create_task_table.py b/scripts/data/create_task_table.py index ebaafd4cd7..ade37e3ad4 100644 --- a/scripts/data/create_task_table.py +++ b/scripts/data/create_task_table.py @@ -1,4 +1,7 @@ +from __future__ import annotations + import os + from mteb import MTEB HEADER = "| Name | Hub URL | Description | Type | Category | #Languages | Train #Samples | Dev #Samples | Test #Samples | Avg. chars / train | Avg. chars / dev | Avg. 
chars / test" @@ -80,14 +83,16 @@ def get_ds_stats_beir(hf_hub_name): for split in lens.keys(): try: corpus, queries, relevant_docs = BeirDataLoader(path).load(split=split) - except: # split does not exist + except: # split does not exist # noqa: E722 continue # + 1 for space added between Title & Text by default in BEIR avg_lens_c = [len(v["text"]) + len(v["title"]) + 1 for v in corpus.values()] avg_lens_q = [len(v) for v in queries.values()] lens[split].extend(avg_lens_c) lens[split].extend(avg_lens_q) - avg_lens = {k: round(sum(lens[k]) / len(lens[k]), 1) if lens[k] else 0 for k in lens} + avg_lens = { + k: round(sum(lens[k]) / len(lens[k]), 1) if lens[k] else 0 for k in lens + } return ( len(lens["train"]), len(lens["dev"]), @@ -117,7 +122,9 @@ def get_ds_stats(hf_hub_name): else: raise ValueError(f"Unknown type {type(ds[split][k])}") - avg_lens = {k: round(sum(lens[k]) / len(lens[k]), 1) if lens[k] else 0 for k in lens} + avg_lens = { + k: round(sum(lens[k]) / len(lens[k]), 1) if lens[k] else 0 for k in lens + } return ( len(lens["train"]), len(lens[dev_key]), @@ -131,21 +138,21 @@ def get_ds_stats(hf_hub_name): # Select all tasks for task in MTEB().tasks: print("Task: ", task) - if "hf_hub_name" in task.description: - hub_name = hub_url = task.description.get("hf_hub_name") + if "hf_hub_name" in task.metadata_dict: + hub_name = hub_url = task.metadata_dict.get("hf_hub_name") ds_stats = get_ds_stats(hub_name.split("/")[-1]) - elif "beir_name" in task.description: - hub_name = hub_url = "BeIR/" + task.description.get("beir_name") + elif "beir_name" in task.metadata_dict: + hub_name = hub_url = "BeIR/" + task.metadata_dict.get("beir_name") ds_stats = get_ds_stats_beir("/".join(hub_name.split("/")[1:])) if "cqadupstack" in hub_name: hub_url = "BeIR/cqadupstack-qrels" TABLE_STRING += "\n" + ONE_LINE.format( - f"[{task.description['name']}]({task.description['reference']})", + f"[{task.metadata_dict['name']}]({task.metadata_dict['reference']})", f"[{hub_name}](https://huggingface.co/datasets/{hub_url})", - task.description["description"], - task.description["type"], - task.description["category"], - len(task.description["eval_langs"]), + task.metadata_dict["description"], + task.metadata_dict["type"], + task.metadata_dict["category"], + len(task.metadata_dict["eval_langs"]), *ds_stats, ) diff --git a/scripts/data/germanquad/process_data.py b/scripts/data/germanquad/process_data.py index e8fe907d3a..8fa4db4919 100644 --- a/scripts/data/germanquad/process_data.py +++ b/scripts/data/germanquad/process_data.py @@ -1,8 +1,9 @@ """See clarin-knext/arguana-pl, clarin-knext/arguana-pl-qrels and beir.datasets.data_loader_hf.HFDataLoader for BEIR format.""" -import os -from datasets import load_dataset, Dataset, DatasetDict, Features, Value +from __future__ import annotations + +from datasets import Dataset, DatasetDict, Features, Value, load_dataset dataset = load_dataset("deepset/germanquad") dataset.pop("train") @@ -18,42 +19,27 @@ # Check if the context is already in the dictionary if item["context"] not in context_to_id: context_to_id[item["context"]] = "c" + str(item["id"]) - entry = { - "_id": context_to_id[item["context"]], - "text": item["context"] - } + entry = {"_id": context_to_id[item["context"]], "text": item["context"]} corpus_data["_id"].append(entry["_id"]) corpus_data["text"].append(entry["text"]) for item in dataset["test"]: - entry = { - "_id": "q" + str(item["id"]), - "text": item["question"] - } + entry = {"_id": "q" + str(item["id"]), "text": item["question"]} 
queries_data["_id"].append(entry["_id"]) queries_data["text"].append(entry["text"]) # this maps queries to relevant documents for item in dataset["test"]: corpus_id = context_to_id[item["context"]] - entry = { - "query-id": "q" + str(item["id"]), - "corpus-id": corpus_id, - "score": 1 - } + entry = {"query-id": "q" + str(item["id"]), "corpus-id": corpus_id, "score": 1} qrels_data["query-id"].append(entry["query-id"]) qrels_data["corpus-id"].append(entry["corpus-id"]) qrels_data["score"].append(entry["score"]) -corpus_features = Features({ - "_id": Value("string"), - "text": Value("string") -}) -qrels_features = Features({ - "query-id": Value("string"), - "corpus-id": Value("string"), - "score": Value("int32") -}) +corpus_features = Features({"_id": Value("string"), "text": Value("string")}) +qrels_features = Features( + {"query-id": Value("string"), "corpus-id": Value("string"), "score": Value("int32")} +) corpus_dataset = Dataset.from_dict(corpus_data, features=corpus_features) queries_dataset = Dataset.from_dict(queries_data, features=corpus_features) qrels_dataset = Dataset.from_dict(qrels_data, features=qrels_features) diff --git a/scripts/data/hal/create_data.py b/scripts/data/hal/create_data.py index afc61c62e8..3be0599535 100644 --- a/scripts/data/hal/create_data.py +++ b/scripts/data/hal/create_data.py @@ -1,10 +1,10 @@ -import os +from __future__ import annotations -from huggingface_hub import create_repo, upload_file +import os import pandas as pd import requests - +from huggingface_hub import create_repo, upload_file MAX_OUTPUT = 10000 NB_RESULTS = 100000 @@ -12,7 +12,7 @@ REPO_NAME = "clustering-hal-s2s" SAVE_PATH = "test.jsonl" -df_papers = pd.DataFrame(columns=["hal_id","title","domain"]) +df_papers = pd.DataFrame(columns=["hal_id", "title", "domain"]) start_index = 0 @@ -21,30 +21,29 @@ "GET", f"https://api.archives-ouvertes.fr/search/?q=*:*&wt=json&fl=halId_s,title_s,level0_domain_s&fq=language_s:fr&fq=submittedDateY_i:[2000%20TO%20*]&rows={MAX_OUTPUT}&start={start_index}", ) - if "response" in response.json() : + if "response" in response.json(): papers = response.json()["response"]["docs"] - for paper in papers : + for paper in papers: if ("title_s" in paper) and ("level0_domain_s" in paper): paper_info = { "hal_id": paper["halId_s"], "title": paper["title_s"][0], - "domain": paper["level0_domain_s"][0] + "domain": paper["level0_domain_s"][0], } - df_papers = pd.concat([df_papers, pd.DataFrame([paper_info])], ignore_index=True) + df_papers = pd.concat( + [df_papers, pd.DataFrame([paper_info])], ignore_index=True + ) start_index += MAX_OUTPUT df_papers = df_papers.drop_duplicates() df_papers.to_json(SAVE_PATH, orient="records", lines=True) -create_repo( - ORGANIZATION + REPO_NAME, - repo_type="dataset" -) +create_repo(ORGANIZATION + REPO_NAME, repo_type="dataset") upload_file( path_or_fileobj=SAVE_PATH, path_in_repo=SAVE_PATH, repo_id=ORGANIZATION + REPO_NAME, - repo_type="dataset" + repo_type="dataset", ) os.system(f"rm {SAVE_PATH}") diff --git a/scripts/data/imdb/create_data.py b/scripts/data/imdb/create_data.py index 1c21339984..f8bed910f1 100644 --- a/scripts/data/imdb/create_data.py +++ b/scripts/data/imdb/create_data.py @@ -1,10 +1,10 @@ +from __future__ import annotations + import os from datasets import load_dataset - from huggingface_hub import create_repo, upload_file - repo_name = "imdb" create_repo(repo_name, organization="mteb", repo_type="dataset") @@ -14,5 +14,10 @@ for split, dset in raw_dset.items(): save_path = f"{split}.jsonl" dset.to_json(save_path) - 
upload_file(path_or_fileobj=save_path, path_in_repo=save_path, repo_id="mteb/" + repo_name, repo_type="dataset") + upload_file( + path_or_fileobj=save_path, + path_in_repo=save_path, + repo_id="mteb/" + repo_name, + repo_type="dataset", + ) os.system(f"rm {save_path}") diff --git a/scripts/data/medrxiv/script_clustering.py b/scripts/data/medrxiv/script_clustering.py index 1715bda077..d41f9e7067 100644 --- a/scripts/data/medrxiv/script_clustering.py +++ b/scripts/data/medrxiv/script_clustering.py @@ -1,13 +1,13 @@ +from __future__ import annotations + import gzip import os import datasets +import jsonlines import numpy as np from tqdm import tqdm -import jsonlines - - np.random.seed(28042000) d = datasets.load_dataset("mteb/raw_medrxiv")["train"] diff --git a/scripts/data/medrxiv/script_raw.py b/scripts/data/medrxiv/script_raw.py index 7e895afc74..3afc8c2751 100644 --- a/scripts/data/medrxiv/script_raw.py +++ b/scripts/data/medrxiv/script_raw.py @@ -2,13 +2,13 @@ Fetch data from https://api.biorxiv.org/ and keep useful information """ -import gzip +from __future__ import annotations -from tqdm import tqdm +import gzip import jsonlines import requests - +from tqdm import tqdm api = "https://api.biorxiv.org/details/medrxiv/2021-01-01/2022-05-10/" diff --git a/scripts/data/mind/prepare_data.py b/scripts/data/mind/prepare_data.py index 377c04cc92..1c441bdf5e 100644 --- a/scripts/data/mind/prepare_data.py +++ b/scripts/data/mind/prepare_data.py @@ -1,15 +1,20 @@ +from __future__ import annotations + import os import pandas as pd - -df_news = pd.read_csv("scripts/mind/data/MINDsmall_train/news.tsv", sep="\t", header=None) +df_news = pd.read_csv( + "scripts/mind/data/MINDsmall_train/news.tsv", sep="\t", header=None +) df_news = df_news[[0, 3]] df_news.columns = ["id", "text"] df_news.index = df_news["id"] -df_behaviours = pd.read_csv("scripts/mind/data/MINDsmall_train/behaviors.tsv", sep="\t", header=None) +df_behaviours = pd.read_csv( + "scripts/mind/data/MINDsmall_train/behaviors.tsv", sep="\t", header=None +) df_behaviours = df_behaviours[[0, 3, 4]] df_behaviours.columns = ["id", "query", "data"] df_behaviours.dropna(inplace=True) diff --git a/scripts/data/redditp2p/script_clustering.py b/scripts/data/redditp2p/script_clustering.py index 8d7f7374aa..5b9aa775d9 100644 --- a/scripts/data/redditp2p/script_clustering.py +++ b/scripts/data/redditp2p/script_clustering.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import gzip import os import random @@ -37,7 +39,9 @@ # Weigh by counts to reduce noise from random poorly defined subreddits # For 10 labels, ~85K samples; For 100 labels ~850K - labels = random.choices(list(unique_to_count.keys()), weights=unique_to_count.values(), k=num_labels) + labels = random.choices( + list(unique_to_count.keys()), weights=unique_to_count.values(), k=num_labels + ) sub_ds = ds.filter(lambda x: x["subreddit"] in labels).shuffle() if len(sub_ds) < MIN_SAMPLES: continue diff --git a/scripts/data/stackexchangep2p/script_clustering.py b/scripts/data/stackexchangep2p/script_clustering.py index 8cebb9a49b..f6929036ff 100644 --- a/scripts/data/stackexchangep2p/script_clustering.py +++ b/scripts/data/stackexchangep2p/script_clustering.py @@ -1,16 +1,18 @@ +from __future__ import annotations + import gzip import os import datasets +import jsonlines import numpy as np from tqdm import tqdm -import jsonlines - - np.random.seed(28042000) -d = datasets.load_dataset("flax-sentence-embeddings/stackexchange_title_body_jsonl")["validation"] +d = 
datasets.load_dataset("flax-sentence-embeddings/stackexchange_title_body_jsonl")[ + "validation" +] # d = d.select(range(1000)) diff --git a/scripts/data/sts22-crosslingual-sts/create_data.py b/scripts/data/sts22-crosslingual-sts/create_data.py index 6e9d88b408..72f067de5f 100644 --- a/scripts/data/sts22-crosslingual-sts/create_data.py +++ b/scripts/data/sts22-crosslingual-sts/create_data.py @@ -7,13 +7,15 @@ mv 6798bbee-77fa-452d-bde2-96b8631acb5d final_evaluation_data.csv python3 -m semeval_8_2022_ia_downloader.cli --links_file=final_evaluation_data.csv --dump_dir=test """ + +from __future__ import annotations + import glob import json import os import matplotlib.pyplot as plt import pandas as pd - from huggingface_hub import create_repo, upload_file # Prepare the data @@ -66,5 +68,10 @@ save_path = lang + "/train.jsonl" lang_df.to_json(save_path, orient="records", lines=True) - upload_file(path_or_fileobj=save_path, path_in_repo=save_path, repo_id="mteb/" + repo_name, repo_type="dataset") + upload_file( + path_or_fileobj=save_path, + path_in_repo=save_path, + repo_id="mteb/" + repo_name, + repo_type="dataset", + ) os.system(f"rm {save_path}") diff --git a/scripts/data/summeval_fr/create_data.py b/scripts/data/summeval_fr/create_data.py index 0075ef1f54..b6f9417e33 100644 --- a/scripts/data/summeval_fr/create_data.py +++ b/scripts/data/summeval_fr/create_data.py @@ -1,29 +1,32 @@ -import json -import os +from __future__ import annotations -from huggingface_hub import create_repo, upload_file +import json +import os import datasets import requests +from huggingface_hub import create_repo, upload_file # API key DeepL to set before running the script with the command 'export DEEPL_API_KEY=***' -DEEPL_API_KEY= os.environ.get('DEEPL_AUTH_KEY') +DEEPL_API_KEY = os.environ.get("DEEPL_AUTH_KEY") ORGANIZATION = "lyon-nlp/" REPO_NAME = "summarization-summeval-fr-p2p" SAVE_PATH = "test.json" HEADERS = { - 'Authorization': f"DeepL-Auth-Key {DEEPL_API_KEY}", - 'Content-Type': 'application/x-www-form-urlencoded', + "Authorization": f"DeepL-Auth-Key {DEEPL_API_KEY}", + "Content-Type": "application/x-www-form-urlencoded", } def translate_with_deepl(text: str) -> str: data = { - 'text': text, - 'target_lang': 'FR', + "text": text, + "target_lang": "FR", } - response = requests.post('https://api.deepl.com/v2/translate', headers=HEADERS, data=data) - return response.json()['translations'][0]['text'] + response = requests.post( + "https://api.deepl.com/v2/translate", headers=HEADERS, data=data + ) + return response.json()["translations"][0]["text"] summeval = datasets.load_dataset("mteb/summeval")["test"] @@ -35,29 +38,26 @@ def translate_with_deepl(text: str) -> str: trad["text"] = translate_with_deepl(line["text"]) machine_summaries = [] - for machine_sum in line['machine_summaries']: + for machine_sum in line["machine_summaries"]: machine_summaries.append(translate_with_deepl(machine_sum)) - trad['machine_summaries'] = machine_summaries + trad["machine_summaries"] = machine_summaries human_summaries = [] - for human_sum in line['human_summaries']: + for human_sum in line["human_summaries"]: human_summaries.append(translate_with_deepl(human_sum)) - trad['human_summaries'] = human_summaries + trad["human_summaries"] = human_summaries trads.append(trad) - with open(SAVE_PATH, "w", encoding='utf8') as final: + with open(SAVE_PATH, "w", encoding="utf8") as final: json.dump(trads, final, ensure_ascii=False) -create_repo( - ORGANIZATION + REPO_NAME, - repo_type="dataset" -) +create_repo(ORGANIZATION + 
REPO_NAME, repo_type="dataset") upload_file( path_or_fileobj=SAVE_PATH, path_in_repo=SAVE_PATH, repo_id=ORGANIZATION + REPO_NAME, - repo_type="dataset" + repo_type="dataset", ) os.system(f"rm {SAVE_PATH}") diff --git a/scripts/data/toxic_conversations_50k/create_data.py b/scripts/data/toxic_conversations_50k/create_data.py index d7d26fe694..0887652440 100644 --- a/scripts/data/toxic_conversations_50k/create_data.py +++ b/scripts/data/toxic_conversations_50k/create_data.py @@ -1,10 +1,11 @@ +from __future__ import annotations + import json import random from collections import Counter import pandas as pd - df = pd.read_csv("original.csv") print(df) diff --git a/scripts/merge_cqadupstack.py b/scripts/merge_cqadupstack.py index a7a6bb4a8b..2fb0c79fe5 100644 --- a/scripts/merge_cqadupstack.py +++ b/scripts/merge_cqadupstack.py @@ -2,6 +2,9 @@ Merges CQADupstack subset results Usage: python merge_cqadupstack.py path_to_results_folder """ + +from __future__ import annotations + import glob import json import logging @@ -55,12 +58,16 @@ if metric == "evaluation_time": score = all_results[split][metric] + score elif metric not in NOAVG_KEYS: - score = all_results[split][metric] + score * 1 / len(TASK_LIST_CQA) + score = all_results[split][metric] + score * 1 / len( + TASK_LIST_CQA + ) all_results[split][metric] = score all_results["mteb_dataset_name"] = "CQADupstackRetrieval" logger.info("Saving ", all_results) - with open(os.path.join(results_folder, "CQADupstackRetrieval.json"), "w", encoding="utf-8") as f: + with open( + os.path.join(results_folder, "CQADupstackRetrieval.json"), "w", encoding="utf-8" + ) as f: json.dump(all_results, f, indent=4) else: logger.warning( diff --git a/scripts/mteb_meta.py b/scripts/mteb_meta.py index adc10a103a..d7094fdd50 100644 --- a/scripts/mteb_meta.py +++ b/scripts/mteb_meta.py @@ -1,7 +1,7 @@ """ Usage: python mteb_meta.py path_to_results_folder -Creates evaluation results metadata for the model card. +Creates evaluation results metadata for the model card. E.g. --- tags: @@ -23,6 +23,8 @@ --- """ +from __future__ import annotations + import json import logging import os @@ -50,9 +52,29 @@ # Use "train" split instead TRAIN_SPLIT = ["DanishPoliticalCommentsClassification"] # Use "validation" split instead -VALIDATION_SPLIT = ["AFQMC", "Cmnli", "IFlyTek", "TNews", "MSMARCO", "MultilingualSentiment", "Ocnli"] +VALIDATION_SPLIT = [ + "AFQMC", + "Cmnli", + "IFlyTek", + "TNews", + "MSMARCO", + "MultilingualSentiment", + "Ocnli", +] # Use "dev" split instead -DEV_SPLIT = ["CmedqaRetrieval", "CovidRetrieval", "DuRetrieval", "EcomRetrieval", "MedicalRetrieval", "MMarcoReranking", "MMarcoRetrieval", "MSMARCO", "T2Reranking", "T2Retrieval", "VideoRetrieval"] +DEV_SPLIT = [ + "CmedqaRetrieval", + "CovidRetrieval", + "DuRetrieval", + "EcomRetrieval", + "MedicalRetrieval", + "MMarcoReranking", + "MMarcoRetrieval", + "MSMARCO", + "T2Reranking", + "T2Retrieval", + "VideoRetrieval", +] MARKER = "---" TAGS = "tags:" @@ -70,11 +92,19 @@ for ds_name, res_dict in sorted(all_results.items()): mteb_desc = ( - MTEB(tasks=[ds_name.replace("CQADupstackRetrieval", "CQADupstackAndroidRetrieval")]).tasks[0].description + MTEB( + tasks=[ + ds_name.replace("CQADupstackRetrieval", "CQADupstackAndroidRetrieval") + ] + ) + .tasks[0] + .metadata_dict ) hf_hub_name = mteb_desc.get("hf_hub_name", mteb_desc.get("beir_name")) if "beir_name" in mteb_desc: - logger.warning(f"`beir_name` is deprecated and will be removed in the future. 
New result files contain `hf_hub_name` instead.") + logger.warning( + "`beir_name` is deprecated and will be removed in the future. New result files contain `hf_hub_name` instead." + ) if ds_name == "CQADupstackRetrieval" in ds_name: hf_hub_name = "mteb/cqadupstack" mteb_type = mteb_desc["type"] diff --git a/scripts/run_mteb_chinese.py b/scripts/run_mteb_chinese.py index a76d03a8f8..f1a04fb3c2 100644 --- a/scripts/run_mteb_chinese.py +++ b/scripts/run_mteb_chinese.py @@ -1,11 +1,14 @@ """Example script for benchmarking all datasets constituting the MTEB Chinese leaderboard & average scores""" -import logging +from __future__ import annotations + import functools +import logging -from mteb import MTEB from sentence_transformers import SentenceTransformer +from mteb import MTEB + logging.basicConfig(level=logging.INFO) logger = logging.getLogger("main") diff --git a/scripts/run_mteb_english.py b/scripts/run_mteb_english.py index e4bd869b36..8b2d50950d 100644 --- a/scripts/run_mteb_english.py +++ b/scripts/run_mteb_english.py @@ -1,10 +1,13 @@ """Example script for benchmarking all datasets constituting the MTEB English leaderboard & average scores""" +from __future__ import annotations + import logging -from mteb import MTEB from sentence_transformers import SentenceTransformer +from mteb import MTEB + logging.basicConfig(level=logging.INFO) logger = logging.getLogger("main") @@ -109,5 +112,9 @@ for task in TASK_LIST: logger.info(f"Running task: {task}") eval_splits = ["dev"] if task == "MSMARCO" else ["test"] - evaluation = MTEB(tasks=[task], task_langs=["en"]) # Remove "en" for running all languages - evaluation.run(model, output_folder=f"results/{model_name}", eval_splits=eval_splits) + evaluation = MTEB( + tasks=[task], task_langs=["en"] + ) # Remove "en" for running all languages + evaluation.run( + model, output_folder=f"results/{model_name}", eval_splits=eval_splits + ) diff --git a/scripts/run_mteb_french.py b/scripts/run_mteb_french.py index 9f5a37b171..fdec20d14f 100644 --- a/scripts/run_mteb_french.py +++ b/scripts/run_mteb_french.py @@ -1,5 +1,7 @@ """Example script for benchmarking all datasets constituting the MTEB French leaderboard & average scores""" +from __future__ import annotations + import logging from sentence_transformers import SentenceTransformer @@ -34,31 +36,23 @@ "PawsX", ] -TASK_LIST_RERANKING = [ - "SyntecReranking", - "AlloprofReranking" -] +TASK_LIST_RERANKING = ["SyntecReranking", "AlloprofReranking"] TASK_LIST_RETRIEVAL = [ - "AlloprofRetrieval", - "BSARDRetrieval", + "AlloprofRetrieval", + "BSARDRetrieval", "SyntecRetrieval", "XPQARetrieval", "MintakaRetrieval", ] -TASK_LIST_STS = [ - "SummEvalFr", - "STSBenchmarkMultilingualSTS", - "STS22", - "SICKFr" -] +TASK_LIST_STS = ["SummEvalFr", "STSBenchmarkMultilingualSTS", "STS22", "SICKFr"] TASK_LIST_BITEXTMINING = [ "DiaBLaBitextMining", "FloresBitextMining", "TatoebaBitextMining", - "BUCCBitextMining" + "BUCCBitextMining", ] @@ -78,5 +72,7 @@ logger.info(f"Task list : {TASK_LIST}") for task in TASK_LIST: logger.info(f"Running task: {task}") - evaluation = MTEB(tasks=[task], task_langs=["fr"]) # Remove "fr" for running all languages + evaluation = MTEB( + tasks=[task], task_langs=["fr"] + ) # Remove "fr" for running all languages evaluation.run(model, output_folder=f"results/{model_name}") diff --git a/scripts/run_mteb_german.py b/scripts/run_mteb_german.py index cae1b3e2fa..aba9854c0f 100644 --- a/scripts/run_mteb_german.py +++ b/scripts/run_mteb_german.py @@ -1,10 +1,13 @@ """Example script for benchmarking 
German Context models.""" +from __future__ import annotations + import logging -from mteb import MTEB from sentence_transformers import SentenceTransformer +from mteb import MTEB + logging.basicConfig(level=logging.INFO) logger = logging.getLogger("main") @@ -17,7 +20,12 @@ "MassiveScenarioClassification", ] -TASK_LIST_CLUSTERING = ["BlurbsClusteringP2P", "BlurbsClusteringS2S", "TenKGnadClusteringP2P", "TenKGnadClusteringS2S"] +TASK_LIST_CLUSTERING = [ + "BlurbsClusteringP2P", + "BlurbsClusteringS2S", + "TenKGnadClusteringP2P", + "TenKGnadClusteringS2S", +] TASK_LIST_PAIR_CLASSIFICATION = ["PawsX"] @@ -40,4 +48,8 @@ model = SentenceTransformer(model_name) evaluation = MTEB(tasks=TASK_LIST, task_langs=["de"]) -evaluation.run(model, overwrite_results=True, output_folder=f"results/de/{model_name.split('/')[-1]}") +evaluation.run( + model, + overwrite_results=True, + output_folder=f"results/de/{model_name.split('/')[-1]}", +) diff --git a/scripts/run_mteb_korean.py b/scripts/run_mteb_korean.py index ef085adc68..aa097e0e56 100644 --- a/scripts/run_mteb_korean.py +++ b/scripts/run_mteb_korean.py @@ -1,34 +1,28 @@ """Example script for benchmarking all datasets constituting the MTEB Korean leaderboard & average scores""" +from __future__ import annotations + import logging -from mteb import MTEB from sentence_transformers import SentenceTransformer +from mteb import MTEB + logging.basicConfig(level=logging.INFO) logger = logging.getLogger("main") -TASK_LIST_CLASSIFICATION = [ -] +TASK_LIST_CLASSIFICATION = [] -TASK_LIST_CLUSTERING = [ -] +TASK_LIST_CLUSTERING = [] -TASK_LIST_PAIR_CLASSIFICATION = [ -] +TASK_LIST_PAIR_CLASSIFICATION = [] -TASK_LIST_RERANKING = [ -] +TASK_LIST_RERANKING = [] -TASK_LIST_RETRIEVAL = [ - 'Ko-StrategyQA', - 'Ko-mrtydi', - 'Ko-miracl' -] +TASK_LIST_RETRIEVAL = ["Ko-StrategyQA", "Ko-mrtydi", "Ko-miracl"] -TASK_LIST_STS = [ -] +TASK_LIST_STS = [] TASK_LIST = ( TASK_LIST_CLASSIFICATION @@ -44,5 +38,7 @@ for task in TASK_LIST: logger.info(f"Running task: {task}") - evaluation = MTEB(tasks=[task], task_langs=["ko"]) # Remove "ko" for running all languages + evaluation = MTEB( + tasks=[task], task_langs=["ko"] + ) # Remove "ko" for running all languages evaluation.run(model, output_folder=f"results/{model_name}") diff --git a/scripts/run_mteb_polish.py b/scripts/run_mteb_polish.py index f078b3ff0b..ec0e0177bc 100644 --- a/scripts/run_mteb_polish.py +++ b/scripts/run_mteb_polish.py @@ -1,10 +1,13 @@ """Example script for benchmarking all datasets constituting the MTEB Polish leaderboard & average scores""" +from __future__ import annotations + import logging -from mteb import MTEB from sentence_transformers import SentenceTransformer +from mteb import MTEB + logging.basicConfig(level=logging.INFO) logger = logging.getLogger("main") @@ -15,30 +18,16 @@ "AllegroReviews", "PAC", "MassiveIntentClassification", - "MassiveScenarioClassification" + "MassiveScenarioClassification", ] -clustering_tasks = [ - "8TagsClustering" -] +clustering_tasks = ["8TagsClustering"] -pair_classification_tasks = [ - "SICK-E-PL", - "PPC", - "CDSC-E", - "PSC" -] +pair_classification_tasks = ["SICK-E-PL", "PPC", "CDSC-E", "PSC"] -sts_tasks = [ - "SICK-R-PL", - "CDSC-R", - "STS22" -] +sts_tasks = ["SICK-R-PL", "CDSC-R", "STS22"] -tasks = classification_tasks \ - + clustering_tasks \ - + pair_classification_tasks \ - + sts_tasks +tasks = classification_tasks + clustering_tasks + pair_classification_tasks + sts_tasks model_name = "sdadas/st-polish-paraphrase-from-distilroberta" model = 
SentenceTransformer(model_name) diff --git a/tests/test_ClusteringEvaluator.py b/tests/test_ClusteringEvaluator.py index 439981b1e9..0769a088cd 100644 --- a/tests/test_ClusteringEvaluator.py +++ b/tests/test_ClusteringEvaluator.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import List import numpy as np diff --git a/tests/test_PairClassificationEvaluator.py b/tests/test_PairClassificationEvaluator.py index 130e063858..0435e99526 100644 --- a/tests/test_PairClassificationEvaluator.py +++ b/tests/test_PairClassificationEvaluator.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest from mteb.evaluation.evaluators import PairClassificationEvaluator @@ -21,8 +23,10 @@ def test_f1(self): labels = [0, 0, 0, 0, 1, 0, 0, 0, 1, 0] high_score_more_similar = True - f1, precision, recall, f1_threshold = PairClassificationEvaluator.find_best_f1_and_threshold( - scores, labels, high_score_more_similar + f1, precision, recall, f1_threshold = ( + PairClassificationEvaluator.find_best_f1_and_threshold( + scores, labels, high_score_more_similar + ) ) assert f1 == pytest.approx(0.66666, TOL) assert precision == pytest.approx(1.0, TOL) @@ -33,5 +37,7 @@ def test_ap(self): scores = [6.12, 5.39, 5.28, 5.94, 6.34, 6.47, 7.88, 6.62, 8.04, 5.9] labels = [0, 0, 0, 0, 1, 0, 0, 0, 1, 0] high_score_more_similar = True - ap = PairClassificationEvaluator.ap_score(scores, labels, high_score_more_similar) + ap = PairClassificationEvaluator.ap_score( + scores, labels, high_score_more_similar + ) assert ap == pytest.approx(0.7, TOL) diff --git a/tests/test_RerankingEvaluator.py b/tests/test_RerankingEvaluator.py index db3ef8c4f3..2fa7619f15 100644 --- a/tests/test_RerankingEvaluator.py +++ b/tests/test_RerankingEvaluator.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest from mteb.evaluation.evaluators import RerankingEvaluator @@ -17,12 +19,20 @@ def test_mrr_at_k(self): is_relevant = [1, 1, 1, 0, 0, 0, 0, 0, 0] pred_ranking = [5, 2, 6, 1, 3, 4, 7, 8, 9] - assert self.evaluator.mrr_at_k_score(is_relevant, pred_ranking, 10) == pytest.approx(0.5, TOL) - assert self.evaluator.mrr_at_k_score(is_relevant, pred_ranking, 3) == pytest.approx(0.5, TOL) - assert self.evaluator.mrr_at_k_score(is_relevant, pred_ranking, 1) == pytest.approx(0, TOL) + assert self.evaluator.mrr_at_k_score( + is_relevant, pred_ranking, 10 + ) == pytest.approx(0.5, TOL) + assert self.evaluator.mrr_at_k_score( + is_relevant, pred_ranking, 3 + ) == pytest.approx(0.5, TOL) + assert self.evaluator.mrr_at_k_score( + is_relevant, pred_ranking, 1 + ) == pytest.approx(0, TOL) def test_map(self): is_relevant = [1, 1, 1, 0, 0] pred_scores = [0.75, 0.93, 0.85, 0.76, 0.75] - assert self.evaluator.ap_score(is_relevant, pred_scores) == pytest.approx(0.86666, TOL) + assert self.evaluator.ap_score(is_relevant, pred_scores) == pytest.approx( + 0.86666, TOL + ) diff --git a/tests/test_RetrievalEvaluator.py b/tests/test_RetrievalEvaluator.py index b68519e01f..78062d9fb4 100644 --- a/tests/test_RetrievalEvaluator.py +++ b/tests/test_RetrievalEvaluator.py @@ -1,9 +1,10 @@ -import pytest +from __future__ import annotations from mteb.evaluation.evaluators import RetrievalEvaluator TOL = 0.0001 + class TestRetrievalEvaluator: def setup_method(self): """setup any state tied to the execution of the given method in a @@ -29,7 +30,7 @@ def test_metrics_at_k(self): [1, 2, 3], ) - assert ndcg == {'NDCG@1': 0.5, 'NDCG@2': 0.30657, 'NDCG@3': 0.30657} - assert _map == {'MAP@1': 0.25, 'MAP@2': 0.25, 'MAP@3': 0.25} - assert recall 
== {'Recall@1': 0.25, 'Recall@2': 0.25, 'Recall@3': 0.25} - assert precision == {'P@1': 0.5, 'P@2': 0.25, 'P@3': 0.16667} + assert ndcg == {"NDCG@1": 0.5, "NDCG@2": 0.30657, "NDCG@3": 0.30657} + assert _map == {"MAP@1": 0.25, "MAP@2": 0.25, "MAP@3": 0.25} + assert recall == {"Recall@1": 0.25, "Recall@2": 0.25, "Recall@3": 0.25} + assert precision == {"P@1": 0.5, "P@2": 0.25, "P@3": 0.16667} diff --git a/tests/test_all_abstasks.py b/tests/test_all_abstasks.py index ff578b54f4..fca6c0133a 100644 --- a/tests/test_all_abstasks.py +++ b/tests/test_all_abstasks.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging from typing import Union @@ -9,7 +11,6 @@ from mteb.tasks.BitextMining import BUCCBitextMining logging.basicConfig(level=logging.INFO) -from mteb import MTEB def test_two_mteb_tasks():
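The reranking tests above are easier to sanity-check with a concrete reference. The sketch below reproduces the MRR@k values asserted in tests/test_RerankingEvaluator.py; it is an illustrative re-derivation, not mteb's RerankingEvaluator.mrr_at_k_score, and it assumes (as the test values suggest) that pred_ranking lists document indices in ranked order while is_relevant flags relevance per document index.

from __future__ import annotations

def mrr_at_k(is_relevant: list[int], pred_ranking: list[int], k: int) -> float:
    """Reciprocal rank of the first relevant document among the top k, else 0.

    Illustrative sketch only; not the mteb implementation.
    """
    for rank, doc_idx in enumerate(pred_ranking[:k], start=1):
        if is_relevant[doc_idx]:
            return 1.0 / rank
    return 0.0

is_relevant = [1, 1, 1, 0, 0, 0, 0, 0, 0]   # documents 0-2 are relevant
pred_ranking = [5, 2, 6, 1, 3, 4, 7, 8, 9]  # document indices in ranked order
assert mrr_at_k(is_relevant, pred_ranking, 10) == 0.5  # first hit: doc 2 at rank 2
assert mrr_at_k(is_relevant, pred_ranking, 3) == 0.5   # doc 2 is still within the top 3
assert mrr_at_k(is_relevant, pred_ranking, 1) == 0.0   # doc 5 at rank 1 is not relevant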