Restructuring the readme (#262)
* restructuring the readme

* removed double specification of versions and moved all setup to pyproject.toml

* correctly use flat-layout for the package
KennethEnevoldsen authored Mar 20, 2024
1 parent 8a758bc commit 769157b
Showing 17 changed files with 424 additions and 355 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -133,3 +133,6 @@ dmypy.json

# error logs
error_logs.txt

# tests
tests/results
193 changes: 22 additions & 171 deletions README.md

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions docs/adding_a_model.md
@@ -0,0 +1,11 @@
## Adding a Model to the MTEB Leaderboard

The MTEB Leaderboard is available [here](https://huggingface.co/spaces/mteb/leaderboard). To submit:

1. Run on MTEB: You can reference [scripts/run_mteb_english.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_english.py) for all MTEB English datasets used in the main ranking, or [scripts/run_mteb_chinese.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_chinese.py) for the Chinese ones.
Advanced scripts with different models are available in the [mteb/mtebscripts repo](https://github.com/embeddings-benchmark/mtebscripts). A minimal programmatic alternative is sketched after this list.
2. Format the JSON result files into metadata using the script at `scripts/mteb_meta.py`, for example
`python scripts/mteb_meta.py path_to_results_folder`, which creates a `mteb_metadata.md` file. If you ran CQADupstack retrieval, merge the results first with `python scripts/merge_cqadupstack.py path_to_results_folder`.
3. Copy the content of the `mteb_metadata.md` file to the top of the `README.md` file of your model on the Hub. See [here](https://huggingface.co/Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit/blob/main/README.md) for an example.
4. Hit the Refresh button at the bottom of the leaderboard and you should see your scores 🥇
5. To have the scores appear without refreshing, you can open an issue on the [Community Tab of the LB](https://huggingface.co/spaces/mteb/leaderboard/discussions) and someone will restart the space to cache your average scores. The cache is updated automatically about once a week in any case.
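
For step 1, here is a minimal sketch of running MTEB programmatically instead of the reference scripts; the model name, task names, and output folder below are illustrative placeholders, not requirements:

```python
# Minimal sketch: evaluate an embedding model on a couple of MTEB tasks and
# write one JSON result file per task, ready for scripts/mteb_meta.py.
from sentence_transformers import SentenceTransformer

from mteb import MTEB

model = SentenceTransformer("average_word_embeddings_komninos")  # placeholder model

evaluation = MTEB(tasks=["Banking77Classification", "SciFact"])  # example tasks
evaluation.run(model, output_folder="results/average_word_embeddings_komninos")
```

Point `scripts/mteb_meta.py` at the chosen `output_folder` to generate the metadata in step 2.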
29 changes: 29 additions & 0 deletions docs/contributing.md
@@ -0,0 +1,29 @@
## Contributing to MTEB
We welcome contributions to MTEB, such as new datasets! This section describes how to set up the repository for development.

### Development Installation
If you want to submit a dataset or otherwise contribute to MTEB, you can install the package in development mode:

```bash
git clone https://github.com/embeddings-benchmark/mteb
cd mteb

# create your virtual environment and activate it
make install
```

### Running Tests
To run the tests, you can use the following command:

```bash
make test
# or if you want to run on multiple cores
make test-parallel
```

### Running Linting
To run linting before opening a PR, use the following command:

```bash
make lint
```
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
136 changes: 136 additions & 0 deletions docs/tasks.md

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions mteb/__init__.py
@@ -1,7 +1,9 @@
__version__ = "1.2.1.dev0"
from importlib.metadata import version

from mteb.evaluation import *

__version__ = version("mteb") # fetch version from install metadata


MTEB_MAIN_EN = [
"AmazonCounterfactualClassification",
@@ -70,5 +72,5 @@
"TweetSentimentExtractionClassification",
"TwentyNewsgroupsClustering",
"TwitterSemEval2015",
"TwitterURLCorpus"
"TwitterURLCorpus",
]
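
With this change the package version is declared once in `pyproject.toml` and read back from the installed distribution metadata; a quick sketch of what that means in practice, assuming `mteb` has been installed (for example via `make install`):

```python
# Sketch: __version__ now mirrors the installed distribution metadata,
# so the version string lives only in pyproject.toml.
from importlib.metadata import version

import mteb

assert mteb.__version__ == version("mteb")
print(mteb.__version__)
```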
80 changes: 57 additions & 23 deletions mteb/evaluation/MTEB.py
@@ -4,19 +4,18 @@
import pathlib
import traceback
from datetime import datetime
from importlib.metadata import version
from time import time
from typing import List, Union

import datasets

from .. import __version__
from ..abstasks import *
from ..abstasks import AbsTask, LangMapping
from ..tasks import *

logger = logging.getLogger(__name__)

from typing import List, Union


class MTEB:
def __init__(
@@ -27,7 +26,7 @@ def __init__(
tasks: List[Union[str, AbsTask]] = None,
version=None,
err_logs_path="error_logs.txt",
**kwargs
**kwargs,
):
"""
Create an Evaluation pipeline. The tasks selected
@@ -119,7 +118,9 @@ def _display_tasks(self, task_list, name=None):
if name:
console.rule(f"[bold]{name}\n", style="grey15")
for task_type in self.available_task_types:
current_type_tasks = list(filter(lambda x: x.description["type"] == task_type, task_list))
current_type_tasks = list(
filter(lambda x: x.description["type"] == task_type, task_list)
)
if len(current_type_tasks) == 0:
continue
else:
@@ -138,7 +139,9 @@ def _display_tasks(self, task_list, name=None):
if task.is_crosslingual
else ""
)
console.print(f"{prefix}{name}{category}{multilingual}{crosslingual}")
console.print(
f"{prefix}{name}{category}{multilingual}{crosslingual}"
)
console.print("\n")

@classmethod
@@ -168,31 +171,46 @@ def select_tasks(self, **kwargs):

# If `task_list` is specified, select list of tasks
if self._tasks is not None:
self.tasks = list(filter(lambda x: (x.description["name"] in self._tasks), self.tasks_cls))
self.tasks = list(
filter(lambda x: (x.description["name"] in self._tasks), self.tasks_cls)
)
if len(self.tasks) != len(self._tasks):
tasks_known = set([x.description["name"] for x in self.tasks_cls])
tasks_unknown = set(x for x in self._tasks if isinstance(x, str)) - tasks_known
tasks_unknown = (
set(x for x in self._tasks if isinstance(x, str)) - tasks_known
)
if tasks_unknown:
unknown_str, known_str = ",".join(sorted(list(tasks_unknown))), ",".join(sorted(list(tasks_known)))
logger.warning(f"WARNING: Unknown tasks: {unknown_str}. Known tasks: {known_str}.")
unknown_str, known_str = (
",".join(sorted(list(tasks_unknown))),
",".join(sorted(list(tasks_known))),
)
logger.warning(
f"WARNING: Unknown tasks: {unknown_str}. Known tasks: {known_str}."
)
# add task if subclass of mteb.tasks
self.tasks.extend([x for x in self._tasks if isinstance(x, AbsTask)])
return

# Otherwise use filters to select tasks
filtered_tasks = filter(
lambda x: (self._task_types is None) or (x.description["type"] in self._task_types), self.tasks_cls
lambda x: (self._task_types is None)
or (x.description["type"] in self._task_types),
self.tasks_cls,
)
filtered_tasks = filter(
lambda x: (self._task_categories is None) or (x.description["category"] in self._task_categories),
lambda x: (self._task_categories is None)
or (x.description["category"] in self._task_categories),
filtered_tasks,
)
filtered_tasks = filter(
lambda x: (self._version is None) or (x.description["version"] >= self._version), filtered_tasks
lambda x: (self._version is None)
or (x.description["version"] >= self._version),
filtered_tasks,
)
# keep only tasks with at least one language in the filter
filtered_tasks = filter(
lambda x: (not (self._task_langs)) or (len(set(x.description["eval_langs"]) & set(self._task_langs)) > 0),
lambda x: (not (self._task_langs))
or (len(set(x.description["eval_langs"]) & set(self._task_langs)) > 0),
filtered_tasks,
)

@@ -216,7 +234,7 @@ def run(
eval_splits=None,
overwrite_results=False,
raise_error: bool = True,
**kwargs
**kwargs,
):
"""
Run the evaluation pipeline on the selected tasks.
@@ -251,34 +269,48 @@
evaluation_results = {}
while len(self.tasks) > 0:
task = self.tasks[0]
logger.info(f"\n\n********************** Evaluating {task.description['name']} **********************")
logger.info(
f"\n\n********************** Evaluating {task.description['name']} **********************"
)

# skip evaluation if results folder exists and overwrite_results is False
if output_folder is not None:
save_path = os.path.join(output_folder, f"{task.description['name']}{task.save_suffix}.json")
save_path = os.path.join(
output_folder, f"{task.description['name']}{task.save_suffix}.json"
)
if os.path.exists(save_path) and overwrite_results is False:
logger.warning(f"WARNING: {task.description['name']} results already exists. Skipping.")
logger.warning(
f"WARNING: {task.description['name']} results already exists. Skipping."
)
del self.tasks[0]
continue

try:
task_eval_splits = eval_splits if eval_splits is not None else task.description.get("eval_splits", [])
task_eval_splits = (
eval_splits
if eval_splits is not None
else task.description.get("eval_splits", [])
)

# load data
logger.info(f"Loading dataset for {task.description['name']}")
task.load_data(eval_splits=task_eval_splits, **kwargs)

# run evaluation
task_results = {
"mteb_version": __version__,
"mteb_version": version("mteb"),
"dataset_revision": task.description.get("revision", None),
"mteb_dataset_name": task.description["name"],
}
for split in task_eval_splits:
tick = time()
results = task.evaluate(model, split, output_folder=output_folder, **kwargs)
results = task.evaluate(
model, split, output_folder=output_folder, **kwargs
)
tock = time()
logger.info(f"Evaluation for {task.description['name']} on {split} took {tock - tick:.2f} seconds")
logger.info(
f"Evaluation for {task.description['name']} on {split} took {tock - tick:.2f} seconds"
)
results["evaluation_time"] = round(tock - tick, 2)
task_results[split] = results
if verbosity >= 1:
@@ -295,7 +327,9 @@ def run(
logger.error(f"Error while evaluating {task.description['name']}: {e}")
if raise_error:
raise e
logger.error(f"Please check all the error logs at: {self.err_logs_path}")
logger.error(
f"Please check all the error logs at: {self.err_logs_path}"
)
with open(self.err_logs_path, "a") as f_out:
f_out.write(f"{datetime.now()} >>> {task.description['name']}\n")
f_out.write(traceback.format_exc())
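
The reformatted `select_tasks` and `run` logic above can be summarized with a short usage sketch; the task names and model are illustrative, and it assumes the constructor exposes `task_types`/`task_langs` arguments matching the `self._task_types`/`self._task_langs` attributes filtered on above:

```python
# Hedged sketch of how the selection filters and run() options interact.
from sentence_transformers import SentenceTransformer

from mteb import MTEB

model = SentenceTransformer("average_word_embeddings_komninos")  # placeholder model

# Explicit task list: unknown names trigger the "Unknown tasks" warning above.
evaluation = MTEB(tasks=["Banking77Classification"])

# Filter-based selection: task type and evaluation language must both match.
evaluation = MTEB(task_types=["Clustering"], task_langs=["en"])  # assumed kwargs

# run() skips a task whose result file already exists unless overwrite_results=True,
# and appends failures to err_logs_path (error_logs.txt by default).
evaluation.run(model, output_folder="results", eval_splits=["test"], overwrite_results=False)
```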
121 changes: 118 additions & 3 deletions pyproject.toml
@@ -1,10 +1,125 @@
[build-system]
requires = ["pbr>=5.7.0", "setuptools>=36.6.0"]
build-backend = "pbr.build"
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "mteb"
version = "1.2.1.dev0"
description = "Massive Text Embedding Benchmark"
readme = "README.md"
authors = [
{ name = "MTEB Contributors", email = "niklas@huggingface.co" },
{ email = "nouamane@huggingface.co" },
{ email = "info@nils-reimers.de" }
]
license = { file = "LICENSE" }
keywords = ["deep learning", "text embeddings", "benchmark"]
classifiers = [
"Development Status :: 4 - Beta",
"Environment :: Console",
"Intended Audience :: Developers",
"Intended Audience :: Information Technology",
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python"
]
requires-python = ">=3.8"
dependencies = [
"datasets>=2.2.0",
"jsonlines",
"numpy",
"requests>=2.26.0",
"scikit_learn>=1.0.2",
"scipy",
"sentence_transformers>=2.2.0",
"torch",
"tqdm",
"rich",
"pytrec_eval"
]


[project.urls]
homepage = "https://github.com/embeddings-benchmark/mteb"
"Huggingface Organization" = "https://huggingface.co/mteb"
"Source Code" = "https://github.com/embeddings-benchmark/mteb"

[project.scripts]
mteb = "mteb.cmd:main"

[project.optional-dependencies]
dev = [
"flake8",
"Flake8-pyproject>=1.2.3",
"isort",
"black==24.2.0",
"pytest",
"pytest-xdist"
]


[tool.setuptools.packages.find]
exclude = ["tests", "results"]

[tool.black]
line-length = 119
target-version = ['py35']

[tool.isort]
profile = "black"
profile = "black"
default_section = "FIRSTPARTY"
ensure_newline_before_comments = true
force_grid_wrap = 0
include_trailing_comma = true
known_first_party = "transformers"
known_third_party = [
"absl",
"conllu",
"datasets",
"elasticsearch",
"fairseq",
"faiss-cpu",
"fastprogress",
"fire",
"fugashi",
"git",
"h5py",
"matplotlib",
"nltk",
"numpy",
"packaging",
"pandas",
"PIL",
"psutil",
"pytest",
"pytorch_lightning",
"rouge_score",
"sacrebleu",
"seqeval",
"sklearn",
"streamlit",
"tensorboardX",
"tensorflow",
"tensorflow_datasets",
"timeout_decorator",
"torch",
"torchaudio",
"torchtext",
"torchvision",
"torch_xla",
"tqdm",
]
line_length = 119
lines_after_imports = 2
multi_line_output = 3
use_parentheses = true

[tool.flake8]
ignore = [
"E203",
"E501",
"E741",
"W503",
"W605",
]
max-line-length = 119
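
As a sanity check of the new packaging metadata, both the version and the `mteb` console script declared above are discoverable at runtime after an editable install. A small sketch follows; the lookup API differs slightly before Python 3.10, as handled below:

```python
# Sketch: inspect the metadata produced by the [project] table above.
from importlib.metadata import entry_points, version

print(version("mteb"))  # "1.2.1.dev0", single-sourced from pyproject.toml

eps = entry_points()
# Python 3.10+ exposes .select(); 3.8/3.9 return a dict keyed by group.
console = eps.select(group="console_scripts") if hasattr(eps, "select") else eps.get("console_scripts", [])
print([ep.value for ep in console if ep.name == "mteb"])  # ['mteb.cmd:main']
```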
