Restructuring the readme (#262)
* restructuring the readme

* removed double specification of versions and moved all setup to pyproject.toml

* correctly use flat-layout for the package
KennethEnevoldsen authored Mar 20, 2024
1 parent 8a758bc commit 769157b
Showing 17 changed files with 424 additions and 355 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -133,3 +133,6 @@ dmypy.json

# error logs
error_logs.txt

# tests
tests/results
193 changes: 22 additions & 171 deletions README.md

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions docs/adding_a_model.md
@@ -0,0 +1,11 @@
## Adding a Model to the MTEB Leaderboard

The MTEB Leaderboard is available [here](https://huggingface.co/spaces/mteb/leaderboard). To submit:

1. Run on MTEB: You can reference [scripts/run_mteb_english.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_english.py) for all MTEB English datasets used in the main ranking, or [scripts/run_mteb_chinese.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_chinese.py) for the Chinese ones.
Advanced scripts with different models are available in the [mteb/mtebscripts repo](https://github.com/embeddings-benchmark/mtebscripts). A minimal programmatic alternative is sketched after this list.
2. Format the JSON result files into metadata using the script at `scripts/mteb_meta.py`, for example
`python scripts/mteb_meta.py path_to_results_folder`, which creates a `mteb_metadata.md` file. If you ran CQADupstack retrieval, merge the results first with `python scripts/merge_cqadupstack.py path_to_results_folder`.
3. Copy the content of the `mteb_metadata.md` file to the top of the `README.md` file of your model on the Hub. See [here](https://huggingface.co/Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit/blob/main/README.md) for an example.
4. Hit the Refresh button at the bottom of the leaderboard and you should see your scores 🥇
5. To have the scores appear without refreshing, you can open an issue on the [Community Tab of the LB](https://huggingface.co/spaces/mteb/leaderboard/discussions) and someone will restart the space to cache your average scores. The cache is updated automatically about once a week in any case.
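
For step 1, here is a minimal sketch of running MTEB programmatically instead of the reference scripts; the model name, task names, and output folder below are illustrative placeholders, not requirements:

```python
# Minimal sketch: evaluate an embedding model on a couple of MTEB tasks and
# write one JSON result file per task, ready for scripts/mteb_meta.py.
from sentence_transformers import SentenceTransformer

from mteb import MTEB

model = SentenceTransformer("average_word_embeddings_komninos")  # placeholder model

evaluation = MTEB(tasks=["Banking77Classification", "SciFact"])  # example tasks
evaluation.run(model, output_folder="results/average_word_embeddings_komninos")
```

Point `scripts/mteb_meta.py` at the chosen `output_folder` to generate the metadata in step 2.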
29 changes: 29 additions & 0 deletions docs/contributing.md
@@ -0,0 +1,29 @@
## Contributing to MTEB
We welcome contributions to MTEB, such as new datasets! This section describes how to set up the repository for development.

### Development Installation
If you want to submit a dataset or otherwise contribute to MTEB, you can install the package in development mode:

```bash
git clone https://github.com/embeddings-benchmark/mteb
cd mteb

# create your virtual environment and activate it
make install
```

### Running Tests
To run the tests, you can use the following command:

```bash
make test
# or if you want to run on multiple cores
make test-parallel
```

### Running Linting
To run linting before opening a PR, use the following command:

```bash
make lint
```
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
136 changes: 136 additions & 0 deletions docs/tasks.md

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions mteb/__init__.py
@@ -1,7 +1,9 @@
__version__ = "1.2.1.dev0"
from importlib.metadata import version

from mteb.evaluation import *

__version__ = version("mteb") # fetch version from install metadata


MTEB_MAIN_EN = [
"AmazonCounterfactualClassification",
@@ -70,5 +72,5 @@
"TweetSentimentExtractionClassification",
"TwentyNewsgroupsClustering",
"TwitterSemEval2015",
"TwitterURLCorpus"
"TwitterURLCorpus",
]
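
With this change the package version is declared once in `pyproject.toml` and read back from the installed distribution metadata; a quick sketch of what that means in practice, assuming `mteb` has been installed (for example via `make install`):

```python
# Sketch: __version__ now mirrors the installed distribution metadata,
# so the version string lives only in pyproject.toml.
from importlib.metadata import version

import mteb

assert mteb.__version__ == version("mteb")
print(mteb.__version__)
```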
80 changes: 57 additions & 23 deletions mteb/evaluation/MTEB.py
@@ -4,19 +4,18 @@
import pathlib
import traceback
from datetime import datetime
from importlib.metadata import version
from time import time
from typing import List, Union

import datasets

from .. import __version__
from ..abstasks import *
from ..abstasks import AbsTask, LangMapping
from ..tasks import *

logger = logging.getLogger(__name__)

from typing import List, Union


class MTEB:
def __init__(
@@ -27,7 +26,7 @@ def __init__(
tasks: List[Union[str, AbsTask]] = None,
version=None,
err_logs_path="error_logs.txt",
**kwargs
**kwargs,
):
"""
Create an Evaluation pipeline. The tasks selected
@@ -119,7 +118,9 @@ def _display_tasks(self, task_list, name=None):
if name:
console.rule(f"[bold]{name}\n", style="grey15")
for task_type in self.available_task_types:
current_type_tasks = list(filter(lambda x: x.description["type"] == task_type, task_list))
current_type_tasks = list(
filter(lambda x: x.description["type"] == task_type, task_list)
)
if len(current_type_tasks) == 0:
continue
else:
@@ -138,7 +139,9 @@ def _display_tasks(self, task_list, name=None):
if task.is_crosslingual
else ""
)
console.print(f"{prefix}{name}{category}{multilingual}{crosslingual}")
console.print(
f"{prefix}{name}{category}{multilingual}{crosslingual}"
)
console.print("\n")

@classmethod
@@ -168,31 +171,46 @@ def select_tasks(self, **kwargs):

# If `task_list` is specified, select list of tasks
if self._tasks is not None:
self.tasks = list(filter(lambda x: (x.description["name"] in self._tasks), self.tasks_cls))
self.tasks = list(
filter(lambda x: (x.description["name"] in self._tasks), self.tasks_cls)
)
if len(self.tasks) != len(self._tasks):
tasks_known = set([x.description["name"] for x in self.tasks_cls])
tasks_unknown = set(x for x in self._tasks if isinstance(x, str)) - tasks_known
tasks_unknown = (
set(x for x in self._tasks if isinstance(x, str)) - tasks_known
)
if tasks_unknown:
unknown_str, known_str = ",".join(sorted(list(tasks_unknown))), ",".join(sorted(list(tasks_known)))
logger.warning(f"WARNING: Unknown tasks: {unknown_str}. Known tasks: {known_str}.")
unknown_str, known_str = (
",".join(sorted(list(tasks_unknown))),
",".join(sorted(list(tasks_known))),
)
logger.warning(
f"WARNING: Unknown tasks: {unknown_str}. Known tasks: {known_str}."
)
# add task if subclass of mteb.tasks
self.tasks.extend([x for x in self._tasks if isinstance(x, AbsTask)])
return

# Otherwise use filters to select tasks
filtered_tasks = filter(
lambda x: (self._task_types is None) or (x.description["type"] in self._task_types), self.tasks_cls
lambda x: (self._task_types is None)
or (x.description["type"] in self._task_types),
self.tasks_cls,
)
filtered_tasks = filter(
lambda x: (self._task_categories is None) or (x.description["category"] in self._task_categories),
lambda x: (self._task_categories is None)
or (x.description["category"] in self._task_categories),
filtered_tasks,
)
filtered_tasks = filter(
lambda x: (self._version is None) or (x.description["version"] >= self._version), filtered_tasks
lambda x: (self._version is None)
or (x.description["version"] >= self._version),
filtered_tasks,
)
# keep only tasks with at least one language in the filter
filtered_tasks = filter(
lambda x: (not (self._task_langs)) or (len(set(x.description["eval_langs"]) & set(self._task_langs)) > 0),
lambda x: (not (self._task_langs))
or (len(set(x.description["eval_langs"]) & set(self._task_langs)) > 0),
filtered_tasks,
)

@@ -216,7 +234,7 @@ def run(
eval_splits=None,
overwrite_results=False,
raise_error: bool = True,
**kwargs
**kwargs,
):
"""
Run the evaluation pipeline on the selected tasks.
@@ -251,34 +269,48 @@
evaluation_results = {}
while len(self.tasks) > 0:
task = self.tasks[0]
logger.info(f"\n\n********************** Evaluating {task.description['name']} **********************")
logger.info(
f"\n\n********************** Evaluating {task.description['name']} **********************"
)

# skip evaluation if results folder exists and overwrite_results is False
if output_folder is not None:
save_path = os.path.join(output_folder, f"{task.description['name']}{task.save_suffix}.json")
save_path = os.path.join(
output_folder, f"{task.description['name']}{task.save_suffix}.json"
)
if os.path.exists(save_path) and overwrite_results is False:
logger.warning(f"WARNING: {task.description['name']} results already exists. Skipping.")
logger.warning(
f"WARNING: {task.description['name']} results already exists. Skipping."
)
del self.tasks[0]
continue

try:
task_eval_splits = eval_splits if eval_splits is not None else task.description.get("eval_splits", [])
task_eval_splits = (
eval_splits
if eval_splits is not None
else task.description.get("eval_splits", [])
)

# load data
logger.info(f"Loading dataset for {task.description['name']}")
task.load_data(eval_splits=task_eval_splits, **kwargs)

# run evaluation
task_results = {
"mteb_version": __version__,
"mteb_version": version("mteb"),
"dataset_revision": task.description.get("revision", None),
"mteb_dataset_name": task.description["name"],
}
for split in task_eval_splits:
tick = time()
results = task.evaluate(model, split, output_folder=output_folder, **kwargs)
results = task.evaluate(
model, split, output_folder=output_folder, **kwargs
)
tock = time()
logger.info(f"Evaluation for {task.description['name']} on {split} took {tock - tick:.2f} seconds")
logger.info(
f"Evaluation for {task.description['name']} on {split} took {tock - tick:.2f} seconds"
)
results["evaluation_time"] = round(tock - tick, 2)
task_results[split] = results
if verbosity >= 1:
@@ -295,7 +327,9 @@ def run(
logger.error(f"Error while evaluating {task.description['name']}: {e}")
if raise_error:
raise e
logger.error(f"Please check all the error logs at: {self.err_logs_path}")
logger.error(
f"Please check all the error logs at: {self.err_logs_path}"
)
with open(self.err_logs_path, "a") as f_out:
f_out.write(f"{datetime.now()} >>> {task.description['name']}\n")
f_out.write(traceback.format_exc())
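
The reformatted `select_tasks` and `run` logic above can be summarized with a short usage sketch; the task names and model are illustrative, and it assumes the constructor exposes `task_types`/`task_langs` arguments matching the `self._task_types`/`self._task_langs` attributes filtered on above:

```python
# Hedged sketch of how the selection filters and run() options interact.
from sentence_transformers import SentenceTransformer

from mteb import MTEB

model = SentenceTransformer("average_word_embeddings_komninos")  # placeholder model

# Explicit task list: unknown names trigger the "Unknown tasks" warning above.
evaluation = MTEB(tasks=["Banking77Classification"])

# Filter-based selection: task type and evaluation language must both match.
evaluation = MTEB(task_types=["Clustering"], task_langs=["en"])  # assumed kwargs

# run() skips a task whose result file already exists unless overwrite_results=True,
# and appends failures to err_logs_path (error_logs.txt by default).
evaluation.run(model, output_folder="results", eval_splits=["test"], overwrite_results=False)
```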
121 changes: 118 additions & 3 deletions pyproject.toml
@@ -1,10 +1,125 @@
[build-system]
requires = ["pbr>=5.7.0", "setuptools>=36.6.0"]
build-backend = "pbr.build"
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "mteb"
version = "1.2.1.dev0"
description = "Massive Text Embedding Benchmark"
readme = "README.md"
authors = [
{ name = "MTEB Contributors", email = "niklas@huggingface.co" },
{ email = "nouamane@huggingface.co" },
{ email = "info@nils-reimers.de" }
]
license = { file = "LICENSE" }
keywords = ["deep learning", "text embeddings", "benchmark"]
classifiers = [
"Development Status :: 4 - Beta",
"Environment :: Console",
"Intended Audience :: Developers",
"Intended Audience :: Information Technology",
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python"
]
requires-python = ">=3.8"
dependencies = [
"datasets>=2.2.0",
"jsonlines",
"numpy",
"requests>=2.26.0",
"scikit_learn>=1.0.2",
"scipy",
"sentence_transformers>=2.2.0",
"torch",
"tqdm",
"rich",
"pytrec_eval"
]


[project.urls]
homepage = "https://github.com/embeddings-benchmark/mteb"
"Huggingface Organization" = "https://huggingface.co/mteb"
"Source Code" = "https://github.com/embeddings-benchmark/mteb"

[project.scripts]
mteb = "mteb.cmd:main"

[project.optional-dependencies]
dev = [
"flake8",
"Flake8-pyproject>=1.2.3",
"isort",
"black==24.2.0",
"pytest",
"pytest-xdist"
]


[tool.setuptools.packages.find]
exclude = ["tests", "results"]

[tool.black]
line-length = 119
target-version = ['py35']

[tool.isort]
profile = "black"
profile = "black"
default_section = "FIRSTPARTY"
ensure_newline_before_comments = true
force_grid_wrap = 0
include_trailing_comma = true
known_first_party = "transformers"
known_third_party = [
"absl",
"conllu",
"datasets",
"elasticsearch",
"fairseq",
"faiss-cpu",
"fastprogress",
"fire",
"fugashi",
"git",
"h5py",
"matplotlib",
"nltk",
"numpy",
"packaging",
"pandas",
"PIL",
"psutil",
"pytest",
"pytorch_lightning",
"rouge_score",
"sacrebleu",
"seqeval",
"sklearn",
"streamlit",
"tensorboardX",
"tensorflow",
"tensorflow_datasets",
"timeout_decorator",
"torch",
"torchaudio",
"torchtext",
"torchvision",
"torch_xla",
"tqdm",
]
line_length = 119
lines_after_imports = 2
multi_line_output = 3
use_parentheses = true

[tool.flake8]
ignore = [
"E203",
"E501",
"E741",
"W503",
"W605",
]
max-line-length = 119
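
As a sanity check of the new packaging metadata, both the version and the `mteb` console script declared above are discoverable at runtime after an editable install. A small sketch follows; the lookup API differs slightly before Python 3.10, as handled below:

```python
# Sketch: inspect the metadata produced by the [project] table above.
from importlib.metadata import entry_points, version

print(version("mteb"))  # "1.2.1.dev0", single-sourced from pyproject.toml

eps = entry_points()
# Python 3.10+ exposes .select(); 3.8/3.9 return a dict keyed by group.
console = eps.select(group="console_scripts") if hasattr(eps, "select") else eps.get("console_scripts", [])
print([ep.value for ep in console if ep.name == "mteb"])  # ['mteb.cmd:main']
```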
