Added model results to repo and updated CLI to create consistent folder structure. #254

Merged: 4 commits, Mar 19, 2024. Showing changes from 3 commits.
3 changes: 1 addition & 2 deletions .github/workflows/python-package.yml
@@ -27,8 +27,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install .
python -m pip install -r requirements.dev.txt
make install
- name: Lint with flake8
run: |
make lint
3 changes: 0 additions & 3 deletions .gitignore
@@ -2,9 +2,6 @@
.DS_Store
.idea/

# Result folder
results/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
7 changes: 5 additions & 2 deletions README.md
@@ -57,10 +57,13 @@ results = evaluation.run(model, output_folder=f"results/{model_name}")
```bash
mteb --available_tasks

mteb -m average_word_embeddings_komninos \
mteb -m sentence-transformers/all-MiniLM-L6-v2 \
-t Banking77Classification \
--output_folder results/average_word_embeddings_komninos \
--verbosity 3

# if nothing is specified, it will default to saving the results in the results/{model_name} folder

mteb -m sentence-transformers/all-MiniLM-L6-v2
```

* Using multiple GPUs in parallel can be done by just having a custom encode function that distributes the inputs to multiple GPUs like e.g. [here](https://github.com/microsoft/unilm/blob/b60c741f746877293bb85eed6806736fc8fa0ffd/e5/mteb_eval.py#L60) or [here](https://github.com/ContextualAI/gritlm/blob/09d8630f0c95ac6a456354bcb6f964d7b9b6a609/gritlm/gritlm.py#L75).
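The linked examples boil down to the same pattern: shard the input sentences across one encoder per device, encode the shards in parallel, and stitch the embeddings back into input order. A minimal sketch of that pattern (the function name, thread-based dispatch, and encoder callables are illustrative assumptions, not part of the mteb API):

```python
from concurrent.futures import ThreadPoolExecutor


def parallel_encode(sentences, encoders):
    """Split `sentences` across several encoder callables (e.g. one
    SentenceTransformer.encode bound to each GPU) and reassemble the
    results in the original input order."""
    n = len(encoders)
    # Round-robin sharding: shard i holds sentences[i], sentences[i+n], ...
    shards = [sentences[i::n] for i in range(n)]
    with ThreadPoolExecutor(max_workers=n) as pool:
        parts = list(pool.map(lambda pair: pair[0](pair[1]), zip(encoders, shards)))
    # Element j of shard i came from original index i + j * n.
    out = [None] * len(sentences)
    for i, part in enumerate(parts):
        for j, emb in enumerate(part):
            out[i + j * n] = emb
    return out


# Demo with two stub "encoders" standing in for per-GPU models:
encoders = [lambda xs: [s.upper() for s in xs] for _ in range(2)]
result = parallel_encode(["a", "b", "c"], encoders)
print(result)  # → ['A', 'B', 'C']
```

In practice each callable would be a `model.encode` with a different `device` argument; the sketch only shows the sharding and re-ordering logic.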
81 changes: 71 additions & 10 deletions mteb/cmd.py
@@ -10,7 +10,10 @@


import argparse
import datetime
import json
import logging
from pathlib import Path

from sentence_transformers import SentenceTransformer

@@ -20,6 +23,25 @@
logger = logging.getLogger(__name__)


def _name_to_path(name: str) -> str:
return name.replace("/", "__").replace(" ", "_")
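As a quick sanity check outside the diff, this helper maps a model name to the filesystem-safe folder name used under `results/` (re-stated here for illustration, under a hypothetical name):

```python
def name_to_path(name: str) -> str:
    # Same transformation as _name_to_path in the diff above:
    # "/" becomes "__" and spaces become "_".
    return name.replace("/", "__").replace(" ", "_")


folder = name_to_path("sentence-transformers/all-MiniLM-L6-v2")
print(folder)  # → sentence-transformers__all-MiniLM-L6-v2
```

The output matches the `results/sentence-transformers__all-MiniLM-L6-v2/` folder committed in this PR.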


def _save_model_metadata(
model: SentenceTransformer, model_name: str, output_folder: Path
) -> None:
save_path = output_folder / "model_meta.json"

model_meta = {
"model_name": model_name,
"time_of_run": str(datetime.datetime.today()),
"versions": model._model_config["__version__"],
}

with save_path.open("w") as f:
json.dump(model_meta, f)
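To see what this writer produces without loading a real model, here is a self-contained re-statement driven by a stub. The stub's attribute layout (`_model_config["__version__"]`) follows the diff above; the function name and everything else in the demo are illustrative:

```python
import datetime
import json
from pathlib import Path
from tempfile import TemporaryDirectory
from types import SimpleNamespace


def save_model_metadata(model, model_name: str, output_folder: Path) -> None:
    # Same logic as _save_model_metadata in the diff above.
    save_path = output_folder / "model_meta.json"
    model_meta = {
        "model_name": model_name,
        "time_of_run": str(datetime.datetime.today()),
        "versions": model._model_config["__version__"],
    }
    with save_path.open("w") as f:
        json.dump(model_meta, f)


# Stub standing in for a SentenceTransformer; only the one attribute
# the helper reads is mimicked.
stub = SimpleNamespace(_model_config={"__version__": {"sentence_transformers": "2.0.0"}})

with TemporaryDirectory() as tmp:
    save_model_metadata(stub, "sentence-transformers/all-MiniLM-L6-v2", Path(tmp))
    meta = json.loads((Path(tmp) / "model_meta.json").read_text())
print(sorted(meta))  # → ['model_name', 'time_of_run', 'versions']
```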


def main():
parser = argparse.ArgumentParser()

@@ -60,11 +82,24 @@ def main():
default=None,
help="List of languages to be evaluated. if not set, all languages will be evaluated.",
)
parser.add_argument("--device", type=int, default=None, help="Device to use for computation")
parser.add_argument("--batch_size", type=int, default=32, help="Batch size for computation")
parser.add_argument("--seed", type=int, default=42, help="Random seed for computation")
parser.add_argument("--output_folder", type=str, default="results", help="Output directory for results")
parser.add_argument("-v", "--verbosity", type=int, default=2, help="Verbosity level")
parser.add_argument(
"--device", type=int, default=None, help="Device to use for computation"
)
parser.add_argument(
"--batch_size", type=int, default=32, help="Batch size for computation"
)
parser.add_argument(
"--seed", type=int, default=42, help="Random seed for computation"
)
parser.add_argument(
"--output_folder",
type=str,
default=None,
help="Output directory for results. Will default to results/{model_name} if not set.",
)
parser.add_argument(
"-v", "--verbosity", type=int, default=2, help="Verbosity level"
)

## evaluation params
parser.add_argument(
@@ -76,10 +111,23 @@
)

## classification params
parser.add_argument("--k", type=int, default=None, help="Number of nearest neighbors to use for classification")
parser.add_argument("--n_experiments", type=int, default=None, help="Number of splits for bootstrapping")
parser.add_argument(
"--samples_per_label", type=int, default=None, help="Number of samples per label for bootstrapping"
"--k",
type=int,
default=None,
help="Number of nearest neighbors to use for classification",
)
parser.add_argument(
"--n_experiments",
type=int,
default=None,
help="Number of splits for bootstrapping",
)
parser.add_argument(
"--samples_per_label",
type=int,
default=None,
help="Number of samples per label for bootstrapping",
)

## retrieval params
@@ -120,12 +168,25 @@
if args.model is None:
raise ValueError("Please specify a model using the -m or --model argument")

if args.output_folder is None:
args.output_folder = f"results/{_name_to_path(args.model)}"

model = SentenceTransformer(args.model, device=args.device)
eval = MTEB(
task_categories=args.task_categories, task_types=args.task_types, task_langs=args.task_langs, tasks=args.tasks
task_categories=args.task_categories,
task_types=args.task_types,
task_langs=args.task_langs,
tasks=args.tasks,
)

eval.run(
model,
verbosity=args.verbosity,
output_folder=args.output_folder,
eval_splits=args.eval_splits,
)

eval.run(model, verbosity=args.verbosity, output_folder=args.output_folder, eval_splits=args.eval_splits)
_save_model_metadata(model, args.model, Path(args.output_folder))


if __name__ == "__main__":
4 changes: 0 additions & 4 deletions requirements.dev.txt

This file was deleted.

@@ -0,0 +1,13 @@
{
"dataset_revision": "0fd18e25b25c072e09e0d92ab615fda904d66300",
"mteb_dataset_name": "Banking77Classification",
"mteb_version": "1.2.1.dev0",
"test": {
"accuracy": 0.8004220779220779,
"accuracy_stderr": 0.0076199161375301375,
"evaluation_time": 26.35,
"f1": 0.7937884047247287,
"f1_stderr": 0.008720851120843838,
"main_score": 0.8004220779220779
}
}
@@ -0,0 +1,13 @@
{
"dataset_revision": "3bc5cfb4ec514264fe2db5615fac9016f7251552",
"mteb_dataset_name": "BornholmBitextMining",
"mteb_version": "1.2.1.dev0",
"test": {
"accuracy": 0.36,
"evaluation_time": 3.59,
"f1": 0.2968132161955691,
"main_score": 0.2968132161955691,
"precision": 0.27690919913419915,
"recall": 0.36
}
}
1 change: 1 addition & 0 deletions results/sentence-transformers__all-MiniLM-L6-v2/model.json
@@ -0,0 +1 @@
{"model_name": "sentence-transformers/all-MiniLM-L6-v2", "time_of_run": "2024-03-18 11:22:22.739054", "versions": {"sentence_transformers": "2.0.0", "transformers": "4.6.1", "pytorch": "1.8.1"}}
Contributor:

Maybe add a revision number to make sure it's the same model version that is used? Wdyt?

Contributor Author:

I would actually love to, but couldn't figure out how to do it. I don't believe it is recorded in the model object. You can naturally fetch the latest from the repo, but then hitting the cache causes discrepancies.

Contributor:

I was thinking about doing the same as for datasets in mteb (revision_id). Just specifying the commit id from the HF repo that stores the model.

Contributor Author:

Yes, I would love to do that. However, I am not sure the commit id is available in the model object (you would have to know it beforehand). I would love to add it, but that seems outside the scope of this PR.

@imenelydiaker does anyone from your team have the time to give it a go?

Contributor (@imenelydiaker, Mar 19, 2024):

Okay I see what you mean. I can check this and open another PR.
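One possible shape for that follow-up PR: `huggingface_hub.model_info` (a real API) returns the repo's current commit sha, which the metadata writer could record alongside the versions. This is a sketch of the idea, not code from this PR; the injectable `model_info_fn` parameter is an assumption that keeps the demo runnable offline:

```python
from types import SimpleNamespace


def get_model_revision(model_name: str, model_info_fn=None) -> str:
    """Return the current commit sha of a Hugging Face model repo.

    By default this queries the Hub via huggingface_hub.model_info;
    a stub can be injected for testing or offline use.
    """
    if model_info_fn is None:
        from huggingface_hub import model_info as model_info_fn
    return model_info_fn(model_name).sha


# Offline demo with a stub in place of the network call:
fake_info = lambda name: SimpleNamespace(sha="deadbeef")
print(get_model_revision("sentence-transformers/all-MiniLM-L6-v2", fake_info))  # → deadbeef
```

Note this records the *latest* revision at run time; as discussed above, a cached local model may differ, so passing an explicit revision through the CLI may still be preferable.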

6 changes: 1 addition & 5 deletions setup.py
@@ -80,11 +80,7 @@
],
# optional dependencies
extras_require={
"dev": [
"flake8",
"isort",
"black==24.2.0",
]
"dev": ["flake8", "isort", "black==24.2.0", "pytest", "pytest-xdist"]
},
classifiers=[
"Development Status :: 4 - Beta",