fix: Add training dataset to model meta

Adresses #1556
embeddings-benchmark · Dec 7, 2024 · 365bb0a · 365bb0a
1 parent d713525
commit 365bb0a
Showing 1 changed file with 4 additions and 3 deletions.
diff --git a/mteb/model_meta.py b/mteb/model_meta.py
@@ -72,8 +72,9 @@ class ModelMeta(BaseModel):
             in the Latin script.
         use_instructions: Whether the model uses instructions E.g. for prompt-based models. This also include models that require a specific format for
             input such as "query: {document}" or "passage: {document}".
-        zero_shot_benchmarks: A list of benchmarks on which the model has been evaluated in a zero-shot setting. By default we assume that all models
-            are evaluated non-zero-shot unless specified otherwise.
+        training_datasets: A dictionary of datasets that the model was trained on. Names should be names as their appear in `mteb` for example 
+            {"ArguAna": ["test"]} if the model is trained on the ArguAna test set. This field is used to determine if a model generalizes zero-shot to
+            a benchmark as well as mark dataset contaminations.
         adapted_from: Name of the model from which this model is adapted from. For quantizations, fine-tunes, long doc extensions, etc.
         superseded_by: Name of the model that supersedes this model, e.g. nvidia/NV-Embed-v2 supersedes v1.
     """
@@ -97,7 +98,7 @@ class ModelMeta(BaseModel):
     reference: STR_URL | None = None
     similarity_fn_name: DISTANCE_METRICS | None = None
     use_instructions: bool | None = None
-    zero_shot_benchmarks: list[str] | None = None
+    training_datasets: dict[str, list[str]] | None = None  
     adapted_from: str | None = None
     superseded_by: str | None = None