Docs for EvaluationSuite #340

Merged · merged 21 commits on Dec 9, 2022 · Changes from 8 commits
2 changes: 2 additions & 0 deletions docs/source/_toctree.yml
@@ -17,6 +17,8 @@
    title: Using the evaluator
  - local: custom_evaluator
    title: Using the evaluator with custom pipelines
  - local: evaluation_suite
    title: Creating an EvaluationSuite
  title: "How-to guides"
- sections:
  - local: types_of_evaluations
1 change: 1 addition & 0 deletions docs/source/base_evaluator.mdx
@@ -11,6 +11,7 @@ Currently supported tasks are:
- `"summarization"`: will use the [`SummarizationEvaluator`].
- `"translation"`: will use the [`TranslationEvaluator`].

To run an `Evaluator` with several tasks in a single call, use the [EvaluationSuite](evaluation_suite), which runs evaluation on a collection of `SubTask`s.

Each task has its own set of requirements for the dataset format and pipeline output, so make sure to check them for your custom use case. Let's have a look at some of them and see how you can use the evaluator to evaluate single or multiple models, datasets, and metrics at the same time.

50 changes: 50 additions & 0 deletions docs/source/evaluation_suite.mdx
@@ -0,0 +1,50 @@
# Creating an EvaluationSuite
Member:

I think it would be good to give instructions on how to add a new Suite on the Hub (for people who don't know how). For metrics there is a small CLI using Cookiecutter where you specify the name of the metric and it creates the Space, clones it locally, and adds template files. Similar to the metric modules we could also add:

  • README: document the goal and limitations of the suite
  • app.py: a simple Gradio demo or so that parses the Suite.py/README and displays useful information

If you want I could help with that. I think reducing the friction to create a new suite to the minimum will maximise adoption. Happy to do it in a follow-up PR, but I think it would be great to have it with the release/announcement. What do you think?

Contributor Author:

Sure, sounds good; having a README and/or template seems broadly useful!

I've added some more instructions in evaluation_suite.mdx as well.
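To illustrate the `app.py` idea suggested above, here is a minimal sketch of what such a demo could look like in a Suite Space. The file layout is an assumption for illustration only, not part of this PR:

```python
# app.py: hypothetical minimal demo for an EvaluationSuite Space (sketch only).
import gradio as gr

# Display the suite's README (its goal and limitations) in the Space.
with open("README.md") as f:
    readme = f.read()

with gr.Blocks() as demo:
    gr.Markdown(readme)

demo.launch()
```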
The `EvaluationSuite` provides a way to compose any number of ([evaluator](base_evaluator), dataset, metric) tuples to evaluate a model on a collection of evaluation tasks.

A new `EvaluationSuite` is made up of a list of `SubTask`s and can be defined by subclassing the `EvaluationSuite` class. Files can be uploaded to a Space on the Hugging Face Hub or saved locally.

Datasets which require additional preprocessing before being used with an `Evaluator` can be processed with `datasets` transformations by setting the `preprocessor` attribute to a preprocessing function. Keyword arguments for the `Evaluator` can be passed down through `args_for_task`.

```python
import evaluate
from evaluate.evaluation_suite import SubTask

class Suite(evaluate.EvaluationSuite):

    def __init__(self, name):
        super().__init__(name)
        self.preprocessor = lambda x: {"text": x["text"].lower()}
        self.suite = [
            SubTask(
                task_type="text-classification",
                data="glue",
                subset="cola",
                split="test[:10]",
                args_for_task={
                    "metric": "accuracy",
                    "input_column": "sentence",
                    "label_column": "label",
                    "label_mapping": {
                        "LABEL_0": 0.0,
                        "LABEL_1": 1.0
                    }
                }
            )
        ]
```

Member (commenting on `task_type`):

Can you list the available task types maybe? Or redirect to their docs?

Contributor Author:

I've added a link to the supported tasks on the Evaluator docs so we don't have to maintain the list in two places!

An `EvaluationSuite` can be loaded by name from the Hugging Face Hub, or locally by providing a path, and run with `.run(model_or_pipeline)`.

```python
from evaluate import EvaluationSuite

suite = EvaluationSuite.load('mathemakitten/glue')
results = suite.run("gpt2")
```
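The same applies to a locally saved suite; a minimal sketch, assuming a hypothetical local path and model name:

```python
from evaluate import EvaluationSuite

# Hypothetical local folder containing a Suite subclass script.
suite = EvaluationSuite.load("path/to/my_suite")
results = suite.run("distilbert-base-uncased-finetuned-sst-2-english")
```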

The evaluation results are returned along with their task names and information about the time it took to obtain predictions through the pipeline.

```python
{'glue/cola': {'accuracy': 0.0, 'total_time_in_seconds': 0.9766696180449799, 'samples_per_second': 10.238876909079256, 'latency_in_seconds': 0.09766696180449798},
 'glue/sst2': {'accuracy': 0.5, 'total_time_in_seconds': 1.1422595420153812, 'samples_per_second': 8.754577775166744, 'latency_in_seconds': 0.11422595420153811},
 'glue/qqp': {'accuracy': 0.6, 'total_time_in_seconds': 1.3553926559980027, 'samples_per_second': 7.377935800188323, 'latency_in_seconds': 0.13553926559980026},
 'glue/mrpc': {'accuracy': 0.6, 'total_time_in_seconds': 2.021696529001929, 'samples_per_second': 4.946340786832532, 'latency_in_seconds': 0.2021696529001929},
 'glue/mnli': {'accuracy': 0.2, 'total_time_in_seconds': 2.0380110969999805, 'samples_per_second': 4.9067446270142145, 'latency_in_seconds': 0.20380110969999807},
 'glue/qnli': {'accuracy': 0.3, 'total_time_in_seconds': 2.082032073987648, 'samples_per_second': 4.802999975330509, 'latency_in_seconds': 0.20820320739876477},
 'glue/rte': {'accuracy': 0.7, 'total_time_in_seconds': 2.8592985830036923, 'samples_per_second': 3.4973612267855576, 'latency_in_seconds': 0.2859298583003692},
 'glue/wnli': {'accuracy': 0.5, 'total_time_in_seconds': 1.5406486629508436, 'samples_per_second': 6.490772517107661, 'latency_in_seconds': 0.15406486629508437}}
```
Member:

(nit) Would be nice to show it as a pandas DataFrame for readability.

Contributor Author:

Good call, the result is now a list of dicts so it can be easily transformed into a dataframe. I've added that to the example 😄
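Following the suggestion above, a minimal sketch of viewing the results as a DataFrame. This assumes the nested-dict output shown in the example above (the later list-of-dicts format would instead go straight into `pd.DataFrame(results)`):

```python
import pandas as pd

# Keys are task ids ("glue/cola", ...); values hold the metric and timing entries.
df = pd.DataFrame.from_dict(results, orient="index")
print(df[["accuracy", "total_time_in_seconds", "samples_per_second"]])
```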

1 change: 1 addition & 0 deletions src/evaluate/__init__.py
@@ -26,6 +26,7 @@

del version

from .evaluation_suite import EvaluationSuite
from .evaluator import (
Evaluator,
ImageClassificationEvaluator,
102 changes: 102 additions & 0 deletions src/evaluate/evaluation_suite/__init__.py
@@ -0,0 +1,102 @@
import importlib
import inspect
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Dict, Optional, Union

from datasets import Dataset, DownloadMode, load_dataset
from datasets.utils.version import Version

from ..evaluator import evaluator
from ..loading import evaluation_module_factory
from ..utils.file_utils import DownloadConfig
from ..utils.logging import get_logger


logger = get_logger(__name__)


@dataclass
class SubTask:
    task_type: str
    data: Optional[Union[str, Dataset]] = None
    subset: Optional[str] = None
    split: Optional[str] = None
    data_preprocessor: Optional[Callable] = None
    args_for_task: Optional[dict] = None


def import_main_class(module_path):
    """Import a module at module_path and return the EvaluationSuite class"""
    module = importlib.import_module(module_path)

    module_main_cls = None
    for name, obj in module.__dict__.items():
        if isinstance(obj, type) and obj.__name__ == "Suite":
            if inspect.isabstract(obj):
                continue
            module_main_cls = obj
            break

    return module_main_cls


class EvaluationSuite:
    """
    This class instantiates an evaluation suite made up of multiple tasks, where each task consists of a dataset and
    an associated metric, and runs evaluation on a model or pipeline. Evaluation suites can be a Python script found
    either locally or uploaded as a Space on the Hugging Face Hub.
    Usage:
    ```python
    from evaluate import EvaluationSuite
    suite = EvaluationSuite.load('mathemakitten/glue-evaluation-suite')
    results = suite.run("gpt2")
    ```
    """

    def __init__(self, name):
        self.name = name

    @staticmethod
    def load(
        path: str,
        download_mode: Optional[DownloadMode] = None,
        revision: Optional[Union[str, Version]] = None,
        download_config: Optional[DownloadConfig] = None,
    ):
        download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
        evaluation_module = evaluation_module_factory(
            path, module_type=None, revision=revision, download_config=download_config, download_mode=download_mode
        )
        name = Path(path).stem
        evaluation_cls = import_main_class(evaluation_module.module_path)
        evaluation_instance = evaluation_cls(name)

        return evaluation_instance

    def __repr__(self):
        tasks = [task.data + "/" + task.subset if task.subset else task.data for task in self.suite]
        return f'EvaluationSuite name: "{self.name}", ' f"Tasks: {tasks})"

    def run(
        self, model_or_pipeline: Union[str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"]  # noqa: F821
    ) -> Dict[str, float]:

        results_all = {}
        for task in self.suite:

            if task.data_preprocessor:  # task requires extra preprocessing
                ds = load_dataset(task.data, name=task.subset, split=task.split)
                task.data = ds.map(task.data_preprocessor)

            task_evaluator = evaluator(task.task_type)
            args_for_task = task.args_for_task
            args_for_task["model_or_pipeline"] = model_or_pipeline
            args_for_task["data"] = task.data
            args_for_task["subset"] = task.subset
            args_for_task["split"] = task.split
            results = task_evaluator.compute(**args_for_task)

            task_id = task.data + "/" + task.subset if task.subset else task.data
            results_all[task_id] = results
        return results_all
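For illustration, here is a minimal sketch of a `SubTask` that exercises the `data_preprocessor` field consulted by `run()` above; the dataset subset, split, and label mapping are assumptions made for the example, not values from this PR:

```python
from evaluate.evaluation_suite import SubTask

# Hypothetical subtask: lower-case the input sentences before evaluation.
subtask = SubTask(
    task_type="text-classification",
    data="glue",
    subset="sst2",
    split="validation[:16]",
    data_preprocessor=lambda example: {"sentence": example["sentence"].lower()},
    args_for_task={
        "metric": "accuracy",
        "input_column": "sentence",
        "label_column": "label",
        "label_mapping": {"LABEL_0": 0.0, "LABEL_1": 1.0},
    },
)
```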
5 changes: 1 addition & 4 deletions src/evaluate/loading.py
@@ -73,10 +73,7 @@ def init_dynamic_modules(


 def import_main_class(module_path) -> Optional[Union[Type[DatasetBuilder], Type[EvaluationModule]]]:
-    """Import a module at module_path and return its main class:
-    - a DatasetBuilder if dataset is True
-    - a Metric if dataset is False
-    """
+    """Import a module at module_path and return its main class, a Metric by default"""
     module = importlib.import_module(module_path)
     main_cls_type = EvaluationModule

6 changes: 6 additions & 0 deletions tests/test_evaluator.py
@@ -31,6 +31,7 @@
)

from evaluate import (
    EvaluationSuite,
    Evaluator,
    ImageClassificationEvaluator,
    QuestionAnsweringEvaluator,
@@ -849,3 +850,8 @@ def test_translation(self):
            data=self.data,
        )
        self.assertEqual(results["bleu"], 0)


class TestEvaluationSuite(TestCase):
    suite = EvaluationSuite.load("evaluate/evaluation-suite-ci")
    suite.run("gpt2")