diff --git a/src/lighteval/__main__.py b/src/lighteval/__main__.py
index 054a06a6..b0164b2d 100644
--- a/src/lighteval/__main__.py
+++ b/src/lighteval/__main__.py
@@ -22,19 +22,20 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+
 import argparse
 import os
 from dataclasses import asdict
 from pprint import pformat
 
-from lighteval.parsers import parser_accelerate, parser_nanotron, parser_utils_tasks
+from lighteval.parsers import parser_accelerate, parser_baseline, parser_nanotron, parser_utils_tasks
 from lighteval.tasks.registry import Registry, taskinfo_selector
 
 
 CACHE_DIR = os.getenv("HF_HOME")
 
 
-def cli_evaluate():
+def cli_evaluate():  # noqa: C901
     parser = argparse.ArgumentParser(description="CLI tool for lighteval, a lightweight framework for LLM evaluation")
     subparsers = parser.add_subparsers(help="help for subcommand", dest="subcommand")
 
@@ -46,9 +47,12 @@ def cli_evaluate():
     parser_b = subparsers.add_parser("nanotron", help="use nanotron as backend for evaluation.")
     parser_nanotron(parser_b)
 
+    parser_c = subparsers.add_parser("baseline", help="compute baseline for a task")
+    parser_baseline(parser_c)
+
     # Subparser for task utils functions
-    parser_c = subparsers.add_parser("tasks", help="display information about available tasks and samples.")
-    parser_utils_tasks(parser_c)
+    parser_d = subparsers.add_parser("tasks", help="display information about available tasks and samples.")
+    parser_utils_tasks(parser_d)
 
     args = parser.parse_args()
 
@@ -62,9 +66,15 @@ def cli_evaluate():
 
         main_nanotron(args.checkpoint_config_path, args.lighteval_config_path, args.cache_dir)
 
+    elif args.subcommand == "baseline":
+        from lighteval.main_baseline import main as main_baseline
+
+        main_baseline(args)
+
     elif args.subcommand == "tasks":
+        registry = Registry(cache_dir=args.cache_dir, custom_tasks=args.custom_tasks)
         if args.list:
-            Registry(cache_dir="").print_all_tasks()
+            registry.print_all_tasks()
 
         if args.inspect:
             print(f"Loading the tasks dataset to cache folder: {args.cache_dir}")
@@ -72,8 +82,8 @@ def cli_evaluate():
                 "All examples will be displayed without few shot, as few shot sample construction requires loading a model and using its tokenizer. "
             )
             # Loading task
-            task_names_list, _ = taskinfo_selector(args.inspect)
-            task_dict = Registry(cache_dir=args.cache_dir).get_task_dict(task_names_list)
+            task_names_list, _ = taskinfo_selector(args.inspect, task_registry=registry)
+            task_dict = registry.get_task_dict(task_names_list)
             for name, task in task_dict.items():
                 print("-" * 10, name, "-" * 10)
                 if args.show_config:
@@ -84,7 +94,6 @@ def cli_evaluate():
                         print("-" * 10, "SAMPLES")
                     print(f"-- sample {ix} --")
                     print(pformat(asdict(sample), indent=1))
-
     else:
         print("You did not provide any argument. Exiting")
 
diff --git a/src/lighteval/main_baseline.py b/src/lighteval/main_baseline.py
new file mode 100644
index 00000000..f824d94f
--- /dev/null
+++ b/src/lighteval/main_baseline.py
@@ -0,0 +1,88 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from lighteval.logging.evaluation_tracker import EvaluationTracker
+from lighteval.metrics.utils.metric_utils import MetricCategory
+from lighteval.models.abstract_model import ModelInfo
+from lighteval.tasks.lighteval_task import LightevalTask
+from lighteval.tasks.registry import Registry, taskinfo_selector
+from lighteval.utils.utils import as_list
+
+
+def main(args):
+    """
+    Compute baselines for given tasks.
+
+    It has been tested with generative and accuracy tasks, but may not work correctly for other task types.
+
+    The baseline is computed as follows:
+    - For multiple-choice tasks: It assumes random guessing, so the score is n_correct/number_of_choices.
+    - For other metrics: It assigns a score of 0, which may not be appropriate for all task types.
+
+    Note:
+        This baseline computation may not be suitable for all task types and should be used with caution.
+    """
+    task_registry = Registry(cache_dir=args.cache_dir, custom_tasks=args.custom_tasks)
+    task_names_list, fewshots_dict = taskinfo_selector(args.tasks, task_registry)
+    task_dict = task_registry.get_task_dict(task_names_list)
+
+    evaluation_tracker = EvaluationTracker(
+        output_dir=args.output_dir,
+        save_details=False,
+        push_to_hub=False,
+        push_to_tensorboard=False,
+        public=False,
+        hub_results_org=None,
+    )
+    evaluation_tracker.general_config_logger.log_model_info(
+        ModelInfo(
+            model_name="lighteval/baseline",
+            model_sha=None,
+            model_dtype=None,
+            model_size=None,
+        )
+    )
+    evaluation_tracker.task_config_logger.log(task_dict)
+
+    LightevalTask.load_datasets(list(task_dict.values()), args.dataset_loading_processes)
+
+    for task_name, task in task_dict.items():
+        task_docs = list(task.eval_docs())
+        n_samples = min(args.max_samples, len(task_docs)) if args.max_samples else len(task_docs)
+
+        p_correct_score = [
+            len(as_list(task_doc.gold_index)) / len(task_doc.choices) for task_doc in task_docs[:n_samples]
+        ]
+
+        metric_results = {
+            metric.metric_name: p_correct_score
+            if metric.category
+            in [MetricCategory.MULTICHOICE, MetricCategory.MULTICHOICE_PMI, MetricCategory.MULTICHOICE_ONE_TOKEN]
+            else 0
+            for metric in task.metrics
+        }
+
+        for fewshots, _ in fewshots_dict[task_name]:
+            evaluation_tracker.metrics_logger.log(f"{task_name}|{fewshots}", metric_results)
+
+    evaluation_tracker.metrics_logger.aggregate(task_dict=task_dict, bootstrap_iters=1000)
+    evaluation_tracker.save()
diff --git a/src/lighteval/parsers.py b/src/lighteval/parsers.py
index 3a4b85af..3988fdbb 100644
--- a/src/lighteval/parsers.py
+++ b/src/lighteval/parsers.py
@@ -104,6 +104,44 @@ def parser_accelerate(parser=None):
     return parser
 
 
+def parser_baseline(parser=None):
+    if parser is None:
+        parser = argparse.ArgumentParser(
+            description="CLI tool for lighteval, a lightweight framework for LLM evaluation"
+        )
+
+    parser.add_argument(
+        "--custom_tasks",
+        type=str,
+        default=None,
+        help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formatting functions)",
+    )
+
+    parser.add_argument(
+        "--tasks",
+        type=str,
+        required=True,
+        help="Task to compute the baseline for",
+    )
+    parser.add_argument("--max_samples", type=int, default=None, help="Maximum number of samples to evaluate on")
+    parser.add_argument(
+        "--dataset_loading_processes", type=int, default=1, help="Number of processes to use for loading the datasets"
+    )
+
+    parser.add_argument(
+        "--cache_dir", type=str, default=CACHE_DIR, help="Cache directory used to store datasets and models"
+    )
+    # Output related
+    parser.add_argument(
+        "--output_dir",
+        required=True,
+        type=str,
+        help="Directory to save the results, fsspec compliant (e.g. s3://bucket/path)",
+    )
+
+    return parser
+
+
 def parser_nanotron(parser=None):
     if parser is None:
         parser = argparse.ArgumentParser(
@@ -142,6 +180,7 @@ def parser_utils_tasks(parser=None):
         default=None,
         help="Id of tasks or path to a text file with a list of tasks (e.g. 'original|mmlu:abstract_algebra|5') for which you want to manually inspect samples.",
     )
+    parser.add_argument("--custom_tasks", type=str, default=None, help="Path to a file with custom tasks")
     parser.add_argument("--num_samples", type=int, default=10, help="Number of samples to display")
     parser.add_argument("--show_config", default=False, action="store_true", help="Will display the full task config")
     parser.add_argument(