Adds Baseline workflow + fixes (#363)
* add baseline + fix tasks arg

* comments  :)

* different model name so that the naming is consistent with normal models

---------

Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com>
Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
3 people authored Oct 23, 2024
1 parent 7fe7b12 commit 5d7a6b9
Showing 3 changed files with 144 additions and 8 deletions.
25 changes: 17 additions & 8 deletions src/lighteval/__main__.py
@@ -22,19 +22,20 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import argparse
import os
from dataclasses import asdict
from pprint import pformat

-from lighteval.parsers import parser_accelerate, parser_nanotron, parser_utils_tasks
+from lighteval.parsers import parser_accelerate, parser_baseline, parser_nanotron, parser_utils_tasks
from lighteval.tasks.registry import Registry, taskinfo_selector


CACHE_DIR = os.getenv("HF_HOME")


-def cli_evaluate():
+def cli_evaluate():  # noqa: C901
parser = argparse.ArgumentParser(description="CLI tool for lighteval, a lightweight framework for LLM evaluation")
subparsers = parser.add_subparsers(help="help for subcommand", dest="subcommand")

@@ -46,9 +47,12 @@ def cli_evaluate():
parser_b = subparsers.add_parser("nanotron", help="use nanotron as backend for evaluation.")
parser_nanotron(parser_b)

parser_c = subparsers.add_parser("baseline", help="compute baseline for a task")
parser_baseline(parser_c)

# Subparser for task utils functions
parser_c = subparsers.add_parser("tasks", help="display information about available tasks and samples.")
parser_utils_tasks(parser_c)
parser_d = subparsers.add_parser("tasks", help="display information about available tasks and samples.")
parser_utils_tasks(parser_d)

args = parser.parse_args()

@@ -62,18 +66,24 @@

main_nanotron(args.checkpoint_config_path, args.lighteval_config_path, args.cache_dir)

elif args.subcommand == "baseline":
from lighteval.main_baseline import main as main_baseline

main_baseline(args)

elif args.subcommand == "tasks":
+registry = Registry(cache_dir=args.cache_dir, custom_tasks=args.custom_tasks)
if args.list:
-Registry(cache_dir="").print_all_tasks()
+registry.print_all_tasks()

if args.inspect:
print(f"Loading the tasks dataset to cache folder: {args.cache_dir}")
print(
"All examples will be displayed without few shot, as few shot sample construction requires loading a model and using its tokenizer. "
)
# Loading task
-task_names_list, _ = taskinfo_selector(args.inspect)
-task_dict = Registry(cache_dir=args.cache_dir).get_task_dict(task_names_list)
+task_names_list, _ = taskinfo_selector(args.inspect, task_registry=registry)
+task_dict = registry.get_task_dict(task_names_list)
for name, task in task_dict.items():
print("-" * 10, name, "-" * 10)
if args.show_config:
@@ -84,7 +94,6 @@
print("-" * 10, "SAMPLES")
print(f"-- sample {ix} --")
print(pformat(asdict(sample), indent=1))

else:
print("You did not provide any argument. Exiting")

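With this dispatch in place, baseline sits alongside accelerate, nanotron, and tasks as a CLI subcommand. A minimal invocation might look like the following, assuming the usual lighteval entry point for cli_evaluate and borrowing the illustrative task string from the tasks help text (the output path is a placeholder):

    lighteval baseline --tasks "original|mmlu:abstract_algebra|5" --output_dir ./baseline_results

Only --tasks and --output_dir are required; --max_samples, --custom_tasks, --dataset_loading_processes, and --cache_dir are optional (see parser_baseline below).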
88 changes: 88 additions & 0 deletions src/lighteval/main_baseline.py
@@ -0,0 +1,88 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.metrics.utils.metric_utils import MetricCategory
from lighteval.models.abstract_model import ModelInfo
from lighteval.tasks.lighteval_task import LightevalTask
from lighteval.tasks.registry import Registry, taskinfo_selector
from lighteval.utils.utils import as_list


def main(args):
"""
Compute baselines for given tasks.
It has been tested with generative and accuracy tasks, but may not work correctly for other task types.
The baseline is computed as follows:
- For multiple-choice tasks: It assumes random guessing, so the score is n_correct/number_of_choices.
- For other metrics: It assigns a score of 0, which may not be appropriate for all task types.
Note:
This baseline computation may not be suitable for all task types and should be used with caution.
"""
task_registry = Registry(cache_dir=args.cache_dir, custom_tasks=args.custom_tasks)
task_names_list, fewshots_dict = taskinfo_selector(args.tasks, task_registry)
task_dict = task_registry.get_task_dict(task_names_list)

evaluation_tracker = EvaluationTracker(
output_dir=args.output_dir,
save_details=False,
push_to_hub=False,
push_to_tensorboard=False,
public=False,
hub_results_org=None,
)
evaluation_tracker.general_config_logger.log_model_info(
ModelInfo(
model_name="lighteval/baseline",
model_sha=None,
model_dtype=None,
model_size=None,
)
)
evaluation_tracker.task_config_logger.log(task_dict)

LightevalTask.load_datasets(list(task_dict.values()), args.dataset_loading_processes)

for task_name, task in task_dict.items():
task_docs = list(task.eval_docs())
n_samples = min(args.max_samples, len(task_docs)) if args.max_samples else len(task_docs)

p_correct_score = [
len(as_list(task_doc.gold_index)) / len(task_doc.choices) for task_doc in task_docs[:n_samples]
]

metric_results = {
metric.metric_name: p_correct_score
if metric.category
in [MetricCategory.MULTICHOICE, MetricCategory.MULTICHOICE_PMI, MetricCategory.MULTICHOICE_ONE_TOKEN]
else 0
for metric in task.metrics
}

for fewshots, _ in fewshots_dict[task_name]:
evaluation_tracker.metrics_logger.log(f"{task_name}|{fewshots}", metric_results)

evaluation_tracker.metrics_logger.aggregate(task_dict=task_dict, bootstrap_iters=1000)
evaluation_tracker.save()
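The list comprehension above is the whole baseline: for each document, the expected score of a uniform random guesser is the number of gold answers divided by the number of choices. A minimal illustration in Python, with hypothetical document values:

    # Hypothetical document: four answer choices, one gold answer at index 2.
    gold_index = [2]
    choices = ["A", "B", "C", "D"]

    # Expected random-guess accuracy, mirroring the comprehension in main():
    p_correct = len(gold_index) / len(choices)  # 0.25

Wrapping gold_index in as_list in the real code handles tasks whose gold_index is a single int rather than a list; documents with several gold answers get a proportionally higher baseline.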
39 changes: 39 additions & 0 deletions src/lighteval/parsers.py
@@ -104,6 +104,44 @@ def parser_accelerate(parser=None):
return parser


def parser_baseline(parser=None):
if parser is None:
parser = argparse.ArgumentParser(
description="CLI tool for lighteval, a lightweight framework for LLM evaluation"
)

parser.add_argument(
"--custom_tasks",
type=str,
default=None,
help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formating functions)",
)

parser.add_argument(
"--tasks",
type=str,
required=True,
help="Task to compute the baseline for",
)
parser.add_argument("--max_samples", type=int, default=None, help="Maximum number of samples to evaluate on")
parser.add_argument(
"--dataset_loading_processes", type=int, default=1, help="Number of processes to use for loading the datasets"
)

parser.add_argument(
"--cache_dir", type=str, default=CACHE_DIR, help="Cache directory used to store datasets and models"
)
# Output related
parser.add_argument(
"--output_dir",
required=True,
type=str,
help="Directory to save the results, fsspec compliant (e.g. s3://bucket/path)",
)

return parser


def parser_nanotron(parser=None):
if parser is None:
parser = argparse.ArgumentParser(
@@ -142,6 +180,7 @@ def parser_utils_tasks(parser=None):
default=None,
help="Id of tasks or path to a text file with a list of tasks (e.g. 'original|mmlu:abstract_algebra|5') for which you want to manually inspect samples.",
)
parser.add_argument("--custom_tasks", type=str, default=None, help="Path to a file with custom tasks")
parser.add_argument("--num_samples", type=int, default=10, help="Number of samples to display")
parser.add_argument("--show_config", default=False, action="store_true", help="Will display the full task config")
parser.add_argument(
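The --custom_tasks flag added to the tasks subparser mirrors the one defined in parser_baseline above, so a custom task file can be inspected before its baseline is computed. A hypothetical session (the file path and task string are placeholders):

    lighteval tasks --inspect "original|mmlu:abstract_algebra|5" --custom_tasks my_tasks.py --num_samples 2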
