feat: 1049 add standard evaluation benchmarks to lfai evals (#1078)
* refactor eval structure
* add MMLU benchmark
* add HumanEval benchmarks
* add DeepEval compatible LLM class using LFAI
* update evals README
* upgrade DeepEval to v1.3.0
jalling97 authored Sep 25, 2024
1 parent fb2b437 commit 8e68b8b
Showing 16 changed files with 334 additions and 115 deletions.
9 changes: 9 additions & 0 deletions src/leapfrogai_evals/.env.example
@@ -26,3 +26,12 @@ QA_NUM_SAMPLES=25
QA_NUM_DOCUMENTS=5
#QA_VECTOR_STORE_ID= # set this to a vector store id if you want to use an already existing vector store with the files present
QA_CLEANUP_VECTOR_STORE=True # recommend setting this to False if a vector store id is provided

# MMLU
MMLU_NUM_TASKS=6
MMLU_NUM_SHOTS=5

# HumanEval
HUMAN_EVAL_NUM_SAMPLES_PER_TASK=3
HUMAN_EVAL_NUM_TASKS=50
HUMAN_EVAL_K=1
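
For context (not part of the diff), the new `MMLU_*` and `HUMAN_EVAL_*` values are picked up the same way as the existing QA settings; a minimal sketch, assuming python-dotenv as used elsewhere in the evals package, with fallback defaults mirroring the `.env.example` values above:

```python
import os

from dotenv import load_dotenv

load_dotenv()  # read .env so the benchmark settings below are available

mmlu_num_tasks = int(os.getenv("MMLU_NUM_TASKS", default="6"))
mmlu_num_shots = int(os.getenv("MMLU_NUM_SHOTS", default="5"))
human_eval_k = int(os.getenv("HUMAN_EVAL_K", default="1"))
```
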
18 changes: 18 additions & 0 deletions src/leapfrogai_evals/README.md
@@ -115,3 +115,21 @@ The LeapfrogAI NIAH evaluation uses the following process:
- delete the vector store

The retrieval and response rates are then averaged across each copy of the experiment to generate a final score.

## Established Benchmark Evaluations

### MMLU

The [Massive Multitask Language Understanding (MMLU)](https://arxiv.org/abs/2009.03300) benchmark is widely used to evaluate large language models on academic and professional tasks. It consists of 57 tasks across disciplines including mathematics, history, biology, and law. These tasks reflect the kinds of questions students and professionals might encounter, making the benchmark a good test of a model's knowledge and reasoning abilities across different fields.

MMLU is a multiple-choice evaluation, meaning it focuses more on the logical reasoning behind the questions and less on the specific generation process.

Within the LeapfrogAI evaluation framework, the [DeepEval implementation of MMLU](https://docs.confident-ai.com/docs/benchmarks-mmlu) is used. By default, a topically relevant subset of 6 of the 57 tasks is tested against: College Computer Science, US Foreign Policy, High School Government and Politics, Formal Logic, Computer Security, and Security Studies. This subset was chosen because the full MMLU evaluation takes a long time to process; larger evaluation sessions should use the entire task set.
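
As a rough usage sketch (assuming the `mmlu` entry point added in this change), the task subset and shot count can also be set directly in Python instead of via `.env`:

```python
from leapfrogai_evals.evals import mmlu

# run MMLU on the first 6 DeepEval MMLUTask entries with 5-shot prompting;
# omitted arguments fall back to MMLU_NUM_TASKS / MMLU_NUM_SHOTS
results = mmlu(num_tasks=6, n_shots=5)
print(results["MMLU"])  # overall score across the selected tasks
```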

### HumanEval

The [HumanEval](https://github.com/openai/human-eval) benchmark is a dataset designed to evaluate an LLM's code generation capabilities. The benchmark consists of 164 hand-crafted programming challenges comparable to simple software interview questions.

For each HumanEval task, the model generates executable Python code from a docstring prompt. The primary metric is pass@k, which measures the probability that at least one of the top k code snippets generated by the LLM passes all of the test cases.
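
For reference, pass@k is typically computed with the unbiased estimator from the HumanEval paper; a minimal sketch for illustration (the DeepEval benchmark is assumed to handle this internally):

```python
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator: n generated samples, c of which pass all tests."""
    if n - c < k:
        return 1.0  # every size-k draw contains at least one passing sample
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# e.g. 3 samples per task with 1 passing sample gives pass@1 = 1/3
print(pass_at_k(n=3, c=1, k=1))
```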

Within the LeapfrogAI evaluation framework, the [DeepEval implementation of HumanEval](https://docs.confident-ai.com/docs/benchmarks-human-eval) is used. By default, a subset of 50 of the 164 code generation tasks is tested against. This subset was chosen because the full HumanEval evaluation takes a long time to process; larger evaluation sessions should use the entire task set.
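
As a rough usage sketch (assuming the `human_eval` entry point added in this change), the same defaults can be overridden in Python:

```python
from leapfrogai_evals.evals import human_eval

# score the first 50 HumanEval tasks, 3 samples per task, reporting pass@1;
# omitted arguments fall back to the HUMAN_EVAL_* environment variables
results = human_eval(num_tasks=50, num_samples=3, k=1)
print(results["HumanEval"])  # mean pass@1 across the selected tasks
```
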
7 changes: 7 additions & 0 deletions src/leapfrogai_evals/evals/__init__.py
@@ -0,0 +1,7 @@
# __init__.py
# ruff: noqa: F401

from leapfrogai_evals.evals.human_eval import human_eval
from leapfrogai_evals.evals.mmlu import mmlu
from leapfrogai_evals.evals.niah_eval import niah_eval
from leapfrogai_evals.evals.qa_eval import qa_eval
50 changes: 50 additions & 0 deletions src/leapfrogai_evals/evals/human_eval.py
@@ -0,0 +1,50 @@
import logging
import numpy as np
import os

from deepeval.benchmarks import HumanEval
from deepeval.benchmarks.tasks import HumanEvalTask
from tqdm import tqdm
from typing import Optional

from leapfrogai_evals.models import LFAI_Model


def human_eval(
num_samples: Optional[int] = None,
k: Optional[int] = None,
num_tasks: Optional[int] = None,
) -> dict:
"""Runs the HumanEval benchmark on a subset of tasks"""
eval_results = dict()
task_scores = dict()
num_tasks = num_tasks or int(
os.getenv("HUMAN_EVAL_NUM_TASKS", default=len(list(HumanEvalTask)))
)
logging.info(f"Running the HumanEval benchmark on {num_tasks} tasks")
failed_tasks = 0
for task in tqdm(list(HumanEvalTask)[:num_tasks]):
task_benchmark = HumanEval(
n=num_samples or int(os.getenv("HUMAN_EVAL_NUM_SAMPLES_PER_TASK")),
tasks=[task],
)
try:
task_benchmark.evaluate(
model=LFAI_Model(), k=k or int(os.getenv("HUMAN_EVAL_K"))
)
task_scores[task.name] = task_benchmark.overall_score
except Exception as exc:
logging.info(
f"HumanEval task {task.name} failed with error {exc}", exc_info=exc
)
task_scores[task.name] = 0.0
failed_tasks += 1

human_eval_avg_score = np.mean(list(task_scores.values()))
logging.info(f"HumanEval overall score: {human_eval_avg_score}")
logging.info(f"HumanEval failed task count: {failed_tasks}")
logging.info(f"HumanEval task scores:\n {task_scores}")

# add the evaluation score to the final results
eval_results["HumanEval"] = human_eval_avg_score
return eval_results
28 changes: 28 additions & 0 deletions src/leapfrogai_evals/evals/mmlu.py
@@ -0,0 +1,28 @@
import logging
import os

from deepeval.benchmarks import MMLU
from deepeval.benchmarks.tasks import MMLUTask
from typing import Optional

from leapfrogai_evals.models import LFAI_Model


def mmlu(num_tasks: Optional[int] = None, n_shots: Optional[int] = None) -> dict:
"""Runs the Massive Multitask Language Understanding (MMLU) benchmark on a subset of tasks"""
eval_results = dict()
num_tasks = num_tasks or int(
os.getenv("MMLU_NUM_TASKS", default=len(list(MMLUTask)))
)
logging.info(f"Running the MMLU benchmark on {num_tasks} tasks")
tasks = list(MMLUTask)[:num_tasks]
mmlu_benchmark = MMLU(
tasks=tasks, n_shots=n_shots or int(os.getenv("MMLU_NUM_SHOTS"))
)
mmlu_benchmark.evaluate(model=LFAI_Model())
logging.info(f"MMLU overall score: {mmlu_benchmark.overall_score}")
logging.info(f"MMLU task scores:\n {mmlu_benchmark.task_scores}")

# add the evaluation score to the final results
eval_results["MMLU"] = mmlu_benchmark.overall_score
return eval_results
53 changes: 53 additions & 0 deletions src/leapfrogai_evals/evals/niah_eval.py
@@ -0,0 +1,53 @@
import logging
import numpy as np

from deepeval.test_case import LLMTestCase

from leapfrogai_evals.metrics import NIAH_Retrieval, NIAH_Response
from leapfrogai_evals.runners import NIAH_Runner


def niah_eval(*args, **kwargs) -> dict:
"""Run the Needle in a Haystack evaluation"""
logging.info("Beginning Needle in a Haystack Evaluation...")
eval_results = dict()
niah_test_cases = []

niah_runner = NIAH_Runner(*args, **kwargs)
niah_runner.run_experiment()

# build test cases out of the niah_dataset
for row in niah_runner.niah_data:
niah_test_cases.append(
LLMTestCase(
input=niah_runner.message_prompt,
actual_output=row["response"],
context=[row["context"]],
additional_metadata={
"retrieval_score": row["retrieval_score"],
"response_score": row["response_score"],
},
)
)

# run metrics
# TODO: Give ability to choose which metrics to run
retrieval_metric = NIAH_Retrieval()
response_metric = NIAH_Response()
metrics = [retrieval_metric, response_metric]

# record scores and return results
for metric in metrics:
scores = []
successes = []
for test_case in niah_test_cases:
metric.measure(test_case)
scores.append(metric.score)
successes.append(metric.is_successful())
eval_results[f"Average {metric.__name__}"] = np.mean(scores)
logging.info(f"{metric.__name__} Results:")
logging.info(f"average score: {np.mean(scores)}")
logging.info(f"scores: {scores}")
logging.info(f"successes: {successes}")

return eval_results
72 changes: 72 additions & 0 deletions src/leapfrogai_evals/evals/qa_eval.py
@@ -0,0 +1,72 @@
import logging
import numpy as np
import os

from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

from leapfrogai_evals.metrics import AnnotationRelevancyMetric, CorrectnessMetric
from leapfrogai_evals.models import * # noqa (imports all models)
from leapfrogai_evals.runners import QA_Runner


def qa_eval(*args, **kwargs) -> dict:
"""Runs the Question/Answer evaluation"""
logging.info("Beginning Question/Answer Evaluation...")
eval_results = dict()
qa_test_cases = []

qa_runner = QA_Runner(*args, **kwargs)
qa_runner.run_experiment()

# build test cases out of the qa_dataset
for row in qa_runner.qa_data:
qa_test_cases.append(
LLMTestCase(
input=row["input"],
actual_output=row["actual_output"],
context=row["context"],
expected_output=row["expected_output"],
additional_metadata={
"actual_annotations": row["actual_annotations"],
"expected_annotations": row["expected_annotations"],
},
# retrieval_context = row['retrieval_context'] # TODO: add this for more metrics
)
)

# Create judge llm
try:
judge_model = globals()[os.environ.get("LLM_JUDGE")]()
except KeyError:
judge_model = os.environ.get("LLM_JUDGE")

# run metrics
# TODO: Give ability to choose which metrics to run
correctness_metric = CorrectnessMetric(model=judge_model)
answer_relevancy_metric = AnswerRelevancyMetric(model=judge_model)
annotation_relevancy_metric = AnnotationRelevancyMetric()
metrics = [
correctness_metric,
answer_relevancy_metric,
annotation_relevancy_metric,
]

# record scores and return results
for metric in metrics:
scores = []
successes = []
reasons = []
for test_case in qa_test_cases:
metric.measure(test_case)
scores.append(metric.score)
successes.append(metric.is_successful())
reasons.append(metric.reason)
eval_results[f"Average {metric.__name__}"] = np.mean(scores)
logging.info(f"{metric.__name__} Results:")
logging.info(f"average score: {np.mean(scores)}")
logging.info(f"scores: {scores}")
logging.info(f"successes: {successes}")
logging.info(f"reasons: {reasons}")

return eval_results
Empty file.
119 changes: 6 additions & 113 deletions src/leapfrogai_evals/main.py
@@ -1,21 +1,11 @@
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric

import logging
import numpy as np
import os
from dotenv import load_dotenv
import time
from dotenv import load_dotenv
from typing import Optional, List

from leapfrogai_evals.judges.claude_sonnet import ClaudeSonnet # noqa
from leapfrogai_evals.metrics.annotation_relevancy import AnnotationRelevancyMetric
from leapfrogai_evals.metrics.correctness import CorrectnessMetric
from leapfrogai_evals.metrics.niah_metrics import NIAH_Retrieval, NIAH_Response
from leapfrogai_evals.runners.niah_runner import NIAH_Runner
from leapfrogai_evals.runners.qa_runner import QA_Runner
from leapfrogai_evals.evals import human_eval, mmlu, niah_eval, qa_eval # noqa

ALL_EVALS = ["niah_eval", "qa_eval"]
ALL_EVALS = ["niah_eval", "qa_eval", "mmlu", "human_eval"]


class RAGEvaluator:
@@ -55,8 +45,9 @@ def run_evals(self, *args, **kwargs) -> None:

start_time = time.time()
for eval_name in self.eval_list:
eval = getattr(self, eval_name)
eval(*args, **kwargs)
eval = globals()[eval_name]
eval_result = eval(*args, **kwargs)
self.eval_results.update(eval_result)
end_time = time.time()

self.eval_results["Eval Execution Runtime (seconds)"] = end_time - start_time
@@ -65,104 +56,6 @@ def run_evals(self, *args, **kwargs) -> None:
for key, value in self.eval_results.items():
logging.info(f"{key}: {value}")

def niah_eval(self, *args, **kwargs) -> None:
"""Run the Needle in a Haystack evaluation"""
logging.info("Beginning Needle in a Haystack Evaluation...")
self.niah_test_cases = []

niah_runner = NIAH_Runner(*args, **kwargs)
niah_runner.run_experiment()

# build test cases out of the niah_dataset
for row in niah_runner.niah_data:
self.niah_test_cases.append(
LLMTestCase(
input=niah_runner.message_prompt,
actual_output=row["response"],
context=[row["context"]],
additional_metadata={
"retrieval_score": row["retrieval_score"],
"response_score": row["response_score"],
},
)
)

# run metrics
# TODO: Give ability to choose which metrics to run
retrieval_metric = NIAH_Retrieval()
response_metric = NIAH_Response()
metrics = [retrieval_metric, response_metric]

for metric in metrics:
scores = []
successes = []
for test_case in self.niah_test_cases:
metric.measure(test_case)
scores.append(metric.score)
successes.append(metric.is_successful())
self.eval_results[f"Average {metric.__name__}"] = np.mean(scores)
logging.info(f"{metric.__name__} Results:")
logging.info(f"average score: {np.mean(scores)}")
logging.info(f"scores: {scores}")
logging.info(f"successes: {successes}")

def qa_eval(self, *args, **kwargs) -> None:
"""Runs the Question/Answer evaluation"""
logging.info("Beginning Question/Answer Evaluation...")
self.qa_test_cases = []

qa_runner = QA_Runner(*args, **kwargs)
qa_runner.run_experiment()

# build test cases out of the qa_dataset
for row in qa_runner.qa_data:
self.qa_test_cases.append(
LLMTestCase(
input=row["input"],
actual_output=row["actual_output"],
context=row["context"],
expected_output=row["expected_output"],
additional_metadata={
"actual_annotations": row["actual_annotations"],
"expected_annotations": row["expected_annotations"],
},
# retrieval_context = row['retrieval_context'] # TODO: add this for more metrics
)
)

# Create judge llm
try:
judge_model = globals()[os.environ.get("LLM_JUDGE")]()
except KeyError:
judge_model = os.environ.get("LLM_JUDGE")

# run metrics
# TODO: Give ability to choose which metrics to run
correctness_metric = CorrectnessMetric(model=judge_model)
answer_relevancy_metric = AnswerRelevancyMetric(model=judge_model)
annotation_relevancy_metric = AnnotationRelevancyMetric()
metrics = [
correctness_metric,
answer_relevancy_metric,
annotation_relevancy_metric,
]

for metric in metrics:
scores = []
successes = []
reasons = []
for test_case in self.qa_test_cases:
metric.measure(test_case)
scores.append(metric.score)
successes.append(metric.is_successful())
reasons.append(metric.reason)
self.eval_results[f"Average {metric.__name__}"] = np.mean(scores)
logging.info(f"{metric.__name__} Results:")
logging.info(f"average score: {np.mean(scores)}")
logging.info(f"scores: {scores}")
logging.info(f"successes: {successes}")
logging.info(f"reasons: {reasons}")


if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
6 changes: 6 additions & 0 deletions src/leapfrogai_evals/metrics/__init__.py
@@ -0,0 +1,6 @@
# __init__.py
# ruff: noqa: F401

from leapfrogai_evals.metrics.annotation_relevancy import AnnotationRelevancyMetric
from leapfrogai_evals.metrics.correctness import CorrectnessMetric
from leapfrogai_evals.metrics.niah_metrics import NIAH_Response, NIAH_Retrieval
