From 8e68b8bfbf9566e4e9a0639187b935c9526c4339 Mon Sep 17 00:00:00 2001
From: John Alling <44934218+jalling97@users.noreply.github.com>
Date: Wed, 25 Sep 2024 08:15:33 -0400
Subject: [PATCH] feat: 1049 add standard evaluation benchmarks to lfai evals
 (#1078)

* refactor eval structure
* add MMLU benchmark
* add HumanEval benchmarks
* add DeepEval compatible LLM class using LFAI
* update evals README
* upgrade DeepEval to v1.3.0
---
 src/leapfrogai_evals/.env.example           |   9 ++
 src/leapfrogai_evals/README.md              |  18 +++
 src/leapfrogai_evals/evals/__init__.py      |   7 ++
 src/leapfrogai_evals/evals/human_eval.py    |  50 ++++++++
 src/leapfrogai_evals/evals/mmlu.py          |  28 +++++
 src/leapfrogai_evals/evals/niah_eval.py     |  53 ++++++++
 src/leapfrogai_evals/evals/qa_eval.py       |  72 +++++++++++
 src/leapfrogai_evals/judges/__init__.py     |   0
 src/leapfrogai_evals/main.py                | 119 +-----------------
 src/leapfrogai_evals/metrics/__init__.py    |   6 +
 src/leapfrogai_evals/models/__init__.py     |   5 +
 .../{judges => models}/claude_sonnet.py     |   0
 src/leapfrogai_evals/models/lfai.py         |  73 +++++++++++
 src/leapfrogai_evals/pyproject.toml         |   2 +-
 src/leapfrogai_evals/runners/__init__.py    |   5 +
 src/leapfrogai_evals/runners/niah_runner.py |   2 +-
 16 files changed, 334 insertions(+), 115 deletions(-)
 create mode 100644 src/leapfrogai_evals/evals/__init__.py
 create mode 100644 src/leapfrogai_evals/evals/human_eval.py
 create mode 100644 src/leapfrogai_evals/evals/mmlu.py
 create mode 100644 src/leapfrogai_evals/evals/niah_eval.py
 create mode 100644 src/leapfrogai_evals/evals/qa_eval.py
 delete mode 100644 src/leapfrogai_evals/judges/__init__.py
 create mode 100644 src/leapfrogai_evals/models/__init__.py
 rename src/leapfrogai_evals/{judges => models}/claude_sonnet.py (100%)
 create mode 100644 src/leapfrogai_evals/models/lfai.py

diff --git a/src/leapfrogai_evals/.env.example b/src/leapfrogai_evals/.env.example
index cfc928bc3..1235737c6 100644
--- a/src/leapfrogai_evals/.env.example
+++ b/src/leapfrogai_evals/.env.example
@@ -26,3 +26,12 @@ QA_NUM_SAMPLES=25
 QA_NUM_DOCUMENTS=5
 #QA_VECTOR_STORE_ID= # set this to a vector store id if you want to use an already existing vector store with the files present
 QA_CLEANUP_VECTOR_STORE=True # recommend setting this to False if a vector store id is provided
+
+# MMLU
+MMLU_NUM_TASKS=6
+MMLU_NUM_SHOTS=5
+
+# HumanEval
+HUMAN_EVAL_NUM_SAMPLES_PER_TASK=3
+HUMAN_EVAL_NUM_TASKS=50
+HUMAN_EVAL_K=1
diff --git a/src/leapfrogai_evals/README.md b/src/leapfrogai_evals/README.md
index fb492113c..857ad8958 100644
--- a/src/leapfrogai_evals/README.md
+++ b/src/leapfrogai_evals/README.md
@@ -115,3 +115,21 @@ The LeapfrogAI NIAH evaluation uses the following process:
 - delete the vector store
 
 The retrieval and response rate is then averaged across each copy of the experiment to generate a final score.
+
+## Established Benchmark Evaluations
+
+### MMLU
+
+The [Massive Multitask Language Understanding (MMLU)](https://arxiv.org/abs/2009.03300) benchmark is widely used to evaluate large language models on academic and professional tasks. It consists of 57 tasks across disciplines including math, history, biology, and law. These tasks reflect the kinds of questions students and professionals might encounter, making the benchmark a good test of a model's knowledge and reasoning abilities across different fields.
+
+MMLU is a multiple-choice evaluation, meaning that it focuses more on the logical reasoning behind the questions and less on the specific generation process.
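+
+The snippet below is a minimal sketch of how a multiple-choice benchmark like MMLU is driven in code. It mirrors the `mmlu.py` runner added in this PR and assumes the `LEAPFROGAI_API_URL`, `LEAPFROGAI_API_KEY`, and `MODEL_TO_EVALUATE` environment variables are set:
+
+```python
+# Sketch only: mirrors src/leapfrogai_evals/evals/mmlu.py
+from deepeval.benchmarks import MMLU
+from deepeval.benchmarks.tasks import MMLUTask
+
+from leapfrogai_evals.models import LFAI_Model  # LeapfrogAI-backed DeepEval model wrapper
+
+# evaluate a slice of the 57 MMLU tasks with 5-shot prompting
+benchmark = MMLU(tasks=list(MMLUTask)[:6], n_shots=5)
+benchmark.evaluate(model=LFAI_Model())
+print(benchmark.overall_score)
+```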
+
+Within the LeapfrogAI evaluation framework, the [DeepEval implementation of MMLU](https://docs.confident-ai.com/docs/benchmarks-mmlu) is used. By default, a subset of 6 topically relevant tasks (out of the 57 total) is tested: College Computer Science, US Foreign Policy, High School Government and Politics, Formal Logic, Computer Security, and Security Studies. This subset was chosen because the full MMLU evaluation takes a long time to run; larger evaluation sessions should use the entire task set.
+
+### HumanEval
+
+The [HumanEval](https://github.com/openai/human-eval) benchmark is a dataset designed to evaluate an LLM's code generation capabilities. It consists of 164 hand-crafted programming challenges comparable to simple software interview questions.
+
+For each HumanEval task, the model generates executable Python code from a docstring. The primary metric is pass@k, which measures the probability that at least one of the k code samples generated by the LLM passes all of the test cases.
+
+Within the LeapfrogAI evaluation framework, the [DeepEval implementation of HumanEval](https://docs.confident-ai.com/docs/benchmarks-human-eval) is used. By default, a subset of 50 code generation tasks (out of the 164 total) is tested. This subset was chosen because the full HumanEval evaluation takes a long time to run; larger evaluation sessions should use the entire task set.
diff --git a/src/leapfrogai_evals/evals/__init__.py b/src/leapfrogai_evals/evals/__init__.py
new file mode 100644
index 000000000..bed4b08ad
--- /dev/null
+++ b/src/leapfrogai_evals/evals/__init__.py
@@ -0,0 +1,7 @@
+# __init__.py
+# ruff: noqa: F401
+
+from leapfrogai_evals.evals.human_eval import human_eval
+from leapfrogai_evals.evals.mmlu import mmlu
+from leapfrogai_evals.evals.niah_eval import niah_eval
+from leapfrogai_evals.evals.qa_eval import qa_eval
diff --git a/src/leapfrogai_evals/evals/human_eval.py b/src/leapfrogai_evals/evals/human_eval.py
new file mode 100644
index 000000000..15cef0da5
--- /dev/null
+++ b/src/leapfrogai_evals/evals/human_eval.py
@@ -0,0 +1,50 @@
+import logging
+import numpy as np
+import os
+
+from deepeval.benchmarks import HumanEval
+from deepeval.benchmarks.tasks import HumanEvalTask
+from tqdm import tqdm
+from typing import Optional
+
+from leapfrogai_evals.models import LFAI_Model
+
+
+def human_eval(
+    num_samples: Optional[int] = None,
+    k: Optional[int] = None,
+    num_tasks: Optional[int] = None,
+) -> dict:
+    """Runs the HumanEval benchmark on a subset of tasks"""
+    eval_results = dict()
+    task_scores = dict()
+    num_tasks = num_tasks or int(
+        os.getenv("HUMAN_EVAL_NUM_TASKS", default=len(list(HumanEvalTask)))
+    )
+    logging.info(f"Running the HumanEval benchmark on {num_tasks} tasks")
+    failed_tasks = 0
+    for task in tqdm(list(HumanEvalTask)[:num_tasks]):
+        task_benchmark = HumanEval(
+            n=num_samples or int(os.getenv("HUMAN_EVAL_NUM_SAMPLES_PER_TASK")),
+            tasks=[task],
+        )
+        try:
+            task_benchmark.evaluate(
+                model=LFAI_Model(), k=k or int(os.getenv("HUMAN_EVAL_K"))
+            )
+            task_scores[task.name] = task_benchmark.overall_score
+        except Exception as exc:
+            logging.info(
+                f"HumanEval task {task.name} failed with error {exc}", exc_info=exc
+            )
+            task_scores[task.name] = 0.0
+            failed_tasks += 1
+
+    human_eval_avg_score = np.mean(list(task_scores.values()))
+    logging.info(f"HumanEval overall score: {human_eval_avg_score}")
+    logging.info(f"HumanEval failed task count: {failed_tasks}")
+    
logging.info(f"HumanEval task scores:\n {task_scores}") + + # add the evaluation score to the final results + eval_results["HumanEval"] = human_eval_avg_score + return eval_results diff --git a/src/leapfrogai_evals/evals/mmlu.py b/src/leapfrogai_evals/evals/mmlu.py new file mode 100644 index 000000000..027b317a7 --- /dev/null +++ b/src/leapfrogai_evals/evals/mmlu.py @@ -0,0 +1,28 @@ +import logging +import os + +from deepeval.benchmarks import MMLU +from deepeval.benchmarks.tasks import MMLUTask +from typing import Optional + +from leapfrogai_evals.models import LFAI_Model + + +def mmlu(num_tasks: Optional[int] = None, n_shots: Optional[int] = None) -> dict: + """Runs the Massive Multitask Language Understanding (MMLU) benchmark on a subset of tasks""" + eval_results = dict() + num_tasks = num_tasks or int( + os.getenv("MMLU_NUM_TASKS", default=len(list(MMLUTask))) + ) + logging.info(f"Running the MMLU benchmark on {num_tasks} tasks") + tasks = list(MMLUTask)[:num_tasks] + mmlu_benchmark = MMLU( + tasks=tasks, n_shots=n_shots or int(os.getenv("MMLU_NUM_SHOTS")) + ) + mmlu_benchmark.evaluate(model=LFAI_Model()) + logging.info(f"MMLU overall score: {mmlu_benchmark.overall_score}") + logging.info(f"MMLU task scores:\n {mmlu_benchmark.task_scores}") + + # add the evaluation score to the final results + eval_results["MMLU"] = mmlu_benchmark.overall_score + return eval_results diff --git a/src/leapfrogai_evals/evals/niah_eval.py b/src/leapfrogai_evals/evals/niah_eval.py new file mode 100644 index 000000000..9515f7030 --- /dev/null +++ b/src/leapfrogai_evals/evals/niah_eval.py @@ -0,0 +1,53 @@ +import logging +import numpy as np + +from deepeval.test_case import LLMTestCase + +from leapfrogai_evals.metrics import NIAH_Retrieval, NIAH_Response +from leapfrogai_evals.runners import NIAH_Runner + + +def niah_eval(*args, **kwargs) -> dict: + """Run the Needle in a Haystack evaluation""" + logging.info("Beginning Needle in a Haystack Evaluation...") + eval_results = dict() + niah_test_cases = [] + + niah_runner = NIAH_Runner(*args, **kwargs) + niah_runner.run_experiment() + + # build test cases out of the niah_dataset + for row in niah_runner.niah_data: + niah_test_cases.append( + LLMTestCase( + input=niah_runner.message_prompt, + actual_output=row["response"], + context=[row["context"]], + additional_metadata={ + "retrieval_score": row["retrieval_score"], + "response_score": row["response_score"], + }, + ) + ) + + # run metrics + # TODO: Give ability to choose which metrics to run + retrieval_metric = NIAH_Retrieval() + response_metric = NIAH_Response() + metrics = [retrieval_metric, response_metric] + + # record scores and return results + for metric in metrics: + scores = [] + successes = [] + for test_case in niah_test_cases: + metric.measure(test_case) + scores.append(metric.score) + successes.append(metric.is_successful()) + eval_results[f"Average {metric.__name__}"] = np.mean(scores) + logging.info(f"{metric.__name__} Results:") + logging.info(f"average score: {np.mean(scores)}") + logging.info(f"scores: {scores}") + logging.info(f"successes: {successes}") + + return eval_results diff --git a/src/leapfrogai_evals/evals/qa_eval.py b/src/leapfrogai_evals/evals/qa_eval.py new file mode 100644 index 000000000..88cb60926 --- /dev/null +++ b/src/leapfrogai_evals/evals/qa_eval.py @@ -0,0 +1,72 @@ +import logging +import numpy as np +import os + +from deepeval.metrics import AnswerRelevancyMetric +from deepeval.test_case import LLMTestCase + +from leapfrogai_evals.metrics import 
AnnotationRelevancyMetric, CorrectnessMetric +from leapfrogai_evals.models import * # noqa (imports all models) +from leapfrogai_evals.runners import QA_Runner + + +def qa_eval(*args, **kwargs) -> dict: + """Runs the Question/Answer evaluation""" + logging.info("Beginning Question/Answer Evaluation...") + eval_results = dict() + qa_test_cases = [] + + qa_runner = QA_Runner(*args, **kwargs) + qa_runner.run_experiment() + + # build test cases out of the qa_dataset + for row in qa_runner.qa_data: + qa_test_cases.append( + LLMTestCase( + input=row["input"], + actual_output=row["actual_output"], + context=row["context"], + expected_output=row["expected_output"], + additional_metadata={ + "actual_annotations": row["actual_annotations"], + "expected_annotations": row["expected_annotations"], + }, + # retrieval_context = row['retrieval_context'] # TODO: add this for more metrics + ) + ) + + # Create judge llm + try: + judge_model = globals()[os.environ.get("LLM_JUDGE")]() + except KeyError: + judge_model = os.environ.get("LLM_JUDGE") + + # run metrics + # TODO: Give ability to choose which metrics to run + correctness_metric = CorrectnessMetric(model=judge_model) + answer_relevancy_metric = AnswerRelevancyMetric(model=judge_model) + annotation_relevancy_metric = AnnotationRelevancyMetric() + metrics = [ + correctness_metric, + answer_relevancy_metric, + annotation_relevancy_metric, + ] + + # record scores and return results + for metric in metrics: + scores = [] + successes = [] + reasons = [] + for test_case in qa_test_cases: + metric.measure(test_case) + scores.append(metric.score) + successes.append(metric.is_successful()) + reasons.append(metric.reason) + eval_results[f"Average {metric.__name__}"] = np.mean(scores) + logging.info(f"{metric.__name__} Results:") + logging.info(f"average score: {np.mean(scores)}") + logging.info(f"scores: {scores}") + logging.info(f"successes: {successes}") + logging.info(f"reasons: {reasons}") + + return eval_results diff --git a/src/leapfrogai_evals/judges/__init__.py b/src/leapfrogai_evals/judges/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/leapfrogai_evals/main.py b/src/leapfrogai_evals/main.py index a32daaa66..3813474f9 100644 --- a/src/leapfrogai_evals/main.py +++ b/src/leapfrogai_evals/main.py @@ -1,21 +1,11 @@ -from deepeval.test_case import LLMTestCase -from deepeval.metrics import AnswerRelevancyMetric - import logging -import numpy as np -import os -from dotenv import load_dotenv import time +from dotenv import load_dotenv from typing import Optional, List -from leapfrogai_evals.judges.claude_sonnet import ClaudeSonnet # noqa -from leapfrogai_evals.metrics.annotation_relevancy import AnnotationRelevancyMetric -from leapfrogai_evals.metrics.correctness import CorrectnessMetric -from leapfrogai_evals.metrics.niah_metrics import NIAH_Retrieval, NIAH_Response -from leapfrogai_evals.runners.niah_runner import NIAH_Runner -from leapfrogai_evals.runners.qa_runner import QA_Runner +from leapfrogai_evals.evals import human_eval, mmlu, niah_eval, qa_eval # noqa -ALL_EVALS = ["niah_eval", "qa_eval"] +ALL_EVALS = ["niah_eval", "qa_eval", "mmlu", "human_eval"] class RAGEvaluator: @@ -55,8 +45,9 @@ def run_evals(self, *args, **kwargs) -> None: start_time = time.time() for eval_name in self.eval_list: - eval = getattr(self, eval_name) - eval(*args, **kwargs) + eval = globals()[eval_name] + eval_result = eval(*args, **kwargs) + self.eval_results.update(eval_result) end_time = time.time() self.eval_results["Eval Execution Runtime 
(seconds)"] = end_time - start_time @@ -65,104 +56,6 @@ def run_evals(self, *args, **kwargs) -> None: for key, value in self.eval_results.items(): logging.info(f"{key}: {value}") - def niah_eval(self, *args, **kwargs) -> None: - """Run the Needle in a Haystack evaluation""" - logging.info("Beginning Needle in a Haystack Evaluation...") - self.niah_test_cases = [] - - niah_runner = NIAH_Runner(*args, **kwargs) - niah_runner.run_experiment() - - # build test cases out of the niah_dataset - for row in niah_runner.niah_data: - self.niah_test_cases.append( - LLMTestCase( - input=niah_runner.message_prompt, - actual_output=row["response"], - context=[row["context"]], - additional_metadata={ - "retrieval_score": row["retrieval_score"], - "response_score": row["response_score"], - }, - ) - ) - - # run metrics - # TODO: Give ability to choose which metrics to run - retrieval_metric = NIAH_Retrieval() - response_metric = NIAH_Response() - metrics = [retrieval_metric, response_metric] - - for metric in metrics: - scores = [] - successes = [] - for test_case in self.niah_test_cases: - metric.measure(test_case) - scores.append(metric.score) - successes.append(metric.is_successful()) - self.eval_results[f"Average {metric.__name__}"] = np.mean(scores) - logging.info(f"{metric.__name__} Results:") - logging.info(f"average score: {np.mean(scores)}") - logging.info(f"scores: {scores}") - logging.info(f"successes: {successes}") - - def qa_eval(self, *args, **kwargs) -> None: - """Runs the Question/Answer evaluation""" - logging.info("Beginning Question/Answer Evaluation...") - self.qa_test_cases = [] - - qa_runner = QA_Runner(*args, **kwargs) - qa_runner.run_experiment() - - # build test cases out of the qa_dataset - for row in qa_runner.qa_data: - self.qa_test_cases.append( - LLMTestCase( - input=row["input"], - actual_output=row["actual_output"], - context=row["context"], - expected_output=row["expected_output"], - additional_metadata={ - "actual_annotations": row["actual_annotations"], - "expected_annotations": row["expected_annotations"], - }, - # retrieval_context = row['retrieval_context'] # TODO: add this for more metrics - ) - ) - - # Create judge llm - try: - judge_model = globals()[os.environ.get("LLM_JUDGE")]() - except KeyError: - judge_model = os.environ.get("LLM_JUDGE") - - # run metrics - # TODO: Give ability to choose which metrics to run - correctness_metric = CorrectnessMetric(model=judge_model) - answer_relevancy_metric = AnswerRelevancyMetric(model=judge_model) - annotation_relevancy_metric = AnnotationRelevancyMetric() - metrics = [ - correctness_metric, - answer_relevancy_metric, - annotation_relevancy_metric, - ] - - for metric in metrics: - scores = [] - successes = [] - reasons = [] - for test_case in self.qa_test_cases: - metric.measure(test_case) - scores.append(metric.score) - successes.append(metric.is_successful()) - reasons.append(metric.reason) - self.eval_results[f"Average {metric.__name__}"] = np.mean(scores) - logging.info(f"{metric.__name__} Results:") - logging.info(f"average score: {np.mean(scores)}") - logging.info(f"scores: {scores}") - logging.info(f"successes: {successes}") - logging.info(f"reasons: {reasons}") - if __name__ == "__main__": logging.basicConfig(level=logging.INFO) diff --git a/src/leapfrogai_evals/metrics/__init__.py b/src/leapfrogai_evals/metrics/__init__.py index e69de29bb..428d526d5 100644 --- a/src/leapfrogai_evals/metrics/__init__.py +++ b/src/leapfrogai_evals/metrics/__init__.py @@ -0,0 +1,6 @@ +# __init__.py +# ruff: noqa: F401 + +from 
leapfrogai_evals.metrics.annotation_relevancy import AnnotationRelevancyMetric
+from leapfrogai_evals.metrics.correctness import CorrectnessMetric
+from leapfrogai_evals.metrics.niah_metrics import NIAH_Response, NIAH_Retrieval
diff --git a/src/leapfrogai_evals/models/__init__.py b/src/leapfrogai_evals/models/__init__.py
new file mode 100644
index 000000000..49680c72f
--- /dev/null
+++ b/src/leapfrogai_evals/models/__init__.py
@@ -0,0 +1,5 @@
+# __init__.py
+# ruff: noqa: F401
+
+from leapfrogai_evals.models.claude_sonnet import ClaudeSonnet
+from leapfrogai_evals.models.lfai import LFAI_Model
diff --git a/src/leapfrogai_evals/judges/claude_sonnet.py b/src/leapfrogai_evals/models/claude_sonnet.py
similarity index 100%
rename from src/leapfrogai_evals/judges/claude_sonnet.py
rename to src/leapfrogai_evals/models/claude_sonnet.py
diff --git a/src/leapfrogai_evals/models/lfai.py b/src/leapfrogai_evals/models/lfai.py
new file mode 100644
index 000000000..fca7c8de4
--- /dev/null
+++ b/src/leapfrogai_evals/models/lfai.py
@@ -0,0 +1,73 @@
+import asyncio
+import os
+from functools import partial
+from typing import List, Optional
+
+import openai
+from deepeval.models.base_model import DeepEvalBaseLLM
+
+
+class LFAI_Model(DeepEvalBaseLLM):
+    """
+    A DeepEval LLM class that uses the LeapfrogAI API to run certain benchmarks.
+
+    Do not use this class as an LLM judge: it exists to run generation-focused
+    benchmarks, not to grade other models. Use Claude Sonnet or another larger
+    model for LLM-as-judge evaluations.
+    """
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        base_url: Optional[str] = None,
+        model: Optional[str] = None,
+    ):
+        self.model = model or os.getenv("MODEL_TO_EVALUATE")
+        self.api_key = api_key or os.getenv("LEAPFROGAI_API_KEY")
+        self.base_url = base_url or os.getenv("LEAPFROGAI_API_URL")
+        self.client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
+
+    def load_model(self):
+        """Returns the currently selected model"""
+        return self.model
+
+    def generate(
+        self,
+        prompt: str,
+        max_tokens: int = 1024,
+        temperature: float = 0.75,
+        top_p: float = 1.0,
+        stop: str = "",
+    ) -> str:
+        """Generates a response from the LeapfrogAI API using the OpenAI SDK"""
+        response = self.client.chat.completions.create(
+            temperature=temperature,
+            model=self.model,
+            max_tokens=max_tokens,
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt,
+                }
+            ],
+            top_p=top_p,
+            stop=stop,
+        )
+        response_text = response.choices[0].message.content
+        # only strip the stop sequence if one was provided; stripping an empty
+        # stop string would slice the entire response away
+        if stop and response_text.endswith(stop):
+            response_text = response_text[: -len(stop)]
+        response_text = response_text.strip()
+        return response_text
+
+    def generate_samples(self, n: int, *args, **kwargs) -> List[str]:
+        """Generates a list of n responses using the generate() function"""
+        samples = [self.generate(*args, **kwargs) for _ in range(n)]
+        return samples
+
+    async def a_generate(self, prompt: str, *args, **kwargs) -> str:
+        """Async wrapper around generate(), run in the default executor"""
+        loop = asyncio.get_running_loop()
+        # run_in_executor only forwards positional arguments, so bind them first
+        return await loop.run_in_executor(
+            None, partial(self.generate, prompt, *args, **kwargs)
+        )
+
+    def get_model_name(self):
+        return f"LeapfrogAI {self.model}"
diff --git a/src/leapfrogai_evals/pyproject.toml b/src/leapfrogai_evals/pyproject.toml
index c80752f13..8d671cafd 100644
--- a/src/leapfrogai_evals/pyproject.toml
+++ b/src/leapfrogai_evals/pyproject.toml
@@ -7,7 +7,7 @@ version = "0.13.0"
 # x-release-please-end
 dependencies = [
-    "deepeval == 1.1.6",
+    "deepeval == 1.3.0",
     "openai == 
1.42.0", "tqdm == 4.66.5", "python-dotenv == 1.0.1", diff --git a/src/leapfrogai_evals/runners/__init__.py b/src/leapfrogai_evals/runners/__init__.py index e69de29bb..5df8b4756 100644 --- a/src/leapfrogai_evals/runners/__init__.py +++ b/src/leapfrogai_evals/runners/__init__.py @@ -0,0 +1,5 @@ +# __init__.py +# ruff: noqa: F401 + +from leapfrogai_evals.runners.niah_runner import NIAH_Runner +from leapfrogai_evals.runners.qa_runner import QA_Runner diff --git a/src/leapfrogai_evals/runners/niah_runner.py b/src/leapfrogai_evals/runners/niah_runner.py index e200a604a..7da66e7df 100644 --- a/src/leapfrogai_evals/runners/niah_runner.py +++ b/src/leapfrogai_evals/runners/niah_runner.py @@ -181,7 +181,7 @@ def run_experiment(self, cleanup: bool = True) -> None: # # response_score # # 1 if needle text was returned by the LLM's final response else 0 secret_code = row["secret_code"] - logging.debug(f"Response message: {response.content[0].text.value}") + logging.info(f"Response message: {response.content[0].text.value}") if secret_code in response.content[0].text.value: logging.debug("Setting response_score to 1.0") response_score = 1.0