Commit:

* initial scaffold for evaluation pyproject module
* initial demo using deepeval
* fix huggingface + add sample metric results

Signed-off-by: Jack Luar <jluar@precisioninno.com>

Showing 25 changed files with 598 additions and 183 deletions.
@@ -0,0 +1,17 @@
init:
	@python3 -m venv .venv && \
	. .venv/bin/activate && \
	pip install -r requirements.txt

init-dev: init
	@. .venv/bin/activate && \
	pip install -r requirements-test.txt

format:
	@. .venv/bin/activate && \
	ruff format && \
	mypy .

check:
	@. .venv/bin/activate && \
	ruff check --fix
@@ -0,0 +1,11 @@
# Evaluation

This module is divided into two components:
- `auto_evaluation`: Auto-evaluation scripts used to judge the performance of LLMs.
- `human_evaluation`: Scripts and libraries for human evaluation of LLMs.

## Pre-requisites

```
make init
```
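
The demo script in this commit also loads a `.env` file from the module root before calling Vertex AI. The exact variables are not part of this diff; assuming standard Google Cloud authentication, the file would look something like (placeholder values, adjust to your project):

```
GOOGLE_CLOUD_PROJECT=your-gcp-project-id
GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
```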
@@ -0,0 +1,4 @@
# Auto-evaluation

This directory houses the scripts needed for auto-evaluation.
@@ -0,0 +1 @@
{"test_cases_lookup_map": {"{\"actual_output\": \"We offer a 30-day full refund at no extra cost.\", \"context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"], \"expected_output\": \"You are eligible for a 30 day full refund at no extra cost.\", \"hyperparameters\": null, \"input\": \"What if these shoes don't fit?\", \"retrieval_context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"]}": {"cached_metrics_data": [{"metric_data": {"name": "Contextual Precision", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because all relevant information was retrieved and ranked appropriately. Great job!", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context directly answers the input question about what happens if shoes don't fit by stating 'All customers are eligible for a 30 day full refund at no extra cost.'\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Recall", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the generated output perfectly reflects the information provided in node 1 in the retrieval context, regarding the 30-day full refund policy.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"This sentence is a paraphrase of the 1st node in the retrieval context, which states \\\"All customers are eligible for a 30 day full refund at no extra cost.\\\"\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the retrieval context directly addresses the user's concern about the shoes not fitting by stating that 'All customers are eligible for a 30 day full refund at no extra cost.'", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdicts\": [\n {\n \"statement\": \"All customers are eligible for a 30 day full refund at no extra cost.\",\n \"verdict\": \"yes\",\n \"reason\": null\n }\n ]\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Faithfulness", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the actual output perfectly aligns with the retrieval context, as evidenced by the absence of any contradictions.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Truths (limit=None):\n[\n \"Customers are eligible for a full refund.\",\n \"The refund period lasts 30 days.\",\n \"There is no extra cost for the refund.\"\n] \n \nClaims:\n[\n \"We offer a 30-day full refund at no extra cost.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Hallucination", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the 
actual output is fully supported by the provided context and doesn't introduce any contradictory or unsubstantiated information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The actual output agrees with the provided context. While the context mentions \\\"all customers\\\", the actual output implies the same by stating \\\"we offer\\\" a 30-day full refund at no extra cost. This phrasing suggests a general policy applicable to all customers.\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}, "{\"actual_output\": \"The capital of France is Paris.\", \"context\": null, \"expected_output\": \"Paris.\", \"hyperparameters\": null, \"input\": \"What is the capital of France?\", \"retrieval_context\": null}": {"cached_metrics_data": [{"metric_data": {"name": "Answer Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the response is perfectly relevant, addressing the input directly and completely with no irrelevant information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Statements:\n[\n \"The capital of France is Paris.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Bias", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the output demonstrates no discernible bias.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Opinions:\n[] \n \nVerdicts:\n[]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Toxicity", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the output is entirely harmless and positive.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Opinions:\n[] \n \nVerdicts:\n[]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}}} |
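
These cached results follow a stable shape: `test_cases_lookup_map` is keyed by the serialized test case, and each entry carries `cached_metrics_data` records whose `metric_data` holds the metric name, score, threshold, success flag, and reason. A minimal sketch for summarizing such a file (the path is whatever the demo script later in this diff renamed the cache to, e.g. `content_metrics.json`):

```python
import json


def summarize(path: str) -> None:
    """Print a per-metric summary of a deepeval cache file (hypothetical helper)."""
    with open(path) as f:
        cache = json.load(f)
    for case_key, entry in cache["test_cases_lookup_map"].items():
        case = json.loads(case_key)  # the lookup key is itself serialized JSON
        print(f"input: {case['input']}")
        for item in entry["cached_metrics_data"]:
            m = item["metric_data"]
            status = "PASS" if m["success"] else "FAIL"
            print(f"  [{status}] {m['name']}: {m['score']:.2f} (threshold {m['threshold']})")


if __name__ == "__main__":
    summarize("content_metrics.json")
```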
@@ -0,0 +1,15 @@
import os

from huggingface_hub import snapshot_download

if __name__ == "__main__":
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    snapshot_download(
        "The-OpenROAD-Project/ORAssistant_Public_Evals",
        revision="main",
        local_dir=cur_dir,
        repo_type="dataset",
        ignore_patterns=[
            ".gitattributes",
            "README.md",
        ],
    )
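
The snapshot lands the dataset files next to this script. Their layout is not shown in this diff, so the following is only a sketch of turning downloaded rows into deepeval test cases, with a hypothetical `qa.csv` and hypothetical `question`/`answer` column names:

```python
import csv
import os

from deepeval.test_case import LLMTestCase

cur_dir = os.path.dirname(os.path.abspath(__file__))


def generate_answer(question: str) -> str:
    """Stand-in for the system under test (hypothetical)."""
    return "TODO: query the system under test here"


# Hypothetical file and column names -- adjust to the actual dataset layout.
with open(os.path.join(cur_dir, "qa.csv"), newline="") as f:
    test_cases = [
        LLMTestCase(
            input=row["question"],
            actual_output=generate_answer(row["question"]),
            expected_output=row["answer"],
        )
        for row in csv.DictReader(f)
    ]
```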
@@ -0,0 +1,64 @@
import os

from dotenv import load_dotenv
from src.models.vertex_ai import GoogleVertexAILangChain

# from src.metrics.geval import make_correctness_metric
from src.metrics.content import (
    make_bias_metric,
    make_toxicity_metric,
    make_answer_relevancy_metric,
)
from src.metrics.retrieval import (
    make_contextual_precision_metric,
    make_contextual_recall_metric,
    make_contextual_relevancy_metric,
    make_faithfulness_metric,
    make_hallucination_metric,
)
from deepeval.test_case import LLMTestCase
from deepeval import evaluate

cur_dir = os.path.dirname(__file__)
root_dir = os.path.join(cur_dir, "../../")
load_dotenv(os.path.join(root_dir, ".env"))

if __name__ == "__main__":
    model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
    print("Retrieval metrics")
    precision, recall, relevancy, faithfulness, hallucination = (
        make_contextual_precision_metric(model),
        make_contextual_recall_metric(model),
        make_contextual_relevancy_metric(model),
        make_faithfulness_metric(model),
        make_hallucination_metric(model),
    )

    test_case = LLMTestCase(
        input="What if these shoes don't fit?",
        actual_output="We offer a 30-day full refund at no extra cost.",
        expected_output="You are eligible for a 30 day full refund at no extra cost.",
        context=[
            "All customers are eligible for a 30 day full refund at no extra cost."
        ],
        retrieval_context=[
            "All customers are eligible for a 30 day full refund at no extra cost."
        ],
    )
    evaluate([test_case], [precision, recall, relevancy, faithfulness, hallucination])
    os.rename(".deepeval-cache.json", "retrieval_metrics.json")

    print("Content metrics")
    answer_relevancy, bias, toxicity = (
        make_answer_relevancy_metric(model),
        make_bias_metric(model),
        make_toxicity_metric(model),
    )

    test_case = LLMTestCase(
        input="What is the capital of France?",
        actual_output="The capital of France is Paris.",
        expected_output="Paris.",
    )
    evaluate([test_case], [answer_relevancy, bias, toxicity])
    os.rename(".deepeval-cache.json", "content_metrics.json")
@@ -0,0 +1 @@
{"test_cases_lookup_map": {"{\"actual_output\": \"We offer a 30-day full refund at no extra cost.\", \"context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"], \"expected_output\": \"You are eligible for a 30 day full refund at no extra cost.\", \"hyperparameters\": null, \"input\": \"What if these shoes don't fit?\", \"retrieval_context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"]}": {"cached_metrics_data": [{"metric_data": {"name": "Contextual Precision", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because all relevant information was retrieved and ranked appropriately. Great job!", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context directly answers the input question about what happens if shoes don't fit by stating 'All customers are eligible for a 30 day full refund at no extra cost.'\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Recall", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the generated output perfectly reflects the information provided in node 1 in the retrieval context, regarding the 30-day full refund policy.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"This sentence is a paraphrase of the 1st node in the retrieval context, which states \\\"All customers are eligible for a 30 day full refund at no extra cost.\\\"\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the retrieval context directly addresses the user's concern about the shoes not fitting by stating that 'All customers are eligible for a 30 day full refund at no extra cost.'", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdicts\": [\n {\n \"statement\": \"All customers are eligible for a 30 day full refund at no extra cost.\",\n \"verdict\": \"yes\",\n \"reason\": null\n }\n ]\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Faithfulness", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the actual output perfectly aligns with the retrieval context, as evidenced by the absence of any contradictions.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Truths (limit=None):\n[\n \"Customers are eligible for a full refund.\",\n \"The refund period lasts 30 days.\",\n \"There is no extra cost for the refund.\"\n] \n \nClaims:\n[\n \"We offer a 30-day full refund at no extra cost.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Hallucination", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the 
actual output is fully supported by the provided context and doesn't introduce any contradictory or unsubstantiated information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The actual output agrees with the provided context. While the context mentions \\\"all customers\\\", the actual output implies the same by stating \\\"we offer\\\" a 30-day full refund at no extra cost. This phrasing suggests a general policy applicable to all customers.\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}}} |
Empty file.
Empty file.
@@ -0,0 +1,36 @@
from deepeval.metrics import (
    AnswerRelevancyMetric,
    BiasMetric,
    ToxicityMetric,
)
from deepeval.models.base_model import DeepEvalBaseLLM

ANSRELEVANCY_THRESHOLD = 0.7
BIAS_THRESHOLD = 0.7
TOXICITY_THRESHOLD = 0.7


def make_answer_relevancy_metric(model: DeepEvalBaseLLM) -> AnswerRelevancyMetric:
    return AnswerRelevancyMetric(
        threshold=ANSRELEVANCY_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_bias_metric(model: DeepEvalBaseLLM) -> BiasMetric:
    return BiasMetric(
        threshold=BIAS_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_toxicity_metric(model: DeepEvalBaseLLM) -> ToxicityMetric:
    return ToxicityMetric(
        threshold=TOXICITY_THRESHOLD,
        model=model,
        include_reason=True,
    )
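
Each factory returns a fully configured metric, so it can also be exercised standalone via `measure()` rather than through `evaluate()`. A short sketch, reusing the demo's model wrapper:

```python
from deepeval.test_case import LLMTestCase

from src.metrics.content import make_answer_relevancy_metric
from src.models.vertex_ai import GoogleVertexAILangChain

metric = make_answer_relevancy_metric(
    GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
)
tc = LLMTestCase(
    input="What is the capital of France?",
    actual_output="The capital of France is Paris.",
)
metric.measure(tc)  # populates metric.score and metric.reason
print(metric.score, metric.is_successful())
```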
@@ -0,0 +1,26 @@
""" | ||
GEval metrics wrapper for DeepEval. | ||
GEval refers to custom LLM-based metrics with non-traditional definitions (e.g. precision, recall, relevancy, etc.) | ||
""" | ||
|
||
from deepeval.metrics import GEval | ||
from deepeval.test_case import LLMTestCaseParams | ||
from deepeval.models.base_model import DeepEvalBaseLLM | ||
|
||
|
||
def make_correctness_metric(model: DeepEvalBaseLLM) -> GEval: | ||
return GEval( | ||
name="Correctness", | ||
criteria="Determine whether the actual output is factually correct based on the expected output.", | ||
evaluation_steps=[ | ||
"Check whether the facts in 'actual output' contradicts any facts in 'expected output'", | ||
"You should also heavily penalize omission of detail", | ||
"Vague language, or contradicting OPINIONS, are OK", | ||
], | ||
evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT], | ||
model=model, | ||
) | ||
|
||
|
||
if __name__ == "__main__": | ||
pass |
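
The correctness metric is imported but commented out in the demo above; if wired in, it behaves like any other deepeval metric. A hedged usage sketch, with test case values borrowed from the demo:

```python
from deepeval.test_case import LLMTestCase

from src.metrics.geval import make_correctness_metric
from src.models.vertex_ai import GoogleVertexAILangChain

correctness = make_correctness_metric(
    GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
)
tc = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    expected_output="You are eligible for a 30 day full refund at no extra cost.",
)
correctness.measure(tc)
print(correctness.score, correctness.reason)
```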
@@ -0,0 +1,62 @@
from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric,
    FaithfulnessMetric,
    HallucinationMetric,
)
from deepeval.models.base_model import DeepEvalBaseLLM

PRECISION_THRESHOLD = 0.7
RECALL_THRESHOLD = 0.7
RELEVANCY_THRESHOLD = 0.7
FAITHFULNESS_THRESHOLD = 0.7
HALLUCINATION_THRESHOLD = 0.7


def make_contextual_precision_metric(
    model: DeepEvalBaseLLM,
) -> ContextualPrecisionMetric:
    return ContextualPrecisionMetric(
        threshold=PRECISION_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_contextual_recall_metric(model: DeepEvalBaseLLM) -> ContextualRecallMetric:
    return ContextualRecallMetric(
        threshold=RECALL_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_contextual_relevancy_metric(
    model: DeepEvalBaseLLM,
) -> ContextualRelevancyMetric:
    return ContextualRelevancyMetric(
        threshold=RELEVANCY_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_faithfulness_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
    return FaithfulnessMetric(
        threshold=FAITHFULNESS_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_hallucination_metric(model: DeepEvalBaseLLM) -> HallucinationMetric:
    return HallucinationMetric(
        threshold=HALLUCINATION_THRESHOLD,
        model=model,
        include_reason=True,
    )


if __name__ == "__main__":
    pass
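
One subtlety worth noting, visible in the cached sample results above: `HallucinationMetric` scores lower-is-better (a score of 0.00 passes the 0.7 threshold), while the other retrieval metrics treat higher as better. A quick standalone check mirroring the demo's test case:

```python
from deepeval.test_case import LLMTestCase

from src.metrics.retrieval import (
    make_faithfulness_metric,
    make_hallucination_metric,
)
from src.models.vertex_ai import GoogleVertexAILangChain

model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
tc = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    expected_output="You are eligible for a 30 day full refund at no extra cost.",
    context=["All customers are eligible for a 30 day full refund at no extra cost."],
    retrieval_context=[
        "All customers are eligible for a 30 day full refund at no extra cost."
    ],
)

faithfulness = make_faithfulness_metric(model)
faithfulness.measure(tc)  # success when score >= threshold

hallucination = make_hallucination_metric(model)
hallucination.measure(tc)  # success when score <= threshold (lower is better)
```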
Empty file.