Evaluation module refactor (#85)
* initial scaffold for evaluation pyproject module

* initial demo using deepeval

* fix huggingface + add sample metric results

---------

Signed-off-by: Jack Luar <jluar@precisioninno.com>
luarss authored Nov 7, 2024
1 parent dce7db2 commit bb5a300
Showing 25 changed files with 598 additions and 183 deletions.
7 changes: 7 additions & 0 deletions .gitignore
@@ -7,6 +7,7 @@ backend/src/*.json
frontend/*.json
evaluation/human_evaluation/*.json
/*.json
secret.json
venv/
.ruff_cache/
.mypy_cache/
@@ -18,3 +19,9 @@ documents.txt

# virtualenv
.venv

# evaluations
.deepeval_telemtry.txt
*.csv
*.deepeval-cache.json
temp_test_run_data.json
3 changes: 1 addition & 2 deletions Makefile
@@ -12,8 +12,7 @@ format:
 check:
 	@for folder in $(FOLDERS); do (cd $$folder && make check && cd ../); done
 	@. ./backend/.venv/bin/activate && \
-	pre-commit run --files backend/* && \
-	pre-commit run --files frontend/*
+	pre-commit run --all-files
 
 docker:
 	@docker compose up --build --wait
17 changes: 17 additions & 0 deletions evaluation/Makefile
@@ -0,0 +1,17 @@
init:
	@python3 -m venv .venv && \
	. .venv/bin/activate && \
	pip install -r requirements.txt

init-dev: init
	@. .venv/bin/activate && \
	pip install -r requirements-test.txt

format:
	@. .venv/bin/activate && \
	ruff format && \
	mypy .

check:
	@. .venv/bin/activate && \
	ruff check --fix
11 changes: 11 additions & 0 deletions evaluation/README.md
@@ -0,0 +1,11 @@
# Evaluation

This module is divided into two components:
- `auto_evaluation`: Auto-evaluation scripts used to judge performance of LLMs.
- `human_evaluation`: Scripts and libraries for human evaluation of LLMs.

## Pre-requisites

```
make init
```
4 changes: 4 additions & 0 deletions evaluation/auto_evaluation/README.md
@@ -0,0 +1,4 @@
# Auto-evaluation

This directory houses the scripts needed for auto-evaluation.

1 change: 1 addition & 0 deletions evaluation/auto_evaluation/content_metrics.json
@@ -0,0 +1 @@
{"test_cases_lookup_map": {"{\"actual_output\": \"We offer a 30-day full refund at no extra cost.\", \"context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"], \"expected_output\": \"You are eligible for a 30 day full refund at no extra cost.\", \"hyperparameters\": null, \"input\": \"What if these shoes don't fit?\", \"retrieval_context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"]}": {"cached_metrics_data": [{"metric_data": {"name": "Contextual Precision", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because all relevant information was retrieved and ranked appropriately. Great job!", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context directly answers the input question about what happens if shoes don't fit by stating 'All customers are eligible for a 30 day full refund at no extra cost.'\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Recall", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the generated output perfectly reflects the information provided in node 1 in the retrieval context, regarding the 30-day full refund policy.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"This sentence is a paraphrase of the 1st node in the retrieval context, which states \\\"All customers are eligible for a 30 day full refund at no extra cost.\\\"\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the retrieval context directly addresses the user's concern about the shoes not fitting by stating that 'All customers are eligible for a 30 day full refund at no extra cost.'", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdicts\": [\n {\n \"statement\": \"All customers are eligible for a 30 day full refund at no extra cost.\",\n \"verdict\": \"yes\",\n \"reason\": null\n }\n ]\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Faithfulness", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the actual output perfectly aligns with the retrieval context, as evidenced by the absence of any contradictions.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Truths (limit=None):\n[\n \"Customers are eligible for a full refund.\",\n \"The refund period lasts 30 days.\",\n \"There is no extra cost for the refund.\"\n] \n \nClaims:\n[\n \"We offer a 30-day full refund at no extra cost.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Hallucination", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the 
actual output is fully supported by the provided context and doesn't introduce any contradictory or unsubstantiated information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The actual output agrees with the provided context. While the context mentions \\\"all customers\\\", the actual output implies the same by stating \\\"we offer\\\" a 30-day full refund at no extra cost. This phrasing suggests a general policy applicable to all customers.\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}, "{\"actual_output\": \"The capital of France is Paris.\", \"context\": null, \"expected_output\": \"Paris.\", \"hyperparameters\": null, \"input\": \"What is the capital of France?\", \"retrieval_context\": null}": {"cached_metrics_data": [{"metric_data": {"name": "Answer Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the response is perfectly relevant, addressing the input directly and completely with no irrelevant information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Statements:\n[\n \"The capital of France is Paris.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Bias", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the output demonstrates no discernible bias.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Opinions:\n[] \n \nVerdicts:\n[]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Toxicity", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the output is entirely harmless and positive.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Opinions:\n[] \n \nVerdicts:\n[]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}}}
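The cache dumps committed here (content_metrics.json above and retrieval_metrics.json below) follow deepeval's `test_cases_lookup_map` layout: each key is a serialized test case and each value holds the cached per-metric results. As a hedged illustration, not part of this commit, a small script like the following could summarize the scores in such a file; the `summarize` helper and the file path are assumptions for the example.

```
# Minimal sketch (assumption, not part of the commit): print per-metric scores
# from a deepeval cache dump such as content_metrics.json or retrieval_metrics.json.
import json


def summarize(path: str) -> None:
    with open(path) as f:
        cache = json.load(f)
    # Keys of test_cases_lookup_map are the serialized test cases themselves.
    for case_key, entry in cache["test_cases_lookup_map"].items():
        case = json.loads(case_key)
        print(f"Input: {case['input']}")
        for item in entry["cached_metrics_data"]:
            data = item["metric_data"]
            status = "PASS" if data["success"] else "FAIL"
            print(f"  {data['name']}: {data['score']:.2f} ({status})")


if __name__ == "__main__":
    summarize("content_metrics.json")
```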
15 changes: 15 additions & 0 deletions evaluation/auto_evaluation/dataset/hf_pull.py
@@ -0,0 +1,15 @@
from huggingface_hub import snapshot_download
import os

if __name__ == "__main__":
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    snapshot_download(
        "The-OpenROAD-Project/ORAssistant_Public_Evals",
        revision="main",
        local_dir=cur_dir,
        repo_type="dataset",
        ignore_patterns=[
            ".gitattributes",
            "README.md",
        ],
    )
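As a small, hedged follow-up (not part of the commit), one could verify the pull by listing the files it fetched into the dataset directory; the CSV/JSON filter below is an assumption based on the new .gitignore entries.

```
# Minimal sketch (assumption, not part of the commit): list what hf_pull.py downloaded
# so the evaluation scripts can locate the dataset files.
import os

cur_dir = os.path.dirname(os.path.abspath(__file__))
for root, _dirs, files in os.walk(cur_dir):
    for name in files:
        # Assumes the dataset ships CSV/JSON files, as the .gitignore additions suggest.
        if name.endswith((".csv", ".json")):
            print(os.path.relpath(os.path.join(root, name), cur_dir))
```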
64 changes: 64 additions & 0 deletions evaluation/auto_evaluation/demo.py
@@ -0,0 +1,64 @@
import os

from dotenv import load_dotenv
from src.models.vertex_ai import GoogleVertexAILangChain

# from src.metrics.geval import make_correctness_metric
from src.metrics.content import (
    make_bias_metric,
    make_toxicity_metric,
    make_answer_relevancy_metric,
)
from src.metrics.retrieval import (
    make_contextual_precision_metric,
    make_contextual_recall_metric,
    make_contextual_relevancy_metric,
    make_faithfulness_metric,
    make_hallucination_metric,
)
from deepeval.test_case import LLMTestCase
from deepeval import evaluate

cur_dir = os.path.dirname(__file__)
root_dir = os.path.join(cur_dir, "../../")
load_dotenv(os.path.join(root_dir, ".env"))

if __name__ == "__main__":
    model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
    print("Retrieval metrics")
    precision, recall, relevancy, faithfulness, hallucination = (
        make_contextual_precision_metric(model),
        make_contextual_recall_metric(model),
        make_contextual_relevancy_metric(model),
        make_faithfulness_metric(model),
        make_hallucination_metric(model),
    )

    test_case = LLMTestCase(
        input="What if these shoes don't fit?",
        actual_output="We offer a 30-day full refund at no extra cost.",
        expected_output="You are eligible for a 30 day full refund at no extra cost.",
        context=[
            "All customers are eligible for a 30 day full refund at no extra cost."
        ],
        retrieval_context=[
            "All customers are eligible for a 30 day full refund at no extra cost."
        ],
    )
    evaluate([test_case], [precision, recall, relevancy, faithfulness, hallucination])
    os.rename(".deepeval-cache.json", "retrieval_metrics.json")

    print("Content metrics")
    answer_relevancy, bias, toxicity = (
        make_answer_relevancy_metric(model),
        make_bias_metric(model),
        make_toxicity_metric(model),
    )

    test_case = LLMTestCase(
        input="What is the capital of France?",
        actual_output="The capital of France is Paris.",
        expected_output="Paris.",
    )
    evaluate([test_case], [answer_relevancy, bias, toxicity])
    os.rename(".deepeval-cache.json", "content_metrics.json")
1 change: 1 addition & 0 deletions evaluation/auto_evaluation/retrieval_metrics.json
@@ -0,0 +1 @@
{"test_cases_lookup_map": {"{\"actual_output\": \"We offer a 30-day full refund at no extra cost.\", \"context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"], \"expected_output\": \"You are eligible for a 30 day full refund at no extra cost.\", \"hyperparameters\": null, \"input\": \"What if these shoes don't fit?\", \"retrieval_context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"]}": {"cached_metrics_data": [{"metric_data": {"name": "Contextual Precision", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because all relevant information was retrieved and ranked appropriately. Great job!", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context directly answers the input question about what happens if shoes don't fit by stating 'All customers are eligible for a 30 day full refund at no extra cost.'\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Recall", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the generated output perfectly reflects the information provided in node 1 in the retrieval context, regarding the 30-day full refund policy.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"This sentence is a paraphrase of the 1st node in the retrieval context, which states \\\"All customers are eligible for a 30 day full refund at no extra cost.\\\"\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the retrieval context directly addresses the user's concern about the shoes not fitting by stating that 'All customers are eligible for a 30 day full refund at no extra cost.'", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdicts\": [\n {\n \"statement\": \"All customers are eligible for a 30 day full refund at no extra cost.\",\n \"verdict\": \"yes\",\n \"reason\": null\n }\n ]\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Faithfulness", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the actual output perfectly aligns with the retrieval context, as evidenced by the absence of any contradictions.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Truths (limit=None):\n[\n \"Customers are eligible for a full refund.\",\n \"The refund period lasts 30 days.\",\n \"There is no extra cost for the refund.\"\n] \n \nClaims:\n[\n \"We offer a 30-day full refund at no extra cost.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Hallucination", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the 
actual output is fully supported by the provided context and doesn't introduce any contradictory or unsubstantiated information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The actual output agrees with the provided context. While the context mentions \\\"all customers\\\", the actual output implies the same by stating \\\"we offer\\\" a 30-day full refund at no extra cost. This phrasing suggests a general policy applicable to all customers.\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}}}
Empty file.
Empty file.
36 changes: 36 additions & 0 deletions evaluation/auto_evaluation/src/metrics/content.py
@@ -0,0 +1,36 @@
from deepeval.metrics import (
    FaithfulnessMetric,
    AnswerRelevancyMetric,
    BiasMetric,
    ToxicityMetric,
)
from deepeval.models.base_model import DeepEvalBaseLLM

ANSRELEVANCY_THRESHOLD = 0.7
HALLUCINATION_THRESHOLD = 0.7
BIAS_THRESHOLD = 0.7
TOXICITY_THRESHOLD = 0.7


def make_answer_relevancy_metric(model: DeepEvalBaseLLM) -> AnswerRelevancyMetric:
    return AnswerRelevancyMetric(
        threshold=ANSRELEVANCY_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_bias_metric(model: DeepEvalBaseLLM) -> BiasMetric:
    return BiasMetric(
        threshold=BIAS_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_toxicity_metric(model: DeepEvalBaseLLM) -> ToxicityMetric:
    return ToxicityMetric(
        threshold=TOXICITY_THRESHOLD,
        model=model,
        include_reason=True,
    )
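For completeness, a hedged usage sketch (not in the commit) of these factories with deepeval's per-metric `measure()` API; `model` stands for any `DeepEvalBaseLLM` instance, e.g. the `GoogleVertexAILangChain` wrapper used in demo.py.

```
# Hedged usage sketch (assumption, not part of the commit): score a single answer
# with the answer-relevancy metric instead of the batch evaluate() call.
from deepeval.test_case import LLMTestCase

metric = make_answer_relevancy_metric(model)  # `model` is an assumed DeepEvalBaseLLM instance
test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="The capital of France is Paris.",
)
metric.measure(test_case)
print(metric.score, metric.reason)
```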
26 changes: 26 additions & 0 deletions evaluation/auto_evaluation/src/metrics/geval.py
@@ -0,0 +1,26 @@
"""
GEval metrics wrapper for DeepEval.
GEval refers to custom LLM-based metrics with non-traditional definitions (e.g. precision, recall, relevancy, etc.)
"""

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.models.base_model import DeepEvalBaseLLM


def make_correctness_metric(model: DeepEvalBaseLLM) -> GEval:
    return GEval(
        name="Correctness",
        criteria="Determine whether the actual output is factually correct based on the expected output.",
        evaluation_steps=[
            "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
            "You should also heavily penalize omission of detail",
            "Vague language, or contradicting OPINIONS, are OK",
        ],
        evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
        model=model,
    )


if __name__ == "__main__":
    pass
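A hedged usage sketch (not part of the commit) showing how the correctness metric above might be applied to a single test case; `model` is assumed to be a `DeepEvalBaseLLM` instance such as the Vertex AI wrapper.

```
# Hedged usage sketch (assumption, not part of the commit): evaluate correctness of an
# answer against an expected output with the GEval metric defined above.
from deepeval.test_case import LLMTestCase

correctness = make_correctness_metric(model)  # `model` is an assumed DeepEvalBaseLLM instance
test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    expected_output="You are eligible for a 30 day full refund at no extra cost.",
)
correctness.measure(test_case)
print(correctness.score, correctness.reason)
```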
62 changes: 62 additions & 0 deletions evaluation/auto_evaluation/src/metrics/retrieval.py
@@ -0,0 +1,62 @@
from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric,
    FaithfulnessMetric,
    HallucinationMetric,
)
from deepeval.models.base_model import DeepEvalBaseLLM

PRECISION_THRESHOLD = 0.7
RECALL_THRESHOLD = 0.7
RELEVANCY_THRESHOLD = 0.7
FAITHFULNESS_THRESHOLD = 0.7
HALLUCINATION_THRESHOLD = 0.7


def make_contextual_precision_metric(
    model: DeepEvalBaseLLM,
) -> ContextualPrecisionMetric:
    return ContextualPrecisionMetric(
        threshold=PRECISION_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_contextual_recall_metric(model: DeepEvalBaseLLM) -> ContextualRecallMetric:
    return ContextualRecallMetric(
        threshold=RECALL_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_contextual_relevancy_metric(
    model: DeepEvalBaseLLM,
) -> ContextualRelevancyMetric:
    return ContextualRelevancyMetric(
        threshold=RELEVANCY_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_faithfulness_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
    return FaithfulnessMetric(
        threshold=FAITHFULNESS_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_hallucination_metric(model: DeepEvalBaseLLM) -> HallucinationMetric:
    return HallucinationMetric(
        threshold=HALLUCINATION_THRESHOLD,
        model=model,
        include_reason=True,
    )


if __name__ == "__main__":
    pass
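As a hedged convenience sketch (not in the commit), the five factories above could be bundled so callers such as demo.py request the whole retrieval suite in one call; `make_retrieval_metrics` is an assumed helper name.

```
# Hedged sketch (assumption, not part of the commit): build the full retrieval metric suite.
from deepeval.models.base_model import DeepEvalBaseLLM


def make_retrieval_metrics(model: DeepEvalBaseLLM) -> list:
    return [
        make_contextual_precision_metric(model),
        make_contextual_recall_metric(model),
        make_contextual_relevancy_metric(model),
        make_faithfulness_metric(model),
        make_hallucination_metric(model),
    ]
```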
Empty file.
