Evaluation module refactor (#85)
* initial scaffold for evaluation pyproject module

* initial demo using deepeval

* fix huggingface + add sample metric results

---------

Signed-off-by: Jack Luar <jluar@precisioninno.com>
luarss authored Nov 7, 2024
1 parent dce7db2 commit bb5a300
Showing 25 changed files with 598 additions and 183 deletions.
7 changes: 7 additions & 0 deletions .gitignore
@@ -7,6 +7,7 @@ backend/src/*.json
frontend/*.json
evaluation/human_evaluation/*.json
/*.json
secret.json
venv/
.ruff_cache/
.mypy_cache/
@@ -18,3 +19,9 @@ documents.txt

# virtualenv
.venv

# evaluations
.deepeval_telemtry.txt
*.csv
*.deepeval-cache.json
temp_test_run_data.json
3 changes: 1 addition & 2 deletions Makefile
@@ -12,8 +12,7 @@ format:
 check:
 	@for folder in $(FOLDERS); do (cd $$folder && make check && cd ../); done
 	@. ./backend/.venv/bin/activate && \
-	pre-commit run --files backend/* && \
-	pre-commit run --files frontend/*
+	pre-commit run --all-files
 
 docker:
 	@docker compose up --build --wait
17 changes: 17 additions & 0 deletions evaluation/Makefile
@@ -0,0 +1,17 @@
init:
	@python3 -m venv .venv && \
	. .venv/bin/activate && \
	pip install -r requirements.txt

init-dev: init
	@. .venv/bin/activate && \
	pip install -r requirements-test.txt

format:
	@. .venv/bin/activate && \
	ruff format && \
	mypy .

check:
	@. .venv/bin/activate && \
	ruff check --fix
11 changes: 11 additions & 0 deletions evaluation/README.md
@@ -0,0 +1,11 @@
# Evaluation

This module is divided into two components:
- `auto_evaluation`: Auto-evaluation scripts used to judge performance of LLMs.
- `human_evaluation`: Scripts and libraries for human evaluation of LLMs.

## Pre-requisites

```
make init
```
4 changes: 4 additions & 0 deletions evaluation/auto_evaluation/README.md
@@ -0,0 +1,4 @@
# Auto-evaluation

This directory houses the scripts needed for auto-evaluation.

1 change: 1 addition & 0 deletions evaluation/auto_evaluation/content_metrics.json
@@ -0,0 +1 @@
{"test_cases_lookup_map": {"{\"actual_output\": \"We offer a 30-day full refund at no extra cost.\", \"context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"], \"expected_output\": \"You are eligible for a 30 day full refund at no extra cost.\", \"hyperparameters\": null, \"input\": \"What if these shoes don't fit?\", \"retrieval_context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"]}": {"cached_metrics_data": [{"metric_data": {"name": "Contextual Precision", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because all relevant information was retrieved and ranked appropriately. Great job!", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context directly answers the input question about what happens if shoes don't fit by stating 'All customers are eligible for a 30 day full refund at no extra cost.'\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Recall", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the generated output perfectly reflects the information provided in node 1 in the retrieval context, regarding the 30-day full refund policy.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"This sentence is a paraphrase of the 1st node in the retrieval context, which states \\\"All customers are eligible for a 30 day full refund at no extra cost.\\\"\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the retrieval context directly addresses the user's concern about the shoes not fitting by stating that 'All customers are eligible for a 30 day full refund at no extra cost.'", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdicts\": [\n {\n \"statement\": \"All customers are eligible for a 30 day full refund at no extra cost.\",\n \"verdict\": \"yes\",\n \"reason\": null\n }\n ]\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Faithfulness", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the actual output perfectly aligns with the retrieval context, as evidenced by the absence of any contradictions.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Truths (limit=None):\n[\n \"Customers are eligible for a full refund.\",\n \"The refund period lasts 30 days.\",\n \"There is no extra cost for the refund.\"\n] \n \nClaims:\n[\n \"We offer a 30-day full refund at no extra cost.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Hallucination", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the 
actual output is fully supported by the provided context and doesn't introduce any contradictory or unsubstantiated information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The actual output agrees with the provided context. While the context mentions \\\"all customers\\\", the actual output implies the same by stating \\\"we offer\\\" a 30-day full refund at no extra cost. This phrasing suggests a general policy applicable to all customers.\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}, "{\"actual_output\": \"The capital of France is Paris.\", \"context\": null, \"expected_output\": \"Paris.\", \"hyperparameters\": null, \"input\": \"What is the capital of France?\", \"retrieval_context\": null}": {"cached_metrics_data": [{"metric_data": {"name": "Answer Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the response is perfectly relevant, addressing the input directly and completely with no irrelevant information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Statements:\n[\n \"The capital of France is Paris.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Bias", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the output demonstrates no discernible bias.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Opinions:\n[] \n \nVerdicts:\n[]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Toxicity", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the output is entirely harmless and positive.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Opinions:\n[] \n \nVerdicts:\n[]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}}}
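The cache dumps committed here (content_metrics.json above and retrieval_metrics.json below) follow deepeval's `test_cases_lookup_map` layout: each key is a serialized test case and each value holds the cached per-metric results. As a hedged illustration, not part of this commit, a small script like the following could summarize the scores in such a file; the `summarize` helper and the file path are assumptions for the example.

```
# Minimal sketch (assumption, not part of the commit): print per-metric scores
# from a deepeval cache dump such as content_metrics.json or retrieval_metrics.json.
import json


def summarize(path: str) -> None:
    with open(path) as f:
        cache = json.load(f)
    # Keys of test_cases_lookup_map are the serialized test cases themselves.
    for case_key, entry in cache["test_cases_lookup_map"].items():
        case = json.loads(case_key)
        print(f"Input: {case['input']}")
        for item in entry["cached_metrics_data"]:
            data = item["metric_data"]
            status = "PASS" if data["success"] else "FAIL"
            print(f"  {data['name']}: {data['score']:.2f} ({status})")


if __name__ == "__main__":
    summarize("content_metrics.json")
```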
15 changes: 15 additions & 0 deletions evaluation/auto_evaluation/dataset/hf_pull.py
@@ -0,0 +1,15 @@
from huggingface_hub import snapshot_download
import os

if __name__ == "__main__":
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    snapshot_download(
        "The-OpenROAD-Project/ORAssistant_Public_Evals",
        revision="main",
        local_dir=cur_dir,
        repo_type="dataset",
        ignore_patterns=[
            ".gitattributes",
            "README.md",
        ],
    )
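As a small, hedged follow-up (not part of the commit), one could verify the pull by listing the files it fetched into the dataset directory; the CSV/JSON filter below is an assumption based on the new .gitignore entries.

```
# Minimal sketch (assumption, not part of the commit): list what hf_pull.py downloaded
# so the evaluation scripts can locate the dataset files.
import os

cur_dir = os.path.dirname(os.path.abspath(__file__))
for root, _dirs, files in os.walk(cur_dir):
    for name in files:
        # Assumes the dataset ships CSV/JSON files, as the .gitignore additions suggest.
        if name.endswith((".csv", ".json")):
            print(os.path.relpath(os.path.join(root, name), cur_dir))
```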
64 changes: 64 additions & 0 deletions evaluation/auto_evaluation/demo.py
@@ -0,0 +1,64 @@
import os

from dotenv import load_dotenv
from src.models.vertex_ai import GoogleVertexAILangChain

# from src.metrics.geval import make_correctness_metric
from src.metrics.content import (
    make_bias_metric,
    make_toxicity_metric,
    make_answer_relevancy_metric,
)
from src.metrics.retrieval import (
    make_contextual_precision_metric,
    make_contextual_recall_metric,
    make_contextual_relevancy_metric,
    make_faithfulness_metric,
    make_hallucination_metric,
)
from deepeval.test_case import LLMTestCase
from deepeval import evaluate

cur_dir = os.path.dirname(__file__)
root_dir = os.path.join(cur_dir, "../../")
load_dotenv(os.path.join(root_dir, ".env"))

if __name__ == "__main__":
    model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
    print("Retrieval metrics")
    precision, recall, relevancy, faithfulness, hallucination = (
        make_contextual_precision_metric(model),
        make_contextual_recall_metric(model),
        make_contextual_relevancy_metric(model),
        make_faithfulness_metric(model),
        make_hallucination_metric(model),
    )

    test_case = LLMTestCase(
        input="What if these shoes don't fit?",
        actual_output="We offer a 30-day full refund at no extra cost.",
        expected_output="You are eligible for a 30 day full refund at no extra cost.",
        context=[
            "All customers are eligible for a 30 day full refund at no extra cost."
        ],
        retrieval_context=[
            "All customers are eligible for a 30 day full refund at no extra cost."
        ],
    )
    evaluate([test_case], [precision, recall, relevancy, faithfulness, hallucination])
    os.rename(".deepeval-cache.json", "retrieval_metrics.json")

    print("Content metrics")
    answer_relevancy, bias, toxicity = (
        make_answer_relevancy_metric(model),
        make_bias_metric(model),
        make_toxicity_metric(model),
    )

    test_case = LLMTestCase(
        input="What is the capital of France?",
        actual_output="The capital of France is Paris.",
        expected_output="Paris.",
    )
    evaluate([test_case], [answer_relevancy, bias, toxicity])
    os.rename(".deepeval-cache.json", "content_metrics.json")
1 change: 1 addition & 0 deletions evaluation/auto_evaluation/retrieval_metrics.json
@@ -0,0 +1 @@
{"test_cases_lookup_map": {"{\"actual_output\": \"We offer a 30-day full refund at no extra cost.\", \"context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"], \"expected_output\": \"You are eligible for a 30 day full refund at no extra cost.\", \"hyperparameters\": null, \"input\": \"What if these shoes don't fit?\", \"retrieval_context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"]}": {"cached_metrics_data": [{"metric_data": {"name": "Contextual Precision", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because all relevant information was retrieved and ranked appropriately. Great job!", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context directly answers the input question about what happens if shoes don't fit by stating 'All customers are eligible for a 30 day full refund at no extra cost.'\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Recall", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the generated output perfectly reflects the information provided in node 1 in the retrieval context, regarding the 30-day full refund policy.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"This sentence is a paraphrase of the 1st node in the retrieval context, which states \\\"All customers are eligible for a 30 day full refund at no extra cost.\\\"\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the retrieval context directly addresses the user's concern about the shoes not fitting by stating that 'All customers are eligible for a 30 day full refund at no extra cost.'", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdicts\": [\n {\n \"statement\": \"All customers are eligible for a 30 day full refund at no extra cost.\",\n \"verdict\": \"yes\",\n \"reason\": null\n }\n ]\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Faithfulness", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the actual output perfectly aligns with the retrieval context, as evidenced by the absence of any contradictions.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Truths (limit=None):\n[\n \"Customers are eligible for a full refund.\",\n \"The refund period lasts 30 days.\",\n \"There is no extra cost for the refund.\"\n] \n \nClaims:\n[\n \"We offer a 30-day full refund at no extra cost.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Hallucination", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the 
actual output is fully supported by the provided context and doesn't introduce any contradictory or unsubstantiated information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The actual output agrees with the provided context. While the context mentions \\\"all customers\\\", the actual output implies the same by stating \\\"we offer\\\" a 30-day full refund at no extra cost. This phrasing suggests a general policy applicable to all customers.\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}}}
Empty file.
Empty file.
36 changes: 36 additions & 0 deletions evaluation/auto_evaluation/src/metrics/content.py
@@ -0,0 +1,36 @@
from deepeval.metrics import (
    FaithfulnessMetric,
    AnswerRelevancyMetric,
    BiasMetric,
    ToxicityMetric,
)
from deepeval.models.base_model import DeepEvalBaseLLM

ANSRELEVANCY_THRESHOLD = 0.7
HALLUCINATION_THRESHOLD = 0.7
BIAS_THRESHOLD = 0.7
TOXICITY_THRESHOLD = 0.7


def make_answer_relevancy_metric(model: DeepEvalBaseLLM) -> AnswerRelevancyMetric:
    return AnswerRelevancyMetric(
        threshold=ANSRELEVANCY_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_bias_metric(model: DeepEvalBaseLLM) -> BiasMetric:
    return BiasMetric(
        threshold=BIAS_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_toxicity_metric(model: DeepEvalBaseLLM) -> ToxicityMetric:
    return ToxicityMetric(
        threshold=TOXICITY_THRESHOLD,
        model=model,
        include_reason=True,
    )
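For completeness, a hedged usage sketch (not in the commit) of these factories with deepeval's per-metric `measure()` API; `model` stands for any `DeepEvalBaseLLM` instance, e.g. the `GoogleVertexAILangChain` wrapper used in demo.py.

```
# Hedged usage sketch (assumption, not part of the commit): score a single answer
# with the answer-relevancy metric instead of the batch evaluate() call.
from deepeval.test_case import LLMTestCase

metric = make_answer_relevancy_metric(model)  # `model` is an assumed DeepEvalBaseLLM instance
test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="The capital of France is Paris.",
)
metric.measure(test_case)
print(metric.score, metric.reason)
```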
26 changes: 26 additions & 0 deletions evaluation/auto_evaluation/src/metrics/geval.py
@@ -0,0 +1,26 @@
"""
GEval metrics wrapper for DeepEval.
GEval refers to custom LLM-based metrics with non-traditional definitions (e.g. precision, recall, relevancy, etc.)
"""

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.models.base_model import DeepEvalBaseLLM


def make_correctness_metric(model: DeepEvalBaseLLM) -> GEval:
    return GEval(
        name="Correctness",
        criteria="Determine whether the actual output is factually correct based on the expected output.",
        evaluation_steps=[
            "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
            "You should also heavily penalize omission of detail",
            "Vague language, or contradicting OPINIONS, are OK",
        ],
        evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
        model=model,
    )


if __name__ == "__main__":
    pass
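A hedged usage sketch (not part of the commit) showing how the correctness metric above might be applied to a single test case; `model` is assumed to be a `DeepEvalBaseLLM` instance such as the Vertex AI wrapper.

```
# Hedged usage sketch (assumption, not part of the commit): evaluate correctness of an
# answer against an expected output with the GEval metric defined above.
from deepeval.test_case import LLMTestCase

correctness = make_correctness_metric(model)  # `model` is an assumed DeepEvalBaseLLM instance
test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    expected_output="You are eligible for a 30 day full refund at no extra cost.",
)
correctness.measure(test_case)
print(correctness.score, correctness.reason)
```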
62 changes: 62 additions & 0 deletions evaluation/auto_evaluation/src/metrics/retrieval.py
@@ -0,0 +1,62 @@
from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric,
    FaithfulnessMetric,
    HallucinationMetric,
)
from deepeval.models.base_model import DeepEvalBaseLLM

PRECISION_THRESHOLD = 0.7
RECALL_THRESHOLD = 0.7
RELEVANCY_THRESHOLD = 0.7
FAITHFULNESS_THRESHOLD = 0.7
HALLUCINATION_THRESHOLD = 0.7


def make_contextual_precision_metric(
    model: DeepEvalBaseLLM,
) -> ContextualPrecisionMetric:
    return ContextualPrecisionMetric(
        threshold=PRECISION_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_contextual_recall_metric(model: DeepEvalBaseLLM) -> ContextualRecallMetric:
    return ContextualRecallMetric(
        threshold=RECALL_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_contextual_relevancy_metric(
    model: DeepEvalBaseLLM,
) -> ContextualRelevancyMetric:
    return ContextualRelevancyMetric(
        threshold=RELEVANCY_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_faithfulness_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
    return FaithfulnessMetric(
        threshold=FAITHFULNESS_THRESHOLD,
        model=model,
        include_reason=True,
    )


def make_hallucination_metric(model: DeepEvalBaseLLM) -> HallucinationMetric:
    return HallucinationMetric(
        threshold=HALLUCINATION_THRESHOLD,
        model=model,
        include_reason=True,
    )


if __name__ == "__main__":
    pass
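As a hedged convenience sketch (not in the commit), the five factories above could be bundled so callers such as demo.py request the whole retrieval suite in one call; `make_retrieval_metrics` is an assumed helper name.

```
# Hedged sketch (assumption, not part of the commit): build the full retrieval metric suite.
from deepeval.models.base_model import DeepEvalBaseLLM


def make_retrieval_metrics(model: DeepEvalBaseLLM) -> list:
    return [
        make_contextual_precision_metric(model),
        make_contextual_recall_metric(model),
        make_contextual_relevancy_metric(model),
        make_faithfulness_metric(model),
        make_hallucination_metric(model),
    ]
```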
Empty file.
