From 01802990d8e929c4125e8473efde994d6674cd32 Mon Sep 17 00:00:00 2001 From: ethan-tonic Date: Wed, 8 May 2024 04:11:51 -0400 Subject: [PATCH] Added monitoring --- tonic_validate/__init__.py | 3 + tonic_validate/metrics/__init__.py | 4 +- .../answer_consistency_binary_metric.py | 20 +++- .../metrics/answer_consistency_metric.py | 16 ++- .../metrics/answer_contains_pii_metric.py | 19 +++- tonic_validate/metrics/answer_match_metric.py | 24 ++++- .../metrics/answer_similarity_metric.py | 25 ++++- .../metrics/augmentation_accuracy_metric.py | 25 ++++- .../metrics/augmentation_precision_metric.py | 16 ++- tonic_validate/metrics/binary_metric.py | 19 +++- .../metrics/contains_text_metric.py | 24 ++++- .../metrics/context_contains_pii_metric.py | 19 +++- .../metrics/context_length_metric.py | 24 ++++- tonic_validate/metrics/duplication_metric.py | 24 +++-- .../metrics/hate_speech_content_metric.py | 20 +++- tonic_validate/metrics/latency_metric.py | 15 ++- tonic_validate/metrics/metric.py | 31 +++++- tonic_validate/metrics/regex_metric.py | 24 ++++- .../metrics/response_length_metric.py | 24 ++++- .../metrics/retrieval_precision_metric.py | 25 ++++- tonic_validate/validate_monitorer.py | 100 ++++++++++++++++++ tonic_validate/validate_scorer.py | 38 ++++--- 22 files changed, 472 insertions(+), 67 deletions(-) create mode 100644 tonic_validate/validate_monitorer.py diff --git a/tonic_validate/__init__.py b/tonic_validate/__init__.py index 7a49dab..b189e44 100644 --- a/tonic_validate/__init__.py +++ b/tonic_validate/__init__.py @@ -1,5 +1,7 @@ from .validate_api import ValidateApi from .validate_scorer import ValidateScorer +from .validate_monitorer import ValidateMonitorer + from .classes import ( Benchmark, BenchmarkItem, @@ -14,6 +16,7 @@ __all__ = [ "ValidateApi", "ValidateScorer", + "ValidateMonitorer", "Benchmark", "BenchmarkItem", "LLMResponse", diff --git a/tonic_validate/metrics/__init__.py b/tonic_validate/metrics/__init__.py index 3172421..74c6cdc 100644 --- a/tonic_validate/metrics/__init__.py +++ b/tonic_validate/metrics/__init__.py @@ -15,6 +15,7 @@ from .latency_metric import LatencyMetric from .context_contains_pii_metric import ContextContainsPiiMetric from .answer_contains_pii_metric import AnswerContainsPiiMetric +from .metric import Metric __all__ = [ "AnswerConsistencyBinaryMetric", @@ -33,5 +34,6 @@ "HateSpeechContentMetric", "LatencyMetric", "ContextContainsPiiMetric", - "AnswerContainsPiiMetric" + "AnswerContainsPiiMetric", + "Metric", ] diff --git a/tonic_validate/metrics/answer_consistency_binary_metric.py b/tonic_validate/metrics/answer_consistency_binary_metric.py index d506ce8..fddbe67 100644 --- a/tonic_validate/metrics/answer_consistency_binary_metric.py +++ b/tonic_validate/metrics/answer_consistency_binary_metric.py @@ -1,11 +1,15 @@ import logging -from typing import Union +from typing import Any, Dict, Union from tonic_validate.classes.llm_response import LLMResponse from tonic_validate.metrics.binary_metric import BinaryMetric +from tonic_validate.metrics.metric import Metric, MetricRequirement from tonic_validate.utils.metrics_util import parse_boolean_response from tonic_validate.services.openai_service import OpenAIService from tonic_validate.services.litellm_service import LiteLLMService -from tonic_validate.utils.llm_calls import answer_consistent_with_context_call, context_consistency_prompt +from tonic_validate.utils.llm_calls import ( + answer_consistent_with_context_call, + context_consistency_prompt, +) logger = logging.getLogger() @@ -13,6 +17,7 @@ 
class AnswerConsistencyBinaryMetric(BinaryMetric): name: str = "answer_consistency_binary" prompt: str = context_consistency_prompt() + requirements = {MetricRequirement.LLM_ANSWER, MetricRequirement.LLM_CONTEXT} def __init__(self): """ @@ -21,8 +26,17 @@ def __init__(self): """ super().__init__(self.name, self.metric_callback) + def serialize_config(self): + return {} + + @staticmethod + def from_config(config: Dict[str, Any]) -> Metric: + return AnswerConsistencyBinaryMetric() + async def metric_callback( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> bool: """Check if answer is consistent with context. diff --git a/tonic_validate/metrics/answer_consistency_metric.py b/tonic_validate/metrics/answer_consistency_metric.py index 1080e1e..1d43ca8 100644 --- a/tonic_validate/metrics/answer_consistency_metric.py +++ b/tonic_validate/metrics/answer_consistency_metric.py @@ -1,7 +1,7 @@ import logging -from typing import Union +from typing import Any, Dict, Union from tonic_validate.classes.llm_response import LLMResponse -from tonic_validate.metrics.metric import Metric +from tonic_validate.metrics.metric import Metric, MetricRequirement from tonic_validate.utils.metrics_util import ( parse_boolean_response, parse_bullet_list_response, @@ -27,6 +27,7 @@ class AnswerConsistencyMetric(Metric): f"{statement_derived_from_context_prompt(statement='EXAMPLE STATEMENT', context_list=[])}\n" "-------------------\n" ) + requirements = {MetricRequirement.LLM_ANSWER, MetricRequirement.LLM_CONTEXT} def __init__(self): """ @@ -35,8 +36,17 @@ def __init__(self): """ pass + def serialize_config(self): + return {} + + @staticmethod + def from_config(config: Dict[str, Any]) -> Metric: + return AnswerConsistencyMetric() + async def score( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> float: main_points_response = await main_points_call( llm_response.llm_answer, llm_service diff --git a/tonic_validate/metrics/answer_contains_pii_metric.py b/tonic_validate/metrics/answer_contains_pii_metric.py index 61669ba..1b960bf 100644 --- a/tonic_validate/metrics/answer_contains_pii_metric.py +++ b/tonic_validate/metrics/answer_contains_pii_metric.py @@ -1,13 +1,16 @@ import os import requests -from typing import List, Optional, Union +from typing import Any, Dict, List, Optional, Union from tonic_validate.classes.llm_response import LLMResponse from tonic_validate.metrics.binary_metric import BinaryMetric +from tonic_validate.metrics.metric import Metric, MetricRequirement from tonic_validate.services.openai_service import OpenAIService from tonic_validate.services.litellm_service import LiteLLMService class AnswerContainsPiiMetric(BinaryMetric): + requirements = {MetricRequirement.LLM_ANSWER} + def __init__(self, pii_types: List[str], textual_api_key: Optional[str] = None): """ Checks to see if PII is contained in the RAG provided answer. The types of PII looked for are found in the pii_types list. 
@@ -40,8 +43,20 @@ def __init__(self, pii_types: List[str], textual_api_key: Optional[str] = None): super().__init__("answer_contains_pii", self.metric_callback) + def serialize_config(self): + return {"pii_types": self.pii_types, "textual_api_key": self.textual.api_key} + + @staticmethod + def from_config(config: Dict[str, Any]) -> Metric: + return AnswerContainsPiiMetric( + pii_types=config["pii_types"], + textual_api_key=config["textual_api_key"], + ) + def metric_callback( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> bool: try: response = self.textual.redact(llm_response.llm_answer) diff --git a/tonic_validate/metrics/answer_match_metric.py b/tonic_validate/metrics/answer_match_metric.py index aeba71f..3bab445 100644 --- a/tonic_validate/metrics/answer_match_metric.py +++ b/tonic_validate/metrics/answer_match_metric.py @@ -1,8 +1,9 @@ import logging -from typing import Union +from typing import Any, Dict, Union from tonic_validate.classes.llm_response import LLMResponse from tonic_validate.metrics.binary_metric import BinaryMetric +from tonic_validate.metrics.metric import Metric, MetricRequirement from tonic_validate.services.openai_service import OpenAIService from tonic_validate.services.litellm_service import LiteLLMService @@ -10,6 +11,8 @@ class AnswerMatchMetric(BinaryMetric): + requirements = {MetricRequirement.LLM_ANSWER} + def __init__(self, name: str, answer: str, case_sensitive: bool = False): """ Create a metric that checks if the answer matches a given string. @@ -28,8 +31,25 @@ def __init__(self, name: str, answer: str, case_sensitive: bool = False): self.answer = answer self.case_sensitive = case_sensitive + def serialize_config(self): + return { + "name": self.name, + "answer": self.answer, + "case_sensitive": self.case_sensitive, + } + + @staticmethod + def from_config(config: Dict[str, Any]) -> Metric: + return AnswerMatchMetric( + name=config["name"], + answer=config["answer"], + case_sensitive=config["case_sensitive"], + ) + def metric_callback( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> bool: if self.case_sensitive: return self.answer == llm_response.llm_answer diff --git a/tonic_validate/metrics/answer_similarity_metric.py b/tonic_validate/metrics/answer_similarity_metric.py index a967706..533be4a 100644 --- a/tonic_validate/metrics/answer_similarity_metric.py +++ b/tonic_validate/metrics/answer_similarity_metric.py @@ -1,10 +1,13 @@ import logging -from typing import Union +from typing import Any, Dict, Union from tonic_validate.classes.llm_response import LLMResponse -from tonic_validate.metrics.metric import Metric +from tonic_validate.metrics.metric import Metric, MetricRequirement from tonic_validate.services.openai_service import OpenAIService from tonic_validate.services.litellm_service import LiteLLMService -from tonic_validate.utils.llm_calls import similarity_score_call, similarity_score_prompt +from tonic_validate.utils.llm_calls import ( + similarity_score_call, + similarity_score_prompt, +) logger = logging.getLogger() @@ -12,6 +15,11 @@ class AnswerSimilarityMetric(Metric): name: str = "answer_similarity" prompt: str = similarity_score_prompt() + requirements = { + MetricRequirement.QUESTION, + MetricRequirement.REFERENCE_ANSWER, + MetricRequirement.LLM_ANSWER, + } def 
__init__(self) -> None: """ @@ -20,8 +28,17 @@ def __init__(self) -> None: """ pass + def serialize_config(self): + return {} + + @staticmethod + def from_config(config: Dict[str, Any]) -> Metric: + return AnswerSimilarityMetric() + async def score( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> float: # Check that the benchmark item has an answer if llm_response.benchmark_item.answer is None: diff --git a/tonic_validate/metrics/augmentation_accuracy_metric.py b/tonic_validate/metrics/augmentation_accuracy_metric.py index 4065d87..e981498 100644 --- a/tonic_validate/metrics/augmentation_accuracy_metric.py +++ b/tonic_validate/metrics/augmentation_accuracy_metric.py @@ -1,11 +1,14 @@ import logging -from typing import List, Tuple, Union +from typing import Any, Dict, List, Tuple, Union from tonic_validate.classes.llm_response import LLMResponse -from tonic_validate.metrics.metric import Metric +from tonic_validate.metrics.metric import Metric, MetricRequirement from tonic_validate.utils.metrics_util import parse_boolean_response from tonic_validate.services.openai_service import OpenAIService from tonic_validate.services.litellm_service import LiteLLMService -from tonic_validate.utils.llm_calls import answer_contains_context_call, answer_contains_context_prompt +from tonic_validate.utils.llm_calls import ( + answer_contains_context_call, + answer_contains_context_prompt, +) logger = logging.getLogger() @@ -13,6 +16,7 @@ class AugmentationAccuracyMetric(Metric): name: str = "augmentation_accuracy" prompt: str = answer_contains_context_prompt() + requirements = {MetricRequirement.LLM_ANSWER, MetricRequirement.LLM_CONTEXT} def __init__(self): """ @@ -21,13 +25,24 @@ def __init__(self): """ pass + def serialize_config(self): + return {} + + @staticmethod + def from_config(config: Dict[str, Any]) -> Metric: + return AugmentationAccuracyMetric() + async def score( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> float: return (await self.calculate_metric(llm_response, llm_service))[0] async def calculate_metric( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> Tuple[float, List[bool]]: contains_context_list: List[bool] = [] if len(llm_response.llm_context_list) == 0: diff --git a/tonic_validate/metrics/augmentation_precision_metric.py b/tonic_validate/metrics/augmentation_precision_metric.py index 34fded3..ca3fc68 100644 --- a/tonic_validate/metrics/augmentation_precision_metric.py +++ b/tonic_validate/metrics/augmentation_precision_metric.py @@ -1,5 +1,5 @@ import logging -from typing import List, Union +from typing import Any, Dict, List, Union from tonic_validate.classes.llm_response import LLMResponse from tonic_validate.metrics.augmentation_accuracy_metric import ( AugmentationAccuracyMetric, @@ -14,6 +14,9 @@ class AugmentationPrecisionMetric(Metric): name: str = "augmentation_precision" + requirements = AugmentationAccuracyMetric.requirements.union( + RetrievalPrecisionMetric.requirements + ) def __init__(self) -> None: """ @@ -23,8 +26,17 @@ def __init__(self) -> None: self.augmentation_accuracy = AugmentationAccuracyMetric() self.retrieval_precision = RetrievalPrecisionMetric() + def 
serialize_config(self): + return {} + + @staticmethod + def from_config(config: Dict[str, Any]) -> Metric: + return AugmentationPrecisionMetric() + async def score( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> float: retrieval_precision_score = await self.retrieval_precision.calculate_metric( llm_response, llm_service diff --git a/tonic_validate/metrics/binary_metric.py b/tonic_validate/metrics/binary_metric.py index 48e0d7a..5dbebc2 100644 --- a/tonic_validate/metrics/binary_metric.py +++ b/tonic_validate/metrics/binary_metric.py @@ -1,5 +1,5 @@ import logging -from typing import Awaitable, Callable, Union +from typing import Any, Awaitable, Callable, Dict, Union from tonic_validate.classes.llm_response import LLMResponse from tonic_validate.metrics.metric import Metric @@ -18,7 +18,10 @@ def name(self) -> str: def __init__( self, name: str, - callback: Callable[[LLMResponse, Union[LiteLLMService, OpenAIService]], Union[Awaitable[bool], bool]], + callback: Callable[ + [LLMResponse, Union[LiteLLMService, OpenAIService]], + Union[Awaitable[bool], bool], + ], ): """ Create a binary metric with a name and a callback. A binary metric returns either True (1) or False (0). @@ -31,12 +34,20 @@ def __init__( The callback that takes an LLMResponse and an OpenAIService and returns a boolean. The callback can be either an async function or a regular function. """ - self._name = name self.callback = callback + def serialize_config(self): + raise NotImplementedError("Cannot serialize a custom binary metric") + + @staticmethod + def from_config(config: Dict[str, Any]) -> Metric: + raise NotImplementedError("Cannot deserialize a custom binary metric") + async def score( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> float: if inspect.iscoroutinefunction(self.callback): result = await self.callback(llm_response, llm_service) diff --git a/tonic_validate/metrics/contains_text_metric.py b/tonic_validate/metrics/contains_text_metric.py index a5d66bd..1c743c4 100644 --- a/tonic_validate/metrics/contains_text_metric.py +++ b/tonic_validate/metrics/contains_text_metric.py @@ -1,8 +1,9 @@ import logging -from typing import List, Optional, Union +from typing import Any, Dict, List, Optional, Union from tonic_validate.classes.llm_response import LLMResponse from tonic_validate.metrics.binary_metric import BinaryMetric +from tonic_validate.metrics.metric import Metric, MetricRequirement from tonic_validate.services.openai_service import OpenAIService from tonic_validate.services.litellm_service import LiteLLMService @@ -12,6 +13,8 @@ class ContainsTextMetric(BinaryMetric): """Checks whether or not response contains the given text.""" + requirements = {MetricRequirement.LLM_ANSWER} + def __init__( self, name: str, @@ -35,8 +38,25 @@ def __init__( self.text = text self.case_sensitive = case_sensitive + def serialize_config(self): + return { + "name": self.name, + "text": self.text, + "case_sensitive": self.case_sensitive, + } + + @staticmethod + def from_config(config: Dict[str, Any]) -> Metric: + return ContainsTextMetric( + name=config["name"], + text=config["text"], + case_sensitive=config["case_sensitive"], + ) + def metric_callback( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: 
LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> bool: if isinstance(self.text, list): return all(self.contains_text(llm_response, text) for text in self.text) diff --git a/tonic_validate/metrics/context_contains_pii_metric.py b/tonic_validate/metrics/context_contains_pii_metric.py index 3f9b2a9..b0e9276 100644 --- a/tonic_validate/metrics/context_contains_pii_metric.py +++ b/tonic_validate/metrics/context_contains_pii_metric.py @@ -1,13 +1,16 @@ import os import requests -from typing import List, Optional, Union +from typing import Any, Dict, List, Optional, Union from tonic_validate.classes.llm_response import LLMResponse from tonic_validate.metrics.binary_metric import BinaryMetric +from tonic_validate.metrics.metric import Metric, MetricRequirement from tonic_validate.services.openai_service import OpenAIService from tonic_validate.services.litellm_service import LiteLLMService class ContextContainsPiiMetric(BinaryMetric): + requirements = {MetricRequirement.LLM_CONTEXT} + def __init__(self, pii_types: List[str], textual_api_key: Optional[str] = None): """ Checks to see if PII is contained in the RAG provided context. The types of PII looked for are found in the pii_types list. @@ -40,8 +43,20 @@ def __init__(self, pii_types: List[str], textual_api_key: Optional[str] = None): super().__init__("context_contains_pii", self.metric_callback) + def serialize_config(self): + return {"pii_types": self.pii_types, "textual_api_key": self.textual.api_key} + + @staticmethod + def from_config(config: Dict[str, Any]) -> Metric: + return ContextContainsPiiMetric( + pii_types=config["pii_types"], + textual_api_key=config["textual_api_key"], + ) + def metric_callback( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> bool: try: response = self.textual.redact("\n".join(llm_response.llm_context_list)) diff --git a/tonic_validate/metrics/context_length_metric.py b/tonic_validate/metrics/context_length_metric.py index f64382e..f3fe33e 100644 --- a/tonic_validate/metrics/context_length_metric.py +++ b/tonic_validate/metrics/context_length_metric.py @@ -1,8 +1,9 @@ import logging -from typing import Optional, Union +from typing import Any, Dict, Optional, Union from tonic_validate.classes.llm_response import LLMResponse from tonic_validate.metrics.binary_metric import BinaryMetric +from tonic_validate.metrics.metric import Metric, MetricRequirement from tonic_validate.services.openai_service import OpenAIService from tonic_validate.services.litellm_service import LiteLLMService @@ -12,6 +13,8 @@ class ContextLengthMetric(BinaryMetric): """Checks that context length is within a certain range.""" + requirements = {MetricRequirement.LLM_CONTEXT} + def __init__( self, name: str, @@ -35,8 +38,25 @@ def __init__( self.min_length = min_length self.max_length = max_length + def serialize_config(self): + return { + "name": self.name, + "min_length": self.min_length, + "max_length": self.max_length, + } + + @staticmethod + def from_config(config: Dict[str, Any]) -> Metric: + return ContextLengthMetric( + name=config["name"], + min_length=config["min_length"], + max_length=config["max_length"], + ) + def metric_callback( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> bool: # For all items in the context list, check if the length is within 
the min and max length return all( diff --git a/tonic_validate/metrics/duplication_metric.py b/tonic_validate/metrics/duplication_metric.py index 14df375..6e62980 100644 --- a/tonic_validate/metrics/duplication_metric.py +++ b/tonic_validate/metrics/duplication_metric.py @@ -1,11 +1,15 @@ import logging -from typing import Union +from typing import Any, Dict, Union from tonic_validate.classes.llm_response import LLMResponse from tonic_validate.metrics.binary_metric import BinaryMetric +from tonic_validate.metrics.metric import Metric, MetricRequirement from tonic_validate.services.openai_service import OpenAIService from tonic_validate.services.litellm_service import LiteLLMService -from tonic_validate.utils.llm_calls import contains_duplicate_information, contains_duplicate_info_prompt +from tonic_validate.utils.llm_calls import ( + contains_duplicate_information, + contains_duplicate_info_prompt, +) from tonic_validate.utils.metrics_util import parse_boolean_response logger = logging.getLogger() @@ -14,6 +18,7 @@ class DuplicationMetric(BinaryMetric): name: str = "duplication_metric" prompt: str = contains_duplicate_info_prompt() + requirements = {MetricRequirement.LLM_ANSWER} def __init__(self): """ @@ -22,11 +27,18 @@ def __init__(self): """ super().__init__(self.name, self.metric_callback) + def serialize_config(self): + return {} + + @staticmethod + def from_config(config: Dict[str, Any]) -> Metric: + return DuplicationMetric() + async def metric_callback( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> bool: return parse_boolean_response( - await contains_duplicate_information( - llm_response.llm_answer, llm_service - ) + await contains_duplicate_information(llm_response.llm_answer, llm_service) ) diff --git a/tonic_validate/metrics/hate_speech_content_metric.py b/tonic_validate/metrics/hate_speech_content_metric.py index c58518b..0e61edf 100644 --- a/tonic_validate/metrics/hate_speech_content_metric.py +++ b/tonic_validate/metrics/hate_speech_content_metric.py @@ -1,11 +1,15 @@ import logging -from typing import Union +from typing import Any, Dict, Union from tonic_validate.classes.llm_response import LLMResponse from tonic_validate.metrics.binary_metric import BinaryMetric +from tonic_validate.metrics.metric import Metric, MetricRequirement from tonic_validate.services.openai_service import OpenAIService from tonic_validate.services.litellm_service import LiteLLMService -from tonic_validate.utils.llm_calls import contains_hate_speech, contains_hate_speech_prompt +from tonic_validate.utils.llm_calls import ( + contains_hate_speech, + contains_hate_speech_prompt, +) from tonic_validate.utils.metrics_util import parse_boolean_response logger = logging.getLogger() @@ -14,6 +18,7 @@ class HateSpeechContentMetric(BinaryMetric): name: str = "hate_speech_content" prompt: str = contains_hate_speech_prompt() + requirements = {MetricRequirement.LLM_ANSWER} def __init__(self): """ @@ -22,8 +27,17 @@ def __init__(self): """ super().__init__(self.name, self.metric_callback) + def serialize_config(self): + return {} + + @staticmethod + def from_config(config: Dict[str, Any]) -> Metric: + return HateSpeechContentMetric() + async def metric_callback( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> bool: return parse_boolean_response( 
await contains_hate_speech(llm_response.llm_answer, llm_service) diff --git a/tonic_validate/metrics/latency_metric.py b/tonic_validate/metrics/latency_metric.py index c1c2ae9..f02bef1 100644 --- a/tonic_validate/metrics/latency_metric.py +++ b/tonic_validate/metrics/latency_metric.py @@ -1,7 +1,8 @@ import logging -from typing import Union +from typing import Any, Dict, Union from tonic_validate.classes.llm_response import LLMResponse from tonic_validate.metrics.binary_metric import BinaryMetric +from tonic_validate.metrics.metric import Metric, MetricRequirement from tonic_validate.services.openai_service import OpenAIService from tonic_validate.services.litellm_service import LiteLLMService @@ -10,6 +11,7 @@ class LatencyMetric(BinaryMetric): name: str = "latency_metric" + requirements = {MetricRequirement.LLM_RUN_TIME} def __init__(self, target_time: float = 5.0) -> None: """ @@ -23,9 +25,18 @@ def __init__(self, target_time: float = 5.0) -> None: """ self.target_time = target_time + def serialize_config(self): + return {"target_time": self.target_time} + + @staticmethod + def from_config(config: Dict[str, Any]) -> Metric: + return LatencyMetric(target_time=config["target_time"]) + # We do async here for consistency even though this method doesn't use async async def score( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> float: # Check that llm_response.run_time is not None if llm_response.run_time is None: diff --git a/tonic_validate/metrics/metric.py b/tonic_validate/metrics/metric.py index dbf0635..4d93dc3 100644 --- a/tonic_validate/metrics/metric.py +++ b/tonic_validate/metrics/metric.py @@ -1,26 +1,51 @@ from abc import ABC, abstractmethod -from typing import Optional, Union +from typing import Any, Dict, Optional, Set, Union +from enum import Enum from tonic_validate.classes.llm_response import LLMResponse from tonic_validate.services.openai_service import OpenAIService from tonic_validate.services.litellm_service import LiteLLMService +class MetricRequirement(str, Enum): + QUESTION = "QUESTION" + REFERENCE_ANSWER = "REFERENCE_ANSWER" + LLM_ANSWER = "LLM_ANSWER" + LLM_CONTEXT = "LLM_CONTEXT" + LLM_RUN_TIME = "LLM_RUN_TIME" + + class Metric(ABC): """Abstract class for a metric that can be calculated on an LLM response.""" - """Prompt for the metric. 
Can be overridden by subclasses if a specific prompt is needed.""" + # Prompt for the metric prompt: Optional[str] = None + # List of requirements for the metric + requirements: Set[MetricRequirement] + @property @abstractmethod def name(self) -> str: """Metric name for the UI""" pass + @staticmethod + @abstractmethod + def from_config(config: Dict[str, Any]) -> "Metric": + """Creates a metric object from a JSON object""" + pass + + @abstractmethod + def serialize_config(self) -> Dict[str, Any]: + """Serializes the metric configuration to a JSON object""" + pass + @abstractmethod async def score( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> float: """Calculate the score of the metric""" pass diff --git a/tonic_validate/metrics/regex_metric.py b/tonic_validate/metrics/regex_metric.py index acc62cf..c2e0f8a 100644 --- a/tonic_validate/metrics/regex_metric.py +++ b/tonic_validate/metrics/regex_metric.py @@ -1,9 +1,10 @@ import logging import re -from typing import Union +from typing import Any, Dict, Union from tonic_validate.classes.llm_response import LLMResponse from tonic_validate.metrics.binary_metric import BinaryMetric +from tonic_validate.metrics.metric import Metric, MetricRequirement from tonic_validate.services.openai_service import OpenAIService from tonic_validate.services.litellm_service import LiteLLMService @@ -11,6 +12,8 @@ class RegexMetric(BinaryMetric): + requirements = {MetricRequirement.LLM_ANSWER} + def __init__(self, name: str, pattern: str, match_count: int = 1): """ Creates a binary metric that checks whether the answer matches a given regex pattern. @@ -29,8 +32,25 @@ def __init__(self, name: str, pattern: str, match_count: int = 1): self.pattern = pattern self.match_count = match_count + def serialize_config(self): + return { + "name": self.name, + "pattern": self.pattern, + "match_count": self.match_count, + } + + @staticmethod + def from_config(config: Dict[str, Any]) -> Metric: + return RegexMetric( + name=config["name"], + pattern=config["pattern"], + match_count=config["match_count"], + ) + def metric_callback( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> bool: return self.match_count == len( re.findall(self.pattern, llm_response.llm_answer) diff --git a/tonic_validate/metrics/response_length_metric.py b/tonic_validate/metrics/response_length_metric.py index 3f3d837..8168367 100644 --- a/tonic_validate/metrics/response_length_metric.py +++ b/tonic_validate/metrics/response_length_metric.py @@ -1,8 +1,9 @@ import logging -from typing import Optional, Union +from typing import Any, Dict, Optional, Union from tonic_validate.classes.llm_response import LLMResponse from tonic_validate.metrics.binary_metric import BinaryMetric +from tonic_validate.metrics.metric import Metric, MetricRequirement from tonic_validate.services.openai_service import OpenAIService from tonic_validate.services.litellm_service import LiteLLMService @@ -10,6 +11,8 @@ class ResponseLengthMetric(BinaryMetric): + requirements = {MetricRequirement.LLM_ANSWER} + def __init__( self, name: str, @@ -33,8 +36,25 @@ def __init__( self.min_length = min_length self.max_length = max_length + def serialize_config(self): + return { + "name": self.name, + "min_length": self.min_length, + "max_length": self.max_length, + } + + @staticmethod + 
def from_config(config: Dict[str, Any]) -> Metric: + return ResponseLengthMetric( + name=config["name"], + min_length=config["min_length"], + max_length=config["max_length"], + ) + def metric_callback( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> bool: if self.min_length and len(llm_response.llm_answer) < self.min_length: return False diff --git a/tonic_validate/metrics/retrieval_precision_metric.py b/tonic_validate/metrics/retrieval_precision_metric.py index 79a1ff5..fede1e2 100644 --- a/tonic_validate/metrics/retrieval_precision_metric.py +++ b/tonic_validate/metrics/retrieval_precision_metric.py @@ -1,10 +1,13 @@ import logging -from typing import List, Tuple, Union +from typing import Any, Dict, List, Tuple, Union from tonic_validate.classes.llm_response import LLMResponse -from tonic_validate.metrics.metric import Metric +from tonic_validate.metrics.metric import Metric, MetricRequirement from tonic_validate.utils.metrics_util import parse_boolean_response from tonic_validate.services.openai_service import OpenAIService -from tonic_validate.utils.llm_calls import context_relevancy_call, context_relevancy_prompt +from tonic_validate.utils.llm_calls import ( + context_relevancy_call, + context_relevancy_prompt, +) from tonic_validate.services.litellm_service import LiteLLMService logger = logging.getLogger() @@ -13,6 +16,7 @@ class RetrievalPrecisionMetric(Metric): name: str = "retrieval_precision" prompt: str = context_relevancy_prompt() + requirements = {MetricRequirement.QUESTION, MetricRequirement.LLM_CONTEXT} def __init__(self): """ @@ -21,13 +25,24 @@ def __init__(self): """ pass + def serialize_config(self): + return {} + + @staticmethod + def from_config(config: Dict[str, Any]) -> Metric: + return RetrievalPrecisionMetric() + async def score( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> float: return (await self.calculate_metric(llm_response, llm_service))[0] async def calculate_metric( - self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService] + self, + llm_response: LLMResponse, + llm_service: Union[LiteLLMService, OpenAIService], ) -> Tuple[float, List[bool]]: if len(llm_response.llm_context_list) == 0: raise ValueError( diff --git a/tonic_validate/validate_monitorer.py b/tonic_validate/validate_monitorer.py new file mode 100644 index 0000000..1aad2f4 --- /dev/null +++ b/tonic_validate/validate_monitorer.py @@ -0,0 +1,100 @@ +from typing import Any, Dict, List, Optional + +from pydantic import ConfigDict, validate_call +import logging + +from tonic_validate.config import Config +from tonic_validate.metrics import ( + AnswerSimilarityMetric, + RetrievalPrecisionMetric, + AugmentationPrecisionMetric, + AnswerConsistencyMetric, +) + +from tonic_validate.metrics.metric import Metric, MetricRequirement +from tonic_validate.utils.http_client import HttpClient +from tonic_validate.utils.telemetry import Telemetry + +logger = logging.getLogger() + + +class ValidateMonitorer: + DEFAULT_PARALLELISM_CALLBACK = 1 + DEFAULT_PARALLELISM_SCORING = 50 + + @validate_call(config=ConfigDict(arbitrary_types_allowed=True)) + def __init__( + self, + metrics: List[Metric] = [ + RetrievalPrecisionMetric(), + AugmentationPrecisionMetric(), + AnswerConsistencyMetric(), + ], + api_key: Optional[str] = None, + 
quiet: bool = False, + ): + """ + Create a Tonic Validate scorer that can work with either OpenAIService or LiteLLMService. + + Parameters + ---------- + metrics: List[Metric] + The list of metrics to be used for scoring. + quiet: bool + If True, will suppress all logging except errors. + """ + self.metrics = metrics + self.quiet = quiet + logger.setLevel(logging.ERROR if quiet else logging.INFO) + + self.config = Config() + if api_key is None: + api_key = self.config.TONIC_VALIDATE_API_KEY + if api_key is None: + exception_message = ( + "No api key provided. Please provide an api key or set " + "TONIC_VALIDATE_API_KEY environment variable." + ) + raise Exception(exception_message) + self.client = HttpClient(self.config.TONIC_VALIDATE_BASE_URL, api_key) + try: + telemetry = Telemetry(api_key) + telemetry.link_user() + except Exception as _: + pass + + def check_metric_requirements(self): + if any( + MetricRequirement.REFERENCE_ANSWER in metric.requirements + for metric in self.metrics + ): + raise ValueError("This metric is not supported for monitoring") + + @validate_call(config=ConfigDict(arbitrary_types_allowed=True)) + def log( + self, + project_id: str, + question: str, + answer: str, + context_list: Optional[List[str]], + log_metadata: Optional[Dict[str, Any]] = {}, + tags: Optional[List[str]] = [], + ): + self.check_metric_requirements() + config: Dict[str, Dict[str, Any]] = dict() + for metric in self.metrics: + # Get class name for metric + cls_name = metric.__class__.__name__ + config[cls_name] = metric.serialize_config() + response = self.client.http_post( + f"/projects/{project_id}/monitoring/jobs", + data={ + "reference_question": question, + "llm_answer": answer, + "llm_context": context_list, + "log_metadata": log_metadata, + "tags": tags, + "metrics_config": config, + }, + ) + return response["id"] diff --git a/tonic_validate/validate_scorer.py b/tonic_validate/validate_scorer.py index 893a979..7758ac6 100644 --- a/tonic_validate/validate_scorer.py +++ b/tonic_validate/validate_scorer.py @@ -2,7 +2,7 @@ import asyncio from collections import defaultdict from concurrent.futures import ThreadPoolExecutor -from typing import Awaitable, Callable, DefaultDict, List, Dict, Union +from typing import Any, Awaitable, Callable, DefaultDict, List, Dict, Type, Union from pydantic import ConfigDict, TypeAdapter, validate_call from tonic_validate.classes.benchmark import Benchmark, BenchmarkItem @@ -11,13 +11,7 @@ from tonic_validate.classes.llm_response import CallbackLLMResponse, LLMResponse from tonic_validate.classes.run import Run, RunData -from tonic_validate.metrics.answer_consistency_metric import AnswerConsistencyMetric -from tonic_validate.metrics.answer_similarity_metric import AnswerSimilarityMetric -from tonic_validate.metrics.augmentation_precision_metric import ( - AugmentationPrecisionMetric, -) - -from tonic_validate.metrics.metric import Metric +import tonic_validate.metrics as tonic_metrics from tonic_validate.services.openai_service import OpenAIService from tonic_validate.services.litellm_service import LiteLLMService import tiktoken @@ -29,6 +23,17 @@ logger = logging.getLogger() CallbackValidator = TypeAdapter(CallbackLLMResponse) +# Gets a list of all the metric names +metric_dict: Dict[str, Type[tonic_metrics.Metric]] = {} +for metric in tonic_metrics.__all__: + cls = getattr(tonic_metrics, metric) + if not issubclass(cls, tonic_metrics.Metric): + continue + try: + metric_dict[cls.__name__] = cls + except AttributeError: + print(f"The Metric {metric} does not 
have a '__name__' attribute.") + class ValidateScorer: DEFAULT_PARALLELISM_CALLBACK = 1 @@ -37,10 +42,10 @@ class ValidateScorer: @validate_call(config=ConfigDict(arbitrary_types_allowed=True)) def __init__( self, - metrics: List[Metric] = [ - AnswerSimilarityMetric(), - AugmentationPrecisionMetric(), - AnswerConsistencyMetric(), + metrics: List[tonic_metrics.Metric] = [ + tonic_metrics.AnswerSimilarityMetric(), + tonic_metrics.AugmentationPrecisionMetric(), + tonic_metrics.AnswerConsistencyMetric(), ], model_evaluator: str = "gpt-4-turbo-preview", max_parsing_retries: int = 3, @@ -366,3 +371,12 @@ def create_response(item: BenchmarkItem) -> LLMResponse: ) return self.score_responses(responses, scoring_parallelism) + + @staticmethod + def metric_config_to_list(config: Dict[str, Dict[str, Any]]): + metrics: List[tonic_metrics.Metric] = [] + for metric in config: + if metric not in metric_dict: + raise Exception(f"Metric {metric} not found.") + metrics.append(metric_dict[metric].from_config(config[metric])) + return metrics
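
For reference, a minimal usage sketch of the ValidateMonitorer added above. The project id, question, answer, context, metadata, and tags are placeholders, and the API key is assumed to come from the TONIC_VALIDATE_API_KEY environment variable, which the constructor falls back to when api_key is None:

    from tonic_validate import ValidateMonitorer
    from tonic_validate.metrics import (
        AnswerConsistencyMetric,
        AugmentationPrecisionMetric,
        RetrievalPrecisionMetric,
    )

    # api_key is omitted here, so the constructor falls back to the
    # TONIC_VALIDATE_API_KEY environment variable and raises if neither is set.
    monitorer = ValidateMonitorer(
        metrics=[
            RetrievalPrecisionMetric(),
            AugmentationPrecisionMetric(),
            AnswerConsistencyMetric(),
        ],
    )

    # "my-rag-project" is a placeholder project id. log() serializes each
    # metric's config, posts the interaction to
    # /projects/{project_id}/monitoring/jobs, and returns the created job's id.
    job_id = monitorer.log(
        project_id="my-rag-project",
        question="What is the capital of France?",
        answer="Paris is the capital of France.",
        context_list=["Paris is the capital and most populous city of France."],
        log_metadata={"model": "gpt-4-turbo-preview"},
        tags=["production"],
    )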
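ValidateMonitorer.check_metric_requirements() raises for any metric whose requirements include MetricRequirement.REFERENCE_ANSWER, since monitored traffic carries no benchmark answer to compare against. A sketch of pre-filtering a metric list before handing it to the monitorer; the monitorable() helper is illustrative and not part of the patch:

    from typing import List

    from tonic_validate.metrics import (
        AnswerConsistencyMetric,
        AnswerSimilarityMetric,
        RetrievalPrecisionMetric,
    )
    from tonic_validate.metrics.metric import Metric, MetricRequirement


    def monitorable(metrics: List[Metric]) -> List[Metric]:
        """Keep only metrics that can be scored without a reference answer."""
        return [
            m for m in metrics
            if MetricRequirement.REFERENCE_ANSWER not in m.requirements
        ]


    all_metrics = [
        AnswerSimilarityMetric(),   # QUESTION, REFERENCE_ANSWER, LLM_ANSWER
        AnswerConsistencyMetric(),  # LLM_ANSWER, LLM_CONTEXT
        RetrievalPrecisionMetric(), # QUESTION, LLM_CONTEXT
    ]

    # AnswerSimilarityMetric is dropped because it needs a reference answer,
    # which is exactly what check_metric_requirements rejects for monitoring.
    print([m.name for m in monitorable(all_metrics)])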
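Every built-in metric now implements serialize_config() and from_config(), and the new ValidateScorer.metric_config_to_list() turns a {class name: config} mapping back into metric instances; this is the same shape ValidateMonitorer.log() sends as "metrics_config". A small round-trip sketch, with an arbitrary target_time value:

    from tonic_validate import ValidateScorer
    from tonic_validate.metrics import AnswerConsistencyBinaryMetric, LatencyMetric

    metrics = [AnswerConsistencyBinaryMetric(), LatencyMetric(target_time=2.0)]

    # Keyed by class name, mirroring how ValidateMonitorer.log() builds
    # "metrics_config" before posting it.
    config = {m.__class__.__name__: m.serialize_config() for m in metrics}
    # -> {"AnswerConsistencyBinaryMetric": {}, "LatencyMetric": {"target_time": 2.0}}

    # Rebuild Metric instances from the config; unknown class names raise.
    rebuilt = ValidateScorer.metric_config_to_list(config)

One design note: AnswerContainsPiiMetric and ContextContainsPiiMetric serialize textual_api_key in plain text, so a serialized metrics config that includes them should be handled as a secret.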
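With these changes, a custom Metric subclass has four obligations: name, requirements, serialize_config()/from_config(), and score(). The sketch below follows the pattern of the built-in metrics; ResponseWordCountMetric and its max_words parameter are invented for illustration and are not part of the patch:

    from typing import Any, Dict, Union

    from tonic_validate.classes.llm_response import LLMResponse
    from tonic_validate.metrics.metric import Metric, MetricRequirement
    from tonic_validate.services.litellm_service import LiteLLMService
    from tonic_validate.services.openai_service import OpenAIService


    class ResponseWordCountMetric(Metric):
        """Illustrative metric: scores 1.0 if the answer stays under a word limit."""

        name: str = "response_word_count"
        requirements = {MetricRequirement.LLM_ANSWER}

        def __init__(self, max_words: int = 200):
            self.max_words = max_words

        def serialize_config(self) -> Dict[str, Any]:
            return {"max_words": self.max_words}

        @staticmethod
        def from_config(config: Dict[str, Any]) -> Metric:
            return ResponseWordCountMetric(max_words=config["max_words"])

        # async for consistency with the other metrics, even though no LLM
        # call is made here.
        async def score(
            self,
            llm_response: LLMResponse,
            llm_service: Union[LiteLLMService, OpenAIService],
        ) -> float:
            return 1.0 if len(llm_response.llm_answer.split()) <= self.max_words else 0.0

Note that ValidateScorer.metric_config_to_list() only resolves class names exported from tonic_validate.metrics, so a custom metric like this serializes fine but will not be rebuilt from a config unless it is registered there. Likewise, a BinaryMetric built from an ad-hoc callback raises NotImplementedError from serialize_config(), so ValidateMonitorer.log() will fail when it tries to serialize one.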