Added monitoring #167

Merged: 1 commit, May 8, 2024
3 changes: 3 additions & 0 deletions tonic_validate/__init__.py
@@ -1,5 +1,7 @@
from .validate_api import ValidateApi
from .validate_scorer import ValidateScorer
from .validate_monitorer import ValidateMonitorer

from .classes import (
Benchmark,
BenchmarkItem,
@@ -14,6 +16,7 @@
__all__ = [
"ValidateApi",
"ValidateScorer",
"ValidateMonitorer",
"Benchmark",
"BenchmarkItem",
"LLMResponse",
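With this change the monitorer is exported from the package root alongside the existing entry points. A minimal sketch of the resulting import, assuming nothing beyond the names added to __all__ above:

# ValidateMonitorer now sits next to the scorer and the API client at the top level.
from tonic_validate import ValidateApi, ValidateScorer, ValidateMonitorer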
4 changes: 3 additions & 1 deletion tonic_validate/metrics/__init__.py
@@ -15,6 +15,7 @@
from .latency_metric import LatencyMetric
from .context_contains_pii_metric import ContextContainsPiiMetric
from .answer_contains_pii_metric import AnswerContainsPiiMetric
from .metric import Metric

__all__ = [
"AnswerConsistencyBinaryMetric",
@@ -33,5 +34,6 @@
"HateSpeechContentMetric",
"LatencyMetric",
"ContextContainsPiiMetric",
"AnswerContainsPiiMetric"
"AnswerContainsPiiMetric",
"Metric",
]
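Re-exporting Metric from tonic_validate.metrics makes the base class easy to reach for type hints and custom metrics. The sketch below is illustrative only: it assumes Metric requires no members beyond those visible in this diff (name, requirements, the async score method, and the serialize_config/from_config pair), and AnswerLengthMetric together with its max_chars parameter are invented names.

from typing import Any, Dict, Union

from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics import Metric
from tonic_validate.metrics.metric import MetricRequirement
from tonic_validate.services.litellm_service import LiteLLMService
from tonic_validate.services.openai_service import OpenAIService


class AnswerLengthMetric(Metric):
    # Hypothetical metric: passes when the answer stays under a character budget.
    name: str = "answer_length"
    requirements = {MetricRequirement.LLM_ANSWER}

    def __init__(self, max_chars: int = 500):
        self.max_chars = max_chars

    def serialize_config(self):
        return {"max_chars": self.max_chars}

    @staticmethod
    def from_config(config: Dict[str, Any]) -> Metric:
        return AnswerLengthMetric(max_chars=config["max_chars"])

    async def score(
        self,
        llm_response: LLMResponse,
        llm_service: Union[LiteLLMService, OpenAIService],
    ) -> float:
        return 1.0 if len(llm_response.llm_answer) <= self.max_chars else 0.0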
20 changes: 17 additions & 3 deletions tonic_validate/metrics/answer_consistency_binary_metric.py
@@ -1,18 +1,23 @@
import logging
from typing import Union
from typing import Any, Dict, Union
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.binary_metric import BinaryMetric
from tonic_validate.metrics.metric import Metric, MetricRequirement
from tonic_validate.utils.metrics_util import parse_boolean_response
from tonic_validate.services.openai_service import OpenAIService
from tonic_validate.services.litellm_service import LiteLLMService
from tonic_validate.utils.llm_calls import answer_consistent_with_context_call, context_consistency_prompt
from tonic_validate.utils.llm_calls import (
answer_consistent_with_context_call,
context_consistency_prompt,
)

logger = logging.getLogger()


class AnswerConsistencyBinaryMetric(BinaryMetric):
name: str = "answer_consistency_binary"
prompt: str = context_consistency_prompt()
requirements = {MetricRequirement.LLM_ANSWER, MetricRequirement.LLM_CONTEXT}

def __init__(self):
"""
@@ -21,8 +26,17 @@ def __init__(self):
"""
super().__init__(self.name, self.metric_callback)

def serialize_config(self):
return {}

@staticmethod
def from_config(config: Dict[str, Any]) -> Metric:
return AnswerConsistencyBinaryMetric()

async def metric_callback(
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
self,
llm_response: LLMResponse,
llm_service: Union[LiteLLMService, OpenAIService],
) -> bool:
"""Check if answer is consistent with context.

16 changes: 13 additions & 3 deletions tonic_validate/metrics/answer_consistency_metric.py
@@ -1,7 +1,7 @@
import logging
from typing import Union
from typing import Any, Dict, Union
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.metric import Metric
from tonic_validate.metrics.metric import Metric, MetricRequirement
from tonic_validate.utils.metrics_util import (
parse_boolean_response,
parse_bullet_list_response,
@@ -27,6 +27,7 @@ class AnswerConsistencyMetric(Metric):
f"{statement_derived_from_context_prompt(statement='EXAMPLE STATEMENT', context_list=[])}\n"
"-------------------\n"
)
requirements = {MetricRequirement.LLM_ANSWER, MetricRequirement.LLM_CONTEXT}

def __init__(self):
"""
@@ -35,8 +36,17 @@ def __init__(self):
"""
pass

def serialize_config(self):
return {}

@staticmethod
def from_config(config: Dict[str, Any]) -> Metric:
return AnswerConsistencyMetric()

async def score(
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
self,
llm_response: LLMResponse,
llm_service: Union[LiteLLMService, OpenAIService],
) -> float:
main_points_response = await main_points_call(
llm_response.llm_answer, llm_service
19 changes: 17 additions & 2 deletions tonic_validate/metrics/answer_contains_pii_metric.py
@@ -1,13 +1,16 @@
import os
import requests
from typing import List, Optional, Union
from typing import Any, Dict, List, Optional, Union
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.binary_metric import BinaryMetric
from tonic_validate.metrics.metric import Metric, MetricRequirement
from tonic_validate.services.openai_service import OpenAIService
from tonic_validate.services.litellm_service import LiteLLMService


class AnswerContainsPiiMetric(BinaryMetric):
requirements = {MetricRequirement.LLM_ANSWER}

def __init__(self, pii_types: List[str], textual_api_key: Optional[str] = None):
"""
Checks to see if PII is contained in the RAG provided answer. The types of PII looked for are found in the pii_types list.
@@ -40,8 +43,20 @@ def __init__(self, pii_types: List[str], textual_api_key: Optional[str] = None):

super().__init__("answer_contains_pii", self.metric_callback)

def serialize_config(self):
return {"pii_types": self.pii_types, "textual_api_key": self.textual.api_key}

@staticmethod
def from_config(config: Dict[str, Any]) -> Metric:
return AnswerContainsPiiMetric(
pii_types=config["pii_types"],
textual_api_key=config["textual_api_key"],
)

def metric_callback(
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
self,
llm_response: LLMResponse,
llm_service: Union[LiteLLMService, OpenAIService],
) -> bool:
try:
response = self.textual.redact(llm_response.llm_answer)
24 changes: 22 additions & 2 deletions tonic_validate/metrics/answer_match_metric.py
@@ -1,15 +1,18 @@
import logging

from typing import Union
from typing import Any, Dict, Union
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.binary_metric import BinaryMetric
from tonic_validate.metrics.metric import Metric, MetricRequirement
from tonic_validate.services.openai_service import OpenAIService
from tonic_validate.services.litellm_service import LiteLLMService

logger = logging.getLogger()


class AnswerMatchMetric(BinaryMetric):
requirements = {MetricRequirement.LLM_ANSWER}

def __init__(self, name: str, answer: str, case_sensitive: bool = False):
"""
Create a metric that checks if the answer matches a given string.
@@ -28,8 +31,25 @@ def __init__(self, name: str, answer: str, case_sensitive: bool = False):
self.answer = answer
self.case_sensitive = case_sensitive

def serialize_config(self):
return {
"name": self.name,
"answer": self.answer,
"case_sensitive": self.case_sensitive,
}

@staticmethod
def from_config(config: Dict[str, Any]) -> Metric:
return AnswerMatchMetric(
name=config["name"],
answer=config["answer"],
case_sensitive=config["case_sensitive"],
)

def metric_callback(
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
self,
llm_response: LLMResponse,
llm_service: Union[LiteLLMService, OpenAIService],
) -> bool:
if self.case_sensitive:
return self.answer == llm_response.llm_answer
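The serialize_config/from_config pair added across the metrics gives each one a plain-dict round trip, presumably so a monitoring configuration can be stored and later reconstructed. A small sketch of that round trip, using only the constructor and config keys shown in this file; the example values are invented:

from tonic_validate.metrics.answer_match_metric import AnswerMatchMetric

# Hypothetical values; only the keys come from serialize_config above.
metric = AnswerMatchMetric(name="greeting_match", answer="Hello!", case_sensitive=True)

config = metric.serialize_config()
# {"name": "greeting_match", "answer": "Hello!", "case_sensitive": True}

restored = AnswerMatchMetric.from_config(config)
assert restored.serialize_config() == config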
25 changes: 21 additions & 4 deletions tonic_validate/metrics/answer_similarity_metric.py
@@ -1,17 +1,25 @@
import logging
from typing import Union
from typing import Any, Dict, Union
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.metric import Metric
from tonic_validate.metrics.metric import Metric, MetricRequirement
from tonic_validate.services.openai_service import OpenAIService
from tonic_validate.services.litellm_service import LiteLLMService
from tonic_validate.utils.llm_calls import similarity_score_call, similarity_score_prompt
from tonic_validate.utils.llm_calls import (
similarity_score_call,
similarity_score_prompt,
)

logger = logging.getLogger()


class AnswerSimilarityMetric(Metric):
name: str = "answer_similarity"
prompt: str = similarity_score_prompt()
requirements = {
MetricRequirement.QUESTION,
MetricRequirement.REFERENCE_ANSWER,
MetricRequirement.LLM_ANSWER,
}

def __init__(self) -> None:
"""
@@ -20,8 +28,17 @@ def __init__(self) -> None:
"""
pass

def serialize_config(self):
return {}

@staticmethod
def from_config(config: Dict[str, Any]) -> Metric:
return AnswerSimilarityMetric()

async def score(
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
self,
llm_response: LLMResponse,
llm_service: Union[LiteLLMService, OpenAIService],
) -> float:
# Check that the benchmark item has an answer
if llm_response.benchmark_item.answer is None:
25 changes: 20 additions & 5 deletions tonic_validate/metrics/augmentation_accuracy_metric.py
@@ -1,18 +1,22 @@
import logging
from typing import List, Tuple, Union
from typing import Any, Dict, List, Tuple, Union
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.metric import Metric
from tonic_validate.metrics.metric import Metric, MetricRequirement
from tonic_validate.utils.metrics_util import parse_boolean_response
from tonic_validate.services.openai_service import OpenAIService
from tonic_validate.services.litellm_service import LiteLLMService
from tonic_validate.utils.llm_calls import answer_contains_context_call, answer_contains_context_prompt
from tonic_validate.utils.llm_calls import (
answer_contains_context_call,
answer_contains_context_prompt,
)

logger = logging.getLogger()


class AugmentationAccuracyMetric(Metric):
name: str = "augmentation_accuracy"
prompt: str = answer_contains_context_prompt()
requirements = {MetricRequirement.LLM_ANSWER, MetricRequirement.LLM_CONTEXT}

def __init__(self):
"""
@@ -21,13 +25,24 @@ def __init__(self):
"""
pass

def serialize_config(self):
return {}

@staticmethod
def from_config(config: Dict[str, Any]) -> Metric:
return AugmentationAccuracyMetric()

async def score(
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
self,
llm_response: LLMResponse,
llm_service: Union[LiteLLMService, OpenAIService],
) -> float:
return (await self.calculate_metric(llm_response, llm_service))[0]

async def calculate_metric(
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
self,
llm_response: LLMResponse,
llm_service: Union[LiteLLMService, OpenAIService],
) -> Tuple[float, List[bool]]:
contains_context_list: List[bool] = []
if len(llm_response.llm_context_list) == 0:
16 changes: 14 additions & 2 deletions tonic_validate/metrics/augmentation_precision_metric.py
@@ -1,5 +1,5 @@
import logging
from typing import List, Union
from typing import Any, Dict, List, Union
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.augmentation_accuracy_metric import (
AugmentationAccuracyMetric,
@@ -14,6 +14,9 @@

class AugmentationPrecisionMetric(Metric):
name: str = "augmentation_precision"
requirements = AugmentationAccuracyMetric.requirements.union(
RetrievalPrecisionMetric.requirements
)

def __init__(self) -> None:
"""
@@ -23,8 +26,17 @@ def __init__(self) -> None:
self.augmentation_accuracy = AugmentationAccuracyMetric()
self.retrieval_precision = RetrievalPrecisionMetric()

def serialize_config(self):
return {}

@staticmethod
def from_config(config: Dict[str, Any]) -> Metric:
return AugmentationPrecisionMetric()

async def score(
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
self,
llm_response: LLMResponse,
llm_service: Union[LiteLLMService, OpenAIService],
) -> float:
retrieval_precision_score = await self.retrieval_precision.calculate_metric(
llm_response, llm_service
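The new requirements sets declare which fields of a logged response each metric needs, and a composite metric like this one simply takes the union of its components' requirements. A quick sketch of inspecting that set, using only attributes shown in these diffs:

from tonic_validate.metrics.augmentation_precision_metric import (
    AugmentationPrecisionMetric,
)
from tonic_validate.metrics.metric import MetricRequirement

metric = AugmentationPrecisionMetric()

# The union inherits LLM_ANSWER and LLM_CONTEXT from AugmentationAccuracyMetric.
assert MetricRequirement.LLM_CONTEXT in metric.requirements
assert MetricRequirement.LLM_ANSWER in metric.requirements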
19 changes: 15 additions & 4 deletions tonic_validate/metrics/binary_metric.py
@@ -1,5 +1,5 @@
import logging
from typing import Awaitable, Callable, Union
from typing import Any, Awaitable, Callable, Dict, Union

from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.metric import Metric
@@ -18,7 +18,10 @@ def name(self) -> str:
def __init__(
self,
name: str,
callback: Callable[[LLMResponse, Union[LiteLLMService, OpenAIService]], Union[Awaitable[bool], bool]],
callback: Callable[
[LLMResponse, Union[LiteLLMService, OpenAIService]],
Union[Awaitable[bool], bool],
],
):
"""
Create a binary metric with a name and a callback. A binary metric returns either True (1) or False (0).
@@ -31,12 +34,20 @@ def __init__(
The callback that takes an LLMResponse and an OpenAIService and returns a boolean.
The callback can be either an async function or a regular function.
"""

self._name = name
self.callback = callback

def serialize_config(self):
raise NotImplementedError("Cannot serialize a custom binary metric")

@staticmethod
def from_config(config: Dict[str, Any]) -> Metric:
raise NotImplementedError("Cannot deserialize a custom binary metric")

async def score(
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
self,
llm_response: LLMResponse,
llm_service: Union[LiteLLMService, OpenAIService],
) -> float:
if inspect.iscoroutinefunction(self.callback):
result = await self.callback(llm_response, llm_service)
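For BinaryMetric itself, serialize_config and from_config raise NotImplementedError, since an arbitrary callback has no dict representation; under that reading, custom binary metrics can still score responses but stay out of the serialization round trip. A sketch, with the callback and its name invented:

from typing import Union

from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.binary_metric import BinaryMetric
from tonic_validate.services.litellm_service import LiteLLMService
from tonic_validate.services.openai_service import OpenAIService


def answer_is_nonempty(
    llm_response: LLMResponse,
    llm_service: Union[LiteLLMService, OpenAIService],
) -> bool:
    # Plain (non-async) callbacks are allowed; score() only awaits coroutines.
    return bool(llm_response.llm_answer.strip())


metric = BinaryMetric("answer_is_nonempty", answer_is_nonempty)

try:
    metric.serialize_config()
except NotImplementedError:
    # Custom binary metrics cannot be persisted as a config dict.
    pass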