
LiteLLM Integration #144

Merged · 16 commits · Apr 23, 2024
28 changes: 24 additions & 4 deletions README.md
@@ -299,6 +299,21 @@ os.environ["AZURE_OPENAI_KEY"] = "put-your-azure-openai-api-key-here"
os.environ["AZURE_OPENAI_ENDPOINT"] = "put-your-azure-endpoint-here"
```

#### Using Gemini
If you already have `GEMINI_API_KEY` set in your system's environment variables, you can skip this step. Otherwise, set the environment variable before proceeding.
```python
import os
os.environ["GEMINI_API_KEY"] = "put-your-gemini-api-key-here"
```
_Note that to use Gemini, your Python version must be 3.9 or higher._

#### Using Claude
If you already have `ANTHROPIC_API_KEY` set in your system's environment variables, you can skip this step. Otherwise, set the environment variable before proceeding.
```python
import os
os.environ["ANTHROPIC_API_KEY"] = "put-your-anthropic-api-key-here"
```


#### Setting up the Tonic Validate Scorer
To use metrics, instantiate an instance of ValidateScorer.
@@ -319,6 +334,13 @@ scorer = ValidateScorer([
], model_evaluator="gpt-3.5-turbo")
```

You can also use other models, such as Google Gemini or Claude, by setting the `model_evaluator` argument to the model name, like so:
```python
scorer = ValidateScorer(model_evaluator="gemini/gemini-1.5-pro-latest")
scorer = ValidateScorer(model_evaluator="claude-3")
```
If an error occurs while scoring an item's metric, the score for that metric is set to `None`. If you would rather have Tonic Validate throw an exception when a scoring error occurs, set `fail_on_error` to `True` in the constructor:

```python
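# Hedged sketch: the collapsed diff hides the original snippet here, but per the
# prose above, fail_on_error is presumably passed straight to the constructor.
scorer = ValidateScorer(fail_on_error=True)
```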
@@ -461,13 +483,11 @@ If you wish to opt out of telemetry, you only need to set the `TONIC_VALIDATE_DO

#### What models can I use as an LLM evaluator?

We currently allow the family of chat completion models from Open AI.

This restriction makes it easy to follow the logic for the definition of the metrics in this package. It also ensures that this package does not depend on langchain, which also makes the logic of the package easier to follow.
We currently support the family of chat completion models from OpenAI, Google, Anthropic, and more. We are always looking to add more models to our evaluator. If you have a model you would like to see added, please file an issue against this repository.

We'd like to add more models as choices for the LLM evaluator without adding to the complexity of the package too much.

The default model used for scoring metrics is GPT 4 Turbo. To change the OpenAI model, pass the OpenAI model name into the `model` argument for `ValidateScorer`
The default model used for scoring metrics is GPT-4 Turbo. To change the model, pass the model name into the `model_evaluator` argument for `ValidateScorer`:

```python
scorer = ValidateScorer([
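    # Hypothetical continuation: the collapsed diff hides the rest of this snippet,
    # so the metrics and model name below only mirror the earlier example.
    AnswerConsistencyMetric(),
    AugmentationAccuracyMetric()
], model_evaluator="gpt-3.5-turbo")
```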
212 changes: 212 additions & 0 deletions examples/quickstart_gemini.ipynb

Large diffs are not rendered by default.

1,477 changes: 1,279 additions & 198 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "tonic-validate"
version = "4.0.6"
version = "4.0.7"
description = "RAG evaluation metrics."
authors = ["Joe Ferrara <joeferrara@tonic.ai>", "Ethan Philpott <ephilpott@tonic.ai>", "Adam Kamor <adam@tonic.ai>"]
readme = "README.md"
@@ -14,6 +14,8 @@ python-dotenv = "^1.0.1"
tqdm = "^4.66.2"
pydantic = "^2.6.4"
typing-extensions = "^4.10.0"
litellm = "^1.35.8"
google-generativeai = { version = "^0.5.2", python = ">=3.9" }

[tool.poetry.group.dev.dependencies]
sphinx = "^7.0.0"
6 changes: 4 additions & 2 deletions tonic_validate/metrics/answer_consistency_binary_metric.py
@@ -1,8 +1,10 @@
import logging
from typing import Union
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.binary_metric import BinaryMetric
from tonic_validate.utils.metrics_util import parse_boolean_response
from tonic_validate.services.openai_service import OpenAIService
from tonic_validate.services.litellm_service import LiteLLMService
from tonic_validate.utils.llm_calls import answer_consistent_with_context_call, context_consistency_prompt

logger = logging.getLogger()
@@ -20,7 +22,7 @@ def __init__(self):
super().__init__(self.name, self.metric_callback)

async def metric_callback(
self, llm_response: LLMResponse, openai_service: OpenAIService
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
) -> bool:
"""Check if answer is consistent with context.

@@ -37,6 +39,6 @@ async def metric_callback(
True if answer is consistent with context, False otherwise.
"""
hallucination_response = await answer_consistent_with_context_call(
llm_response.llm_answer, llm_response.llm_context_list, openai_service
llm_response.llm_answer, llm_response.llm_context_list, llm_service
)
return parse_boolean_response(hallucination_response)
8 changes: 5 additions & 3 deletions tonic_validate/metrics/answer_consistency_metric.py
@@ -1,11 +1,13 @@
import logging
from typing import Union
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.metric import Metric
from tonic_validate.utils.metrics_util import (
parse_boolean_response,
parse_bullet_list_response,
)
from tonic_validate.services.openai_service import OpenAIService
from tonic_validate.services.litellm_service import LiteLLMService
from tonic_validate.utils.llm_calls import (
main_points_call,
statement_derived_from_context_call,
@@ -34,17 +36,17 @@ def __init__(self):
pass

async def score(
self, llm_response: LLMResponse, openai_service: OpenAIService
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
) -> float:
main_points_response = await main_points_call(
llm_response.llm_answer, openai_service
llm_response.llm_answer, llm_service
)
main_point_list = parse_bullet_list_response(main_points_response)
main_point_derived_from_context_list = []
for main_point in main_point_list:
statement_derived_from_context_response = (
await statement_derived_from_context_call(
main_point, llm_response.llm_context_list, openai_service
main_point, llm_response.llm_context_list, llm_service
)
)
main_point_derived_from_context_list.append(
5 changes: 3 additions & 2 deletions tonic_validate/metrics/answer_contains_pii_metric.py
@@ -1,9 +1,10 @@
import os
import requests
from typing import List, Optional
from typing import List, Optional, Union
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.binary_metric import BinaryMetric
from tonic_validate.services.openai_service import OpenAIService
from tonic_validate.services.litellm_service import LiteLLMService


class AnswerContainsPiiMetric(BinaryMetric):
@@ -40,7 +41,7 @@ def __init__(self, pii_types: List[str], textual_api_key: Optional[str] = None):
super().__init__("answer_contains_pii", self.metric_callback)

def metric_callback(
self, llm_response: LLMResponse, openai_service: OpenAIService
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
) -> bool:
try:
response = self.textual.redact(llm_response.llm_answer)
4 changes: 3 additions & 1 deletion tonic_validate/metrics/answer_match_metric.py
@@ -1,8 +1,10 @@
import logging

from typing import Union
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.binary_metric import BinaryMetric
from tonic_validate.services.openai_service import OpenAIService
from tonic_validate.services.litellm_service import LiteLLMService

logger = logging.getLogger()

@@ -27,7 +29,7 @@ def __init__(self, name: str, answer: str, case_sensitive: bool = False):
self.case_sensitive = case_sensitive

def metric_callback(
self, llm_response: LLMResponse, openai_service: OpenAIService
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
) -> bool:
if self.case_sensitive:
return self.answer == llm_response.llm_answer
6 changes: 4 additions & 2 deletions tonic_validate/metrics/answer_similarity_metric.py
@@ -1,7 +1,9 @@
import logging
from typing import Union
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.metric import Metric
from tonic_validate.services.openai_service import OpenAIService
from tonic_validate.services.litellm_service import LiteLLMService
from tonic_validate.utils.llm_calls import similarity_score_call, similarity_score_prompt

logger = logging.getLogger()
@@ -19,7 +21,7 @@ def __init__(self) -> None:
pass

async def score(
self, llm_response: LLMResponse, openai_service: OpenAIService
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
) -> float:
# Check that the benchmark item has an answer
if llm_response.benchmark_item.answer is None:
@@ -29,7 +31,7 @@ async def score(
llm_response.benchmark_item.question,
llm_response.benchmark_item.answer,
llm_response.llm_answer,
openai_service,
llm_service,
)
try:
similarity_score = float(similarity_score_response)
11 changes: 6 additions & 5 deletions tonic_validate/metrics/augmentation_accuracy_metric.py
@@ -1,9 +1,10 @@
import logging
from typing import List, Tuple
from typing import List, Tuple, Union
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.metric import Metric
from tonic_validate.utils.metrics_util import parse_boolean_response
from tonic_validate.services.openai_service import OpenAIService
from tonic_validate.services.litellm_service import LiteLLMService
from tonic_validate.utils.llm_calls import answer_contains_context_call, answer_contains_context_prompt

logger = logging.getLogger()
@@ -21,12 +22,12 @@ def __init__(self):
pass

async def score(
self, llm_response: LLMResponse, openai_service: OpenAIService
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
) -> float:
return (await self.calculate_metric(llm_response, openai_service))[0]
return (await self.calculate_metric(llm_response, llm_service))[0]

async def calculate_metric(
self, llm_response: LLMResponse, openai_service: OpenAIService
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
) -> Tuple[float, List[bool]]:
contains_context_list: List[bool] = []
if len(llm_response.llm_context_list) == 0:
Expand All @@ -35,7 +36,7 @@ async def calculate_metric(
)
for context in llm_response.llm_context_list:
contains_context_response = await answer_contains_context_call(
llm_response.llm_answer, context, openai_service
llm_response.llm_answer, context, llm_service
)
contains_context_list.append(
parse_boolean_response(contains_context_response)
9 changes: 5 additions & 4 deletions tonic_validate/metrics/augmentation_precision_metric.py
@@ -1,12 +1,13 @@
import logging
from typing import List
from typing import List, Union
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.augmentation_accuracy_metric import (
AugmentationAccuracyMetric,
)
from tonic_validate.metrics.metric import Metric
from tonic_validate.metrics.retrieval_precision_metric import RetrievalPrecisionMetric
from tonic_validate.services.openai_service import OpenAIService
from tonic_validate.services.litellm_service import LiteLLMService

logger = logging.getLogger()

@@ -23,14 +24,14 @@ def __init__(self) -> None:
self.retrieval_precision = RetrievalPrecisionMetric()

async def score(
self, llm_response: LLMResponse, openai_service: OpenAIService
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
) -> float:
retrieval_precision_score = await self.retrieval_precision.calculate_metric(
llm_response, openai_service
llm_response, llm_service
)
context_relevant_list = retrieval_precision_score[1]
augmentation_accuracy_score = await self.augmentation_accuracy.calculate_metric(
llm_response, openai_service
llm_response, llm_service
)
contains_context_list = augmentation_accuracy_score[1]

9 changes: 5 additions & 4 deletions tonic_validate/metrics/binary_metric.py
@@ -4,6 +4,7 @@
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.metric import Metric
from tonic_validate.services.openai_service import OpenAIService
from tonic_validate.services.litellm_service import LiteLLMService
import inspect

logger = logging.getLogger()
@@ -17,7 +18,7 @@ def name(self) -> str:
def __init__(
self,
name: str,
callback: Callable[[LLMResponse, OpenAIService], Union[Awaitable[bool], bool]],
callback: Callable[[LLMResponse, Union[LiteLLMService, OpenAIService]], Union[Awaitable[bool], bool]],
):
"""
Create a binary metric with a name and a callback. A binary metric returns either True (1) or False (0).
@@ -35,10 +36,10 @@ def __init__(
self.callback = callback

async def score(
self, llm_response: LLMResponse, openai_service: OpenAIService
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
) -> float:
if inspect.iscoroutinefunction(self.callback):
result = await self.callback(llm_response, openai_service)
result = await self.callback(llm_response, llm_service)
else:
result = self.callback(llm_response, openai_service)
result = self.callback(llm_response, llm_service)
return 1.0 if result else 0.0
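
The hunk above widens the `BinaryMetric` callback signature from `OpenAIService` to `Union[LiteLLMService, OpenAIService]`. As a minimal sketch only (the metric name and length check below are invented for illustration, not part of this PR), a custom binary metric written against the updated signature might look like this:

```python
from typing import Union

from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.binary_metric import BinaryMetric
from tonic_validate.services.litellm_service import LiteLLMService
from tonic_validate.services.openai_service import OpenAIService


def answer_is_short(
    llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
) -> bool:
    # The callback now receives whichever LLM service the scorer was configured
    # with (OpenAI or LiteLLM), although this simple check never calls it.
    return len(llm_response.llm_answer) < 200


# BinaryMetric.score returns 1.0 when the callback returns True, 0.0 otherwise.
short_answer_metric = BinaryMetric("answer_is_short", answer_is_short)
```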
3 changes: 2 additions & 1 deletion tonic_validate/metrics/contains_text_metric.py
@@ -4,6 +4,7 @@
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.binary_metric import BinaryMetric
from tonic_validate.services.openai_service import OpenAIService
from tonic_validate.services.litellm_service import LiteLLMService

logger = logging.getLogger()

@@ -35,7 +36,7 @@ def __init__(
self.case_sensitive = case_sensitive

def metric_callback(
self, llm_response: LLMResponse, openai_service: OpenAIService
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
) -> bool:
if isinstance(self.text, list):
return all(self.contains_text(llm_response, text) for text in self.text)
5 changes: 3 additions & 2 deletions tonic_validate/metrics/context_contains_pii_metric.py
@@ -1,9 +1,10 @@
import os
import requests
from typing import List, Optional
from typing import List, Optional, Union
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.binary_metric import BinaryMetric
from tonic_validate.services.openai_service import OpenAIService
from tonic_validate.services.litellm_service import LiteLLMService


class ContextContainsPiiMetric(BinaryMetric):
@@ -40,7 +41,7 @@ def __init__(self, pii_types: List[str], textual_api_key: Optional[str] = None):
super().__init__("context_contains_pii", self.metric_callback)

def metric_callback(
self, llm_response: LLMResponse, openai_service: OpenAIService
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
) -> bool:
try:
response = self.textual.redact("\n".join(llm_response.llm_context_list))
5 changes: 3 additions & 2 deletions tonic_validate/metrics/context_length_metric.py
@@ -1,9 +1,10 @@
import logging
from typing import Optional
from typing import Optional, Union

from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.binary_metric import BinaryMetric
from tonic_validate.services.openai_service import OpenAIService
from tonic_validate.services.litellm_service import LiteLLMService

logger = logging.getLogger()

@@ -35,7 +36,7 @@ def __init__(
self.max_length = max_length

def metric_callback(
self, llm_response: LLMResponse, openai_service: OpenAIService
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
) -> bool:
# For all items in the context list, check if the length is within the min and max length
return all(
6 changes: 4 additions & 2 deletions tonic_validate/metrics/duplication_metric.py
@@ -1,8 +1,10 @@
import logging

from typing import Union
from tonic_validate.classes.llm_response import LLMResponse
from tonic_validate.metrics.binary_metric import BinaryMetric
from tonic_validate.services.openai_service import OpenAIService
from tonic_validate.services.litellm_service import LiteLLMService
from tonic_validate.utils.llm_calls import contains_duplicate_information, contains_duplicate_info_prompt
from tonic_validate.utils.metrics_util import parse_boolean_response

@@ -21,10 +23,10 @@ def __init__(self):
super().__init__(self.name, self.metric_callback)

async def metric_callback(
self, llm_response: LLMResponse, openai_service: OpenAIService
self, llm_response: LLMResponse, llm_service: Union[LiteLLMService, OpenAIService]
) -> bool:
return parse_boolean_response(
await contains_duplicate_information(
llm_response.llm_answer, openai_service
llm_response.llm_answer, llm_service
)
)