diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
index fb8169e0729f..79f0ff3a2102 100644
--- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -7,6 +7,7 @@
 
 ### Breaking Changes
 
 - Removed `numpy` dependency. All NaN values returned by the SDK have been changed from `numpy.nan` to `math.nan`.
+- `credential` is now a required argument for all content safety evaluators and `ProtectedMaterialsEvaluator`. `DefaultAzureCredential` is no longer used as a fallback when no credential is passed.
 
 ### Bugs Fixed
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py
index 4896c30a755b..fb84db1a2cba 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py
@@ -16,7 +16,6 @@
 from azure.ai.evaluation._http_utils import get_async_http_client
 from azure.ai.evaluation._model_configurations import AzureAIProject
 from azure.core.credentials import TokenCredential
-from azure.identity import DefaultAzureCredential
 
 from .constants import (
     CommonConstants,
@@ -434,10 +433,6 @@ async def evaluate_with_rai_service(
     :return: The parsed annotation result.
     :rtype: List[List[Dict]]
     """
-    # Use DefaultAzureCredential if no credential is provided
-    # This is for the for batch run scenario as the credential cannot be serialized by promoptflow
-    if credential is None or credential == {}:
-        credential = DefaultAzureCredential()
 
     # Get RAI service URL from discovery service and check service availability
     token = await fetch_or_reuse_token(credential)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
index 7bbb82240c8a..4267c05abb7d 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
@@ -5,7 +5,7 @@
 from typing import Dict, Optional
 from typing_extensions import override
 
-from azure.identity import DefaultAzureCredential
+from azure.core.credentials import TokenCredential
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
 from azure.ai.evaluation._exceptions import EvaluationException
@@ -17,14 +17,14 @@ class RaiServiceEvaluatorBase(EvaluatorBase):
     This includes content safety evaluators, protected material evaluators, and others.
     These evaluators are all assumed to be of the "query and response or conversation" input variety.
 
-    param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
-    to specify which evaluation to perform.
-    type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
-    param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
+    :param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
+    to specify which evaluation to perform.
+    :type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
+    :param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
     aggregation will be performed. If False, all turns will be evaluated and the numeric results will be
     aggregated. Per-turn results are still available in the output via the "evaluation_per_turn" key
     when this occurs. Default is False, resulting in full conversation evaluation and aggregation.
-    type eval_last_turn: bool
+    :type eval_last_turn: bool
     """
 
     @override
@@ -32,17 +32,13 @@ def __init__(
         self,
         eval_metric: EvaluationMetrics,
         azure_ai_project: dict,
-        credential: Optional[dict] = None,
+        credential: TokenCredential,
         eval_last_turn: bool = False,
     ):
         super().__init__(eval_last_turn=eval_last_turn)
         self._eval_metric = eval_metric
         self._azure_ai_project = azure_ai_project
-        if credential is None:
-            # Use DefaultCredential if no credential is provided
-            self._credential = DefaultAzureCredential()
-        else:
-            self._credential = credential
+        self._credential = credential
 
     @override
     def __call__(
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
index 4310122f2951..c1d4d2c1bc94 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
@@ -21,13 +21,13 @@ class ContentSafetyEvaluator:
     """
     Initialize a content safety evaluator configured to evaluate content safety metrics for the QA scenario.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
     :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
         Default is True.
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.
     :rtype: Callable
@@ -66,13 +66,13 @@ class ContentSafetyEvaluator:
     }
     """
 
-    def __init__(self, azure_ai_project: dict, parallel: bool = True, credential=None):
+    def __init__(self, credential, azure_ai_project: dict, parallel: bool = True):
         self._parallel = parallel
         self._evaluators = [
-            ViolenceEvaluator(azure_ai_project, credential),
-            SexualEvaluator(azure_ai_project, credential),
-            SelfHarmEvaluator(azure_ai_project, credential),
-            HateUnfairnessEvaluator(azure_ai_project, credential),
+            ViolenceEvaluator(credential, azure_ai_project),
+            SexualEvaluator(credential, azure_ai_project),
+            SelfHarmEvaluator(credential, azure_ai_project),
+            HateUnfairnessEvaluator(credential, azure_ai_project),
         ]
 
     def __call__(self, *, query: str, response: str, **kwargs):
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py
index a17a0a4cdcfa..b4587ce01af3 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py
@@ -29,6 +29,8 @@ class ContentSafetyChatEvaluator:
     """
     Initialize a content safety chat evaluator configured to evaluate content safety metrics for the chat scenario.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
@@ -38,8 +40,6 @@ class ContentSafetyChatEvaluator:
     :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
         Default is True.
     :type parallel: bool
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential
     :return: A function that evaluates and generates metrics for "chat" scenario.
     :rtype: Callable
 
@@ -88,7 +88,7 @@ class ContentSafetyChatEvaluator:
     }
     """
 
-    def __init__(self, azure_ai_project: dict, eval_last_turn: bool = False, parallel: bool = True, credential=None):
+    def __init__(self, credential, azure_ai_project: dict, eval_last_turn: bool = False, parallel: bool = True):
         self._eval_last_turn = eval_last_turn
         self._parallel = parallel
         self._evaluators = [
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py
index b3c78fd6842a..cda53fa057a7 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py
@@ -1,7 +1,6 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Optional
 from typing_extensions import override
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
@@ -11,11 +10,11 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize a hate-unfairness evaluator for hate unfairness score.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: Optional[~azure.core.credentials.TokenCredential]
 
     **Usage**
 
@@ -43,8 +42,8 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase):
     @override
     def __init__(
         self,
+        credential,
         azure_ai_project: dict,
-        credential: Optional[dict] = None,
         eval_last_turn: bool = False,
     ):
         super().__init__(
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py
index 8854e2eb73ca..2948c49e84e1 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py
@@ -1,7 +1,6 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Optional
 from typing_extensions import override
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
@@ -11,11 +10,11 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize a self harm evaluator for self harm score.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: Optional[~azure.core.credentials.TokenCredential]
 
     **Usage**
 
@@ -43,8 +42,8 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase):
     @override
     def __init__(
         self,
+        credential,
         azure_ai_project: dict,
-        credential: Optional[dict] = None,
         eval_last_turn: bool = False,
     ):
         super().__init__(
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py
index 0d0fd973b549..a752a49b3c52 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py
@@ -1,7 +1,6 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Optional
 from typing_extensions import override
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
@@ -11,11 +10,11 @@ class SexualEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize a sexual evaluator for sexual score.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: Optional[~azure.core.credentials.TokenCredential]
 
     **Usage**
 
@@ -43,8 +42,8 @@ class SexualEvaluator(RaiServiceEvaluatorBase):
     @override
     def __init__(
         self,
+        credential,
         azure_ai_project: dict,
-        credential: Optional[dict] = None,
         eval_last_turn: bool = False,
     ):
         super().__init__(
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py
index 59729fcedef0..606c256750d9 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py
@@ -1,7 +1,6 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Optional
 from typing_extensions import override
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
@@ -11,11 +10,11 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize a violence evaluator for violence score.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: Optional[~azure.core.credentials.TokenCredential]
 
     **Usage**
 
@@ -43,8 +42,8 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase):
     @override
     def __init__(
         self,
+        credential,
         azure_ai_project: dict,
-        credential: Optional[dict] = None,
         eval_last_turn: bool = False,
     ):
         super().__init__(
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py
index eb314a239072..59e3f616fbb0 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py
@@ -1,7 +1,6 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Optional
 from typing_extensions import override
 from azure.ai.evaluation._common.constants import _InternalEvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
@@ -17,11 +16,11 @@ class ECIEvaluator(RaiServiceEvaluatorBase):
     "AI-generated content may be incorrect. If you are seeking ECI-related information, please go to Bing Search."
     Outputs True or False with AI-generated reasoning.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: Optional[~azure.core.credentials.TokenCredential]
     :return: Whether or not ECI was found in the response without a disclaimer, with AI-generated reasoning
     :rtype: Dict[str, str]
 
@@ -50,8 +49,8 @@ class ECIEvaluator(RaiServiceEvaluatorBase):
     @override
     def __init__(
         self,
+        credential,
         azure_ai_project: dict,
-        credential: Optional[dict] = None,
         eval_last_turn: bool = False,
     ):
         super().__init__(
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py
index 7a7ed46b13f3..6035e5bc67c9 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py
@@ -1,7 +1,6 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Optional
 from typing_extensions import override
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
@@ -12,11 +11,11 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase):
     Initialize a protected material evaluator to detect whether protected material
     is present in your AI system's response. Outputs True or False with AI-generated reasoning.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: Optional[~azure.core.credentials.TokenCredential]
     :return: Whether or not protected material was found in the response, with AI-generated reasoning.
     :rtype: Dict[str, str]
 
@@ -45,8 +44,8 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase):
     @override
     def __init__(
         self,
+        credential,
         azure_ai_project: dict,
-        credential: Optional[dict] = None,
         eval_last_turn: bool = False,
     ):
         super().__init__(
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py
index 4e3dd48744a3..fe4f9c7cfd84 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py
@@ -54,11 +54,11 @@ class ProtectedMaterialsEvaluator:
     Initialize a protected materials evaluator to detect whether protected material
     is present in your AI system's response. Outputs True or False with AI-generated reasoning.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential
     :return: Whether or not protected material was found in the response, with AI-generated reasoning.
     :rtype: Dict[str, str]
 
@@ -84,7 +84,7 @@ class ProtectedMaterialsEvaluator:
     }
     """
 
-    def __init__(self, azure_ai_project: dict, credential=None):
+    def __init__(self, credential, azure_ai_project: dict):
         self._async_evaluator = _AsyncProtectedMaterialsEvaluator(azure_ai_project, credential)
 
     def __call__(self, *, query: str, response: str, **kwargs):
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py
index 2a36d21e5158..ea5a12868f04 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py
@@ -3,7 +3,6 @@
 # ---------------------------------------------------------
 import logging
 
-from typing import Optional
 from typing_extensions import override
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
@@ -17,14 +16,14 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase):
     Detect whether cross domain injected attacks are present in your AI system's response.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
     :param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
         focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
     :type eval_last_turn: bool
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: Optional[~azure.core.credentials.TokenCredential]
     :return: A function that evaluates and generates metrics for XPIA chat scenario.
     Metrics include the overall evaluation label and reason for the Q/A Pair, as well as sub-labels
     for manipulated content, intrusion, and information.
@@ -53,8 +52,8 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase):
     @override
     def __init__(
         self,
+        credential,
         azure_ai_project: dict,
-        credential: Optional[dict] = None,
         eval_last_turn: bool = False,
     ):
         super().__init__(
diff --git a/sdk/evaluation/azure-ai-evaluation/setup.py b/sdk/evaluation/azure-ai-evaluation/setup.py
index b878ee5998d6..0847ff315fea 100644
--- a/sdk/evaluation/azure-ai-evaluation/setup.py
+++ b/sdk/evaluation/azure-ai-evaluation/setup.py
@@ -68,7 +68,8 @@
         "promptflow-devkit>=1.15.0",
         "promptflow-core>=1.15.0",
         "pyjwt>=2.8.0",
-        "azure-identity>=1.12.0",
+        # pickle support for credentials was added in this release
+        "azure-identity>=1.16.0",
         "azure-core>=1.30.2",
         "nltk>=3.9.1",
         "rouge-score>=0.1.2",
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py
index 2c4a63ed136a..c4f2b286ae9c 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py
@@ -171,7 +171,7 @@ def test_quality_evaluator_prompt_based_with_dict_input(self, model_config):
         assert score["gpt_fluency"] > 0.0
 
     def test_content_safety_evaluator_violence(self, project_scope, azure_cred, simple_conversation):
-        eval_fn = ViolenceEvaluator(project_scope, azure_cred)
+        eval_fn = ViolenceEvaluator(azure_cred, project_scope)
         score = eval_fn(
             query="What is the capital of Japan?",
             response="The capital of Japan is Tokyo.",
@@ -194,7 +194,7 @@ def test_content_safety_evaluator_violence(self, project_scope, azure_cred, simple_conversation):
         assert all(score2["evaluation_per_turn"]["violence_reason"]), "violence_reason must not be None or empty."
 
     def test_content_safety_evaluator_sexual(self, project_scope, azure_cred, simple_conversation):
-        eval_fn = SexualEvaluator(project_scope, azure_cred)
+        eval_fn = SexualEvaluator(azure_cred, project_scope)
         score = eval_fn(
             query="What is the capital of Japan?",
             response="The capital of Japan is Tokyo.",
@@ -213,7 +213,7 @@ def test_content_safety_evaluator_sexual(self, project_scope, azure_cred, simple_conversation):
         assert all(score2["evaluation_per_turn"]["sexual_reason"]), "sexual_reason must not be None or empty."
 
     def test_content_safety_evaluator_self_harm(self, project_scope, azure_cred, simple_conversation):
-        eval_fn = SelfHarmEvaluator(project_scope, azure_cred)
+        eval_fn = SelfHarmEvaluator(azure_cred, project_scope)
         score = eval_fn(
             query="What is the capital of Japan?",
             response="The capital of Japan is Tokyo.",
@@ -235,14 +235,14 @@ def test_content_safety_evaluator_self_harm(self, project_scope, azure_cred, simple_conversation):
         assert all(score2["evaluation_per_turn"]["self_harm_reason"]), "self_harm_reason must not be None or empty."
 
         # Test eval_last_turn behavior
-        eval_fn_last = SelfHarmEvaluator(project_scope, azure_cred, eval_last_turn=True)
+        eval_fn_last = SelfHarmEvaluator(azure_cred, project_scope, eval_last_turn=True)
         score3 = eval_fn_last(conversation=simple_conversation)
         assert score3["self_harm"] != "Very low"
         assert score3["self_harm_score"] > 0
         assert score3["self_harm_reason"], "self_harm_reason must not be None or empty."
 
     def test_content_safety_evaluator_hate_unfairness(self, project_scope, azure_cred, simple_conversation):
-        eval_fn = HateUnfairnessEvaluator(project_scope, azure_cred)
+        eval_fn = HateUnfairnessEvaluator(azure_cred, project_scope)
         score = eval_fn(
             query="What is the capital of Japan?",
             response="The capital of Japan is Tokyo.",
@@ -270,7 +270,7 @@ def test_content_safety_evaluator_hate_unfairness(self, project_scope, azure_cred, simple_conversation):
 
     @pytest.mark.skipif(True, reason="Content safety service IS available now!")
     def test_content_safety_service_unavailable(self, project_scope, azure_cred):
-        eval_fn = ViolenceEvaluator(project_scope, azure_cred)
+        eval_fn = ViolenceEvaluator(azure_cred, project_scope)
         # Doing this in replay mode causes a mismatch between scrubbed recordings
         # and the actual request made.
         if is_live():
@@ -337,7 +337,7 @@ def test_composite_evaluator_qa_for_nans(self, model_config):
         assert not math.isnan(score["gpt_similarity"])
 
     def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
-        safety_eval = ContentSafetyEvaluator(project_scope, parallel=False, credential=azure_cred)
+        safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, parallel=False)
         score = safety_eval(
             query="Tokyo is the capital of which country?",
             response="Japan",
@@ -358,7 +358,7 @@ def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."
 
     def test_protected_material_evaluator(self, project_scope, azure_cred, simple_conversation):
-        ip_eval = ProtectedMaterialEvaluator(project_scope, credential=azure_cred)
+        ip_eval = ProtectedMaterialEvaluator(azure_cred, project_scope)
         good_result = ip_eval(
             query="What shape has 4 equilateral sides?",
             response="Rhombus",
@@ -404,7 +404,7 @@ def test_protected_material_evaluator(self, project_scope, azure_cred, simple_conversation):
         ), "protected_material_reason must not be None or empty."
 
     def test_eci_evaluator(self, project_scope, azure_cred, simple_conversation):
-        eci_eval = ECIEvaluator(project_scope, credential=azure_cred)
+        eci_eval = ECIEvaluator(azure_cred, project_scope)
         unrelated_result = eci_eval(
             query="What shape has 4 equilateral sides?",
             response="Rhombus",
@@ -421,7 +421,7 @@
 
     def test_xpia_evaluator(self, project_scope, azure_cred, simple_conversation):
-        xpia_eval = IndirectAttackEvaluator(project_scope, credential=azure_cred)
+        xpia_eval = IndirectAttackEvaluator(azure_cred, project_scope)
         unrelated_result = xpia_eval(
             query="What shape has 4 equilateral sides?",
             response="Rhombus",
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_sim_and_eval.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_sim_and_eval.py
index c15e3f3912a7..247d7a3ea74f 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_sim_and_eval.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_sim_and_eval.py
@@ -95,7 +95,7 @@ async def callback(
         file.writelines([json.dumps({"conversation": conversation}) + "\n" for conversation in simulator_output])
 
     # Evaluate simulator output
-    violence_eval = ViolenceEvaluator(project_scope, credential=DefaultAzureCredential())
+    violence_eval = ViolenceEvaluator(DefaultAzureCredential(), project_scope)
     # run the evaluation
     eval_output = evaluate(
         data=file_name,
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate_telemetry.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate_telemetry.py
index 4e3d4ced44e2..fa88bb96d597 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate_telemetry.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate_telemetry.py
@@ -136,12 +136,13 @@ def test_evaluators_telemetry(self, mock_app_insight_logger):
 
     def test_evaluator_start_telemetry(
         self,
+        azure_cred,
         mock_app_insight_logger,
         mock_project_scope,
        mock_trace_destination_to_cloud,
         mock_validate_trace_destination,
     ):
-        hate_unfairness = HateUnfairnessEvaluator(azure_ai_project=None)
+        hate_unfairness = HateUnfairnessEvaluator(azure_cred, azure_ai_project=None)
 
         data = _get_file("evaluate_test_data.jsonl")
         evaluators = {
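Reviewer note: a minimal sketch of the new calling convention this patch introduces. The project-scope values and the top-level imports below are illustrative assumptions, not part of the patch; the dict keys follow the `AzureAIProject` shape described in the evaluator docstrings (subscription id, resource group, project name).

```python
# Sketch of the new required-credential convention from this change.
# Assumptions: evaluators are importable from the package root, and the
# AzureAIProject dict uses these key names; placeholder values are fake.
from azure.identity import DefaultAzureCredential  # >=1.16.0, per setup.py
from azure.ai.evaluation import ContentSafetyEvaluator, ViolenceEvaluator

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

# Callers must now construct and pass the credential themselves; the SDK no
# longer falls back to DefaultAzureCredential internally.
credential = DefaultAzureCredential()

# credential is the first, required positional argument in every evaluator.
violence_eval = ViolenceEvaluator(credential, azure_ai_project)
safety_eval = ContentSafetyEvaluator(credential, azure_ai_project, parallel=False)

result = violence_eval(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
)
print(result)  # per the tests: keys like violence, violence_score, violence_reason
```

This mirrors the updated call sites in the tests above, e.g. `ViolenceEvaluator(azure_cred, project_scope)`.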