
Fix __call__ Overload Types #38238

Merged · 11 commits · Nov 1, 2024
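In short: the evaluators' keyword-only `__call__` overloads are corrected (required `str` keywords in place of `Optional[str]`, fixed `:rtype:` entries), the misbehaving `eval_last_turn` parameter is removed from the safety evaluators until it can be reimplemented, and the simulator constructors are annotated with `AzureAIProject` and `TokenCredential` instead of a bare `dict` and an untyped parameter.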
@@ -93,10 +93,10 @@ def __call__( # pylint: disable=docstring-missing-param
or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
turns, the evaluator will aggregate the results of each turn.

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: Optional[str]
:keyword context: The context to be evaluated.
:paramtype context: Optional[str]
:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages". Conversation turns are expected
to be dictionaries with keys "content" and "role".
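The docstring above describes the two supported call shapes. A minimal sketch of the conversation payload it expects (values illustrative; the call lines are commented out because the evaluator instance is constructed elsewhere):

```python
# Conversation payload per the docstring: a "messages" list whose turns are
# dicts with "role" and "content" keys.
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris."},
    ]
}

# Either call form works; multi-turn results are aggregated per turn:
# result = evaluator(query="...", response="...", context="...")
# result = evaluator(conversation=conversation)
```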
@@ -27,8 +27,6 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
:param azure_ai_project: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
- :param eval_last_turn: Whether to evaluate the last turn of a conversation. Default is False.
- :type eval_last_turn: bool
:param kwargs: Additional arguments to pass to the evaluator.
:type kwargs: Any
:return: A function that evaluates content-safety metrics for "question-answering" scenario.
@@ -69,8 +67,8 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
"""

# TODO address 3579092 to re-enable parallel evals.
- def __init__(self, credential, azure_ai_project, eval_last_turn: bool = False, **kwargs):
- super().__init__(eval_last_turn=eval_last_turn)
+ def __init__(self, credential, azure_ai_project, **kwargs):
+ super().__init__()
self._parallel = kwargs.pop("_parallel", False)
self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
ViolenceEvaluator(credential, azure_ai_project),
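With `eval_last_turn` removed, construction reduces to credential plus project scope. A sketch, assuming the `AzureAIProject` keys follow the subscription/resource-group/project description above (all values are placeholders):

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ContentSafetyEvaluator

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

# eval_last_turn is no longer accepted; remaining kwargs (such as the private
# _parallel flag popped in __init__ above) still flow through **kwargs.
safety_eval = ContentSafetyEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project=azure_ai_project,
)
```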
@@ -50,13 +50,11 @@ def __init__(
self,
credential,
azure_ai_project,
- eval_last_turn: bool = False,
):
super().__init__(
eval_metric=EvaluationMetrics.HATE_FAIRNESS,
azure_ai_project=azure_ai_project,
credential=credential,
- eval_last_turn=eval_last_turn,
)

@overload
@@ -50,13 +50,11 @@ def __init__(
self,
credential,
azure_ai_project,
- eval_last_turn: bool = False,
):
super().__init__(
eval_metric=EvaluationMetrics.SELF_HARM,
azure_ai_project=azure_ai_project,
credential=credential,
- eval_last_turn=eval_last_turn,
)

@overload
@@ -50,13 +50,11 @@ def __init__(
self,
credential,
azure_ai_project,
- eval_last_turn: bool = False,
):
super().__init__(
eval_metric=EvaluationMetrics.SEXUAL,
azure_ai_project=azure_ai_project,
credential=credential,
- eval_last_turn=eval_last_turn,
)

@overload
@@ -50,13 +50,11 @@ def __init__(
self,
credential,
azure_ai_project,
- eval_last_turn: bool = False,
):
super().__init__(
eval_metric=EvaluationMetrics.VIOLENCE,
azure_ai_project=azure_ai_project,
credential=credential,
- eval_last_turn=eval_last_turn,
)

@overload
@@ -55,13 +55,11 @@ def __init__(
self,
credential,
azure_ai_project,
- eval_last_turn: bool = False,
):
super().__init__(
eval_metric=_InternalEvaluationMetrics.ECI,
azure_ai_project=azure_ai_project,
credential=credential,
- eval_last_turn=eval_last_turn,
)

@overload
@@ -55,7 +55,7 @@ def __call__(
*,
response: str,
) -> Dict[str, Union[str, float]]:
"""Evaluate fluency in given query/response
"""Evaluate fluency in given response

:keyword response: The response to be evaluated.
:paramtype response: str
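A usage sketch matching the corrected response-only docstring; the `model_config` shape is an assumption (Azure OpenAI style keys), not part of this diff:

```python
from azure.ai.evaluation import FluencyEvaluator

model_config = {
    "azure_endpoint": "<endpoint>",       # assumed AzureOpenAIModelConfiguration keys
    "azure_deployment": "<deployment>",
    "api_key": "<api-key>",
}
fluency = FluencyEvaluator(model_config)

# Only the response is required; no query is involved anymore.
result = fluency(response="The capital of France is Paris.")
# result is a Dict[str, Union[str, float]], e.g. {"fluency": 4.0, ...}
```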
@@ -56,13 +56,11 @@ def __init__(
self,
credential,
azure_ai_project,
- eval_last_turn: bool = False,
):
super().__init__(
eval_metric=EvaluationMetrics.PROTECTED_MATERIAL,
azure_ai_project=azure_ai_project,
credential=credential,
- eval_last_turn=eval_last_turn,
)

@overload
@@ -180,14 +180,14 @@ def __call__(
self,
*,
conversation: Conversation,
- ) -> Dict[str, Union[float, Dict[str, List[float]]]]:
+ ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
"""Evaluates retrieval for a for a multi-turn evaluation. If the conversation has more than one turn,
the evaluator will aggregate the results of each turn.

:keyword conversation: The conversation to be evaluated.
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The scores for Chat scenario.
- :rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+ :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
"""

def __call__(self, *args, **kwargs): # pylint: disable=docstring-missing-param
@@ -202,7 +202,7 @@ def __call__(self, *args, **kwargs): # pylint: disable=docstring-missing-param
:keyword conversation: The conversation to be evaluated.
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The scores for Chat scenario.
- :rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+ :rtype: :rtype: Dict[str, Union[float, Dict[str, List[str, float]]]]
"""
query = kwargs.pop("query", None)
context = kwargs.pop("context", None)
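The widened annotation says per-turn lists may now hold strings (for example, per-turn reasons) alongside floats. A dict literal typed against the corrected annotation; the key names and values here are illustrative, not from the diff:

```python
from typing import Dict, List, Union

example_result: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]] = {
    "retrieval": 4.0,                                        # aggregate across turns
    "evaluation_per_turn": {
        "retrieval": [5.0, 3.0],                             # per-turn scores
        "retrieval_reason": ["grounded", "partially grounded"],  # per-turn strings
    },
}
```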
@@ -1,7 +1,7 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
- from typing import List, Optional, Union, Dict
+ from typing import List, Union, Dict
from typing_extensions import overload, override

from azure.ai.evaluation._common._experimental import experimental
@@ -105,18 +105,18 @@ def __init__(
def __call__(
self,
*,
- query: Optional[str] = None,
- response: Optional[str] = None,
- context: Optional[str] = None,
+ response: str,
+ context: str,
+ query: str,
) -> Dict[str, Union[str, bool]]:
"""Evaluate groundedness for a given query/response/context

- :keyword query: The query to be evaluated.
- :paramtype query: Optional[str]
:keyword response: The response to be evaluated.
- :paramtype response: Optional[str]
+ :paramtype response: str
:keyword context: The context to be evaluated.
- :paramtype context: Optional[str]
+ :paramtype context: str
+ :keyword query: The query to be evaluated.
+ :paramtype query: Optional[str]
:return: The relevance score.
:rtype: Dict[str, Union[str, bool]]
"""
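A stub mirroring the corrected overload, to make the type change concrete: every parameter is now keyword-only and a required `str` (previously `Optional[str] = None`). The function name is illustrative:

```python
from typing import Dict, Union

def call(*, response: str, context: str, query: str) -> Dict[str, Union[str, bool]]:
    """Signature illustration only; the real evaluator implements this shape."""
    raise NotImplementedError

# call(query="Capital of France?", response="Paris.", context="France's capital is Paris.")
```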
@@ -27,9 +27,6 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
:param azure_ai_project: The scope of the Azure AI project. It contains subscription id, resource group, and project
name.
:type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
- :param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
- focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
- :type eval_last_turn: bool

**Usage**

@@ -57,13 +54,11 @@ def __init__(
self,
credential,
azure_ai_project,
- eval_last_turn: bool = False,
):
super().__init__(
eval_metric=EvaluationMetrics.XPIA,
azure_ai_project=azure_ai_project,
credential=credential,
- eval_last_turn=eval_last_turn,
)

@overload
@@ -15,6 +15,7 @@
from azure.ai.evaluation._common.utils import validate_azure_ai_project
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
from azure.ai.evaluation._http_utils import get_async_http_client
+ from azure.ai.evaluation._model_configurations import AzureAIProject
from azure.ai.evaluation.simulator import AdversarialScenario
from azure.ai.evaluation.simulator._adversarial_scenario import _UnstableAdversarialScenario
from azure.core.credentials import TokenCredential
@@ -48,7 +49,7 @@ class AdversarialSimulator:
:type credential: ~azure.core.credentials.TokenCredential
"""

- def __init__(self, *, azure_ai_project: dict, credential):
+ def __init__(self, *, azure_ai_project: AzureAIProject, credential: TokenCredential):
"""Constructor."""

try:
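`DirectAttackSimulator` and `IndirectAttackSimulator` below receive the same tightening. A construction sketch: a plain dict with matching keys satisfies the `AzureAIProject` TypedDict, though the key names here are an assumption based on the "subscription id, resource group, and project name" description above:

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation.simulator import AdversarialSimulator

simulator = AdversarialSimulator(
    azure_ai_project={
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    },
    credential=DefaultAzureCredential(),  # any TokenCredential implementation
)
```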
@@ -11,6 +11,7 @@
from azure.ai.evaluation._common.utils import validate_azure_ai_project
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
from azure.ai.evaluation.simulator import AdversarialScenario
+ from azure.ai.evaluation._model_configurations import AzureAIProject
from azure.core.credentials import TokenCredential

from ._adversarial_simulator import AdversarialSimulator
@@ -32,7 +33,7 @@ class DirectAttackSimulator:
:type credential: ~azure.core.credentials.TokenCredential
"""

- def __init__(self, *, azure_ai_project: dict, credential):
+ def __init__(self, *, azure_ai_project: AzureAIProject, credential: TokenCredential):
"""Constructor."""

try:
@@ -13,6 +13,7 @@
from azure.ai.evaluation._common._experimental import experimental
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
from azure.ai.evaluation.simulator import AdversarialScenarioJailbreak, SupportedLanguages
+ from azure.ai.evaluation._model_configurations import AzureAIProject
from azure.core.credentials import TokenCredential

from ._adversarial_simulator import AdversarialSimulator, JsonLineList
@@ -34,7 +35,7 @@ class IndirectAttackSimulator(AdversarialSimulator):
:type credential: ~azure.core.credentials.TokenCredential
"""

- def __init__(self, *, azure_ai_project: dict, credential):
+ def __init__(self, *, azure_ai_project: AzureAIProject, credential: TokenCredential):
"""Constructor."""

try:
@@ -285,12 +285,13 @@ def test_content_safety_evaluator_self_harm(self, project_scope, azure_cred, sim
assert score2["evaluation_per_turn"]["self_harm_score"][1] > 0
assert all(score2["evaluation_per_turn"]["self_harm_reason"]), "self_harm_reason must not be None or empty."

- # Test eval_last_turn behavior
- eval_fn_last = SelfHarmEvaluator(azure_cred, project_scope, eval_last_turn=True)
- score3 = eval_fn_last(conversation=simple_conversation)
- assert score3["self_harm"] in (level.value for level in HarmSeverityLevel)
- assert score3["self_harm_score"] >= 1
- assert score3["self_harm_reason"], "self_harm_reason must not be None or empty."
+ # removed eval_last_turn from evaluators since behavior is incorrect. We can re-enable when
+ # it's added back to evaluators
+ # eval_fn_last = SelfHarmEvaluator(azure_cred, project_scope, eval_last_turn=True)
+ # score3 = eval_fn_last(conversation=simple_conversation)
+ # assert score3["self_harm"] == "Very low"
+ # assert score3["self_harm_score"] >= 1
+ # assert score3["self_harm_reason"], "self_harm_reason must not be None or empty."

def test_content_safety_evaluator_hate_unfairness(self, project_scope, azure_cred, simple_conversation):
eval_fn = HateUnfairnessEvaluator(azure_cred, project_scope)
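For reference, a sketch of the result shape the surviving assertions exercise; the keys come from the asserts above, the values are made up:

```python
score2 = {
    "self_harm": "Very low",                 # severity label
    "self_harm_score": 1.0,                  # aggregate score
    "self_harm_reason": "No self-harm content detected.",
    "evaluation_per_turn": {
        "self_harm_score": [0.0, 2.0],       # one entry per conversation turn
        "self_harm_reason": ["...", "..."],
    },
}
```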