diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py index e8f39ef0d761..fe8b9df6230a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py @@ -48,6 +48,13 @@ class DefaultOpenEncoding: """SDK Default Encoding when writing a file""" +class EvaluationRunProperties: + """Defines properties used to identify an evaluation run by UI""" + + RUN_TYPE = "runType" + EVALUATION_RUN = "_azureml.evaluation_run" + + DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json" CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4 diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py index c40528245b9f..d038bdb5840a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py @@ -16,6 +16,7 @@ from .._constants import ( CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, EvaluationMetrics, + EvaluationRunProperties, Prefixes, _InternalEvaluationMetrics, ) @@ -346,7 +347,7 @@ def _apply_target_to_data( flow=target, display_name=evaluation_name, data=data, - properties={"runType": "eval_run", "isEvaluatorRun": "true"}, + properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"}, stream=True, name=_run_name, ) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py index b01bb33611d6..4e87fced2d85 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py @@ -12,7 +12,12 @@ import pandas as pd -from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, DefaultOpenEncoding, Prefixes +from azure.ai.evaluation._constants import ( + DEFAULT_EVALUATION_RESULTS_FILE_NAME, + DefaultOpenEncoding, + Prefixes, + EvaluationRunProperties, +) from azure.ai.evaluation._evaluate._eval_run import EvalRun from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException @@ -112,7 +117,8 @@ def _log_metrics_and_instance_results( if run is None: ev_run.write_properties_to_run_history( properties={ - "_azureml.evaluation_run": "azure-ai-generative-parent", + EvaluationRunProperties.RUN_TYPE: "eval_run", + EvaluationRunProperties.EVALUATION_RUN: "azure-ai-generative-parent", "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]), "isEvaluatorRun": "true", } diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py index 1ee4e022250e..cb351de28f67 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py @@ -14,6 +14,7 @@ F1ScoreEvaluator, FluencyEvaluator, GroundednessEvaluator, + evaluate, ) @@ -399,6 +400,7 @@ def test_evaluate_track_in_cloud( # module named test_evaluate and it will be a different module in unit test # folder. By keeping function in separate file we guarantee, it will be loaded # from there. + # os.environ["AZURE_TEST_RUN_LIVE"] = "True" from .target_fn import target_fn f1_score_eval = F1ScoreEvaluator() @@ -413,7 +415,6 @@ def test_evaluate_track_in_cloud( ) row_result_df = pd.DataFrame(result["rows"]) - assert "outputs.answer" in row_result_df.columns assert "outputs.answer.length" in row_result_df.columns assert list(row_result_df["outputs.answer.length"]) == [28, 76, 22] assert "outputs.f1.f1_score" in row_result_df.columns @@ -427,6 +428,7 @@ def test_evaluate_track_in_cloud( assert remote_run is not None assert remote_run["runMetadata"]["properties"]["azureml.promptflow.local_to_cloud"] == "true" assert remote_run["runMetadata"]["properties"]["runType"] == "eval_run" + assert remote_run["runMetadata"]["properties"]["_azureml.evaluation_run"] == "promptflow.BatchRun" assert remote_run["runMetadata"]["displayName"] == evaluation_name @pytest.mark.skipif(in_ci(), reason="This test fails in CI and needs to be investigate. Bug: 3458432") @@ -470,6 +472,7 @@ def test_evaluate_track_in_cloud_no_target( remote_run = _get_run_from_run_history(run_id, azure_ml_client, project_scope) assert remote_run is not None + assert remote_run["runMetadata"]["properties"]["runType"] == "eval_run" assert remote_run["runMetadata"]["properties"]["_azureml.evaluation_run"] == "azure-ai-generative-parent" assert remote_run["runMetadata"]["displayName"] == evaluation_name