From ed6259514449d868c9107604b4dc3d0ec4c17de7 Mon Sep 17 00:00:00 2001 From: zhangxingzhi Date: Tue, 18 Oct 2022 19:23:22 +0800 Subject: [PATCH 1/4] feat: enable internal components in pipeline yaml --- .../azure/ai/ml/_internal/_schema/node.py | 4 +- .../azure/ai/ml/entities/_builders/command.py | 4 +- .../entities/_job/pipeline/_load_component.py | 11 ++ .../internal/unittests/test_pipeline_job.py | 17 ++- .../pipeline_job_with_properties.yml | 140 +----------------- .../pipeline_job_with_properties.yml | 12 -- 6 files changed, 38 insertions(+), 150 deletions(-) diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/_internal/_schema/node.py b/sdk/ml/azure-ai-ml/azure/ai/ml/_internal/_schema/node.py index 7cba9af5c709..39bb649e5890 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/_internal/_schema/node.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/_internal/_schema/node.py @@ -6,7 +6,7 @@ from azure.ai.ml._schema import ArmVersionedStr, NestedField, RegistryStr, StringTransformedEnum, UnionField from azure.ai.ml._schema.pipeline.component_job import BaseNodeSchema, _resolve_inputs_outputs -from azure.ai.ml.constants._common import AzureMLResourceType +from azure.ai.ml.constants._common import AzureMLResourceType, BASE_PATH_CONTEXT_KEY from .component import InternalBaseComponentSchema, NodeType @@ -40,7 +40,7 @@ def make(self, data, **kwargs): # pylint: disable=unused-argument, no-self-use # dict to node object from azure.ai.ml.entities._job.pipeline._load_component import pipeline_node_factory - return pipeline_node_factory.load_from_dict(data) # pylint: disable=E1125, too-many-function-args + return pipeline_node_factory.load_from_dict(data=data) @pre_dump def resolve_inputs_outputs(self, job, **kwargs): # pylint: disable=unused-argument, no-self-use diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_builders/command.py b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_builders/command.py index b0c4696d2883..18b99b098f17 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_builders/command.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_builders/command.py @@ -264,9 +264,9 @@ def command(self, value: str) -> None: if isinstance(self.component, Component): self.component.command = value else: - msg = "Can't set command property for a registered component {}" + msg = "Can't set command property for a registered component {}. Tried to set it to {}." 
raise ValidationException( - message=msg.format(self.component), + message=msg.format(self.component, value), no_personal_data_message=msg, target=ErrorTarget.COMMAND_JOB, error_category=ErrorCategory.USER_ERROR, diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/pipeline/_load_component.py b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/pipeline/_load_component.py index 332ed4255a58..59803dd94135 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/pipeline/_load_component.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/pipeline/_load_component.py @@ -20,6 +20,7 @@ from azure.ai.ml.entities._builders.do_while import DoWhile from azure.ai.ml.entities._builders.pipeline import Pipeline from azure.ai.ml.entities._component.component import Component +from azure.ai.ml.entities._component.component_factory import component_factory from azure.ai.ml.entities._job.automl.automl_job import AutoMLJob from azure.ai.ml.entities._util import extract_label from azure.ai.ml.exceptions import ErrorCategory, ErrorTarget, ValidationException @@ -172,6 +173,16 @@ def load_from_dict(self, *, data: dict, _type: str = None) -> Union[BaseNode, Au else: data[CommonYamlFields.TYPE] = _type + # parse component + component_key = "component" + if component_key in data and isinstance(data[component_key], dict): + data[component_key] = component_factory.load_from_dict( + data=data[component_key], + context={ + BASE_PATH_CONTEXT_KEY: data[component_key].get(BASE_PATH_CONTEXT_KEY, None), + } + ) + new_instance = self.get_create_instance_func(_type)() new_instance.__init__(**data) return new_instance diff --git a/sdk/ml/azure-ai-ml/tests/internal/unittests/test_pipeline_job.py b/sdk/ml/azure-ai-ml/tests/internal/unittests/test_pipeline_job.py index c4d954ee3769..123b46f1c53c 100644 --- a/sdk/ml/azure-ai-ml/tests/internal/unittests/test_pipeline_job.py +++ b/sdk/ml/azure-ai-ml/tests/internal/unittests/test_pipeline_job.py @@ -8,7 +8,7 @@ import pytest import yaml -from azure.ai.ml import Input, load_component +from azure.ai.ml import Input, load_component, load_job from azure.ai.ml._internal import ( AISuperComputerConfiguration, AISuperComputerScalePolicy, @@ -592,3 +592,18 @@ def test_pipeline_with_setting_node_output_directly(self) -> None: copy_file.outputs.output_dir.path = "path_on_datastore" assert copy_file.outputs.output_dir.path == "path_on_datastore" assert copy_file.outputs.output_dir.type == "path" + + def test_job_properties(self): + pipeline_job: PipelineJob = load_job( + source="./tests/test_configs/internal/pipeline_jobs/pipeline_job_with_properties.yml" + ) + pipeline_dict = pipeline_job._to_dict() + rest_pipeline_dict = pipeline_job._to_rest_object().as_dict()["properties"] + assert pipeline_dict["properties"] == {"AZURE_ML_PathOnCompute_input_data": "/tmp/test"} + assert rest_pipeline_dict["properties"] == pipeline_dict["properties"] + for name, node_dict in pipeline_dict["jobs"].items(): + rest_node_dict = rest_pipeline_dict["jobs"][name] + assert len(node_dict["properties"]) == 1 + assert "AZURE_ML_PathOnCompute_" in list(node_dict["properties"].keys())[0] + assert node_dict["properties"] == rest_node_dict["properties"] + diff --git a/sdk/ml/azure-ai-ml/tests/test_configs/internal/pipeline_jobs/pipeline_job_with_properties.yml b/sdk/ml/azure-ai-ml/tests/test_configs/internal/pipeline_jobs/pipeline_job_with_properties.yml index a208c528b32f..3904529ce48f 100644 --- a/sdk/ml/azure-ai-ml/tests/test_configs/internal/pipeline_jobs/pipeline_job_with_properties.yml +++ 
b/sdk/ml/azure-ai-ml/tests/test_configs/internal/pipeline_jobs/pipeline_job_with_properties.yml @@ -26,139 +26,13 @@ properties: AZURE_ML_PathOnCompute_input_data: "/tmp/test" jobs: - node0: # inline command job with properties - command: echo hello ${{inputs.hello_string}} - environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest - inputs: - hello_string: ${{parent.inputs.hello_string}} - properties: - AZURE_ML_PathOnCompute_hello_string: "/tmp/test" - - node1: # inline parallel job with properties - type: parallel - compute: "azureml:cpu-cluster" - inputs: - test1: ${{parent.inputs.input_data}} - resources: - instance_count: 3 - mini_batch_size: "100kb" - mini_batch_error_threshold: 5 - logging_level: "DEBUG" - input_data: ${{inputs.input_data}} - max_concurrency_per_instance: 2 - task: - type: run_function - code: "../python" - entry_script: pass_through.py - append_row_to: ${{outputs.scored_result}} # optional, If Null, equals to summary_only mode in v1. - environment: azureml:my-env:1 - properties: - AZURE_ML_PathOnCompute_input_data: "/tmp/test" - - node2: # inline import job with properties - type: import - source: - type: azuresqldb - query: >- - select * from REGION - connection: azureml:my_username_password - output: - type: mltable - path: azureml://datastores/workspaceblobstore/paths/output_dir/ - properties: - AZURE_ML_PathOnCompute_output: "/tmp/test" - - node3: # inline spark job with properties - type: spark - inputs: - test1: ${{parent.inputs.input_data}} - file_input2: ${{parent.inputs.input_data}} - code: ../dsl_pipeline/spark_job_in_pipeline/src - entry: - file: entry.py # file path of the entry file relative to the code root folder - py_files: - - utils.zip - jars: - - scalaproj.jar - files: - - my_files.txt - args: >- - --file_input1 ${{inputs.test1}} - --file_input2 ${{inputs.file_input2}} - --output ${{outputs.output}} - compute: azureml:rezas-synapse-10 - conf: - spark.driver.cores: 2 - spark.driver.memory: "1g" - spark.executor.cores: 1 - spark.executor.memory: "1g" - spark.executor.instances: 1 - properties: - AZURE_ML_PathOnCompute_input_data: "/tmp/test" - - node4: # inline automl job with properties - type: automl - task: text_ner - log_verbosity: info - primary_metric: accuracy - limits: - max_trials: 1 - timeout_minutes: 60 - training_data: ${{parent.inputs.text_ner_training_data}} - validation_data: ${{parent.inputs.text_ner_validation_data}} - properties: - AZURE_ML_PathOnCompute_training_data: "/tmp/test" - - node5: # inline sweep job with properties - type: sweep - search_space: - component_in_number: - type: choice - values: - - 25 - - 35 - limits: - max_total_trials: 3 - sampling_algorithm: random - objective: - goal: maximize - primary_metric: accuracy - trial: azureml:microsoftsamplescommandcomponentbasic_nopaths_test:1 - properties: - AZURE_ML_PathOnCompute_input: "/tmp/test" - - node6: # parallel node with properties as a typical implement of base node. - type: parallel + node7: # internal command node with properties as a typical implement of internal base node. 
+ type: CommandComponent compute: azureml:cpu-cluster - component: ../components/parallel_component_with_file_input.yml + component: file:../helloworld/helloworld_component_command.yml inputs: - job_data_path: ${{parent.inputs.pipeline_job_data_path}} - outputs: - job_output_path: - mini_batch_size: "1" - mini_batch_error_threshold: 1 - max_concurrency_per_instance: 1 - properties: - AZURE_ML_PathOnCompute_job_data_path: "/tmp/test" - -# Comment these lines out as internal node is not well supported in yaml now. -# node7: # internal command node with properties as a typical implement of internal base node. -# type: CommandComponent -# compute: azureml:cpu-cluster -# component: ../internal/helloworld/helloworld_component_command.yml -# inputs: -# training_data: ${{parent.inputs.input_data}} -# max_epochs: 10 -# learning_rate: 0.01 -# properties: -# AZURE_ML_PathOnCompute_job_training_data: "/tmp/test" - - node8: # pipeline node with properties - type: pipeline - inputs: - component_in_number: 11 - component_in_path: ${{parent.inputs.input_data}} - - component: ../components/helloworld_pipeline_component.yml + training_data: ${{parent.inputs.input_data}} + max_epochs: 10 + learning_rate: 0.01 properties: - AZURE_ML_PathOnCompute_job_component_in_path: "/tmp/test" + AZURE_ML_PathOnCompute_job_training_data: "/tmp/test" diff --git a/sdk/ml/azure-ai-ml/tests/test_configs/pipeline_jobs/pipeline_job_with_properties.yml b/sdk/ml/azure-ai-ml/tests/test_configs/pipeline_jobs/pipeline_job_with_properties.yml index a208c528b32f..6f686cef5891 100644 --- a/sdk/ml/azure-ai-ml/tests/test_configs/pipeline_jobs/pipeline_job_with_properties.yml +++ b/sdk/ml/azure-ai-ml/tests/test_configs/pipeline_jobs/pipeline_job_with_properties.yml @@ -141,18 +141,6 @@ jobs: properties: AZURE_ML_PathOnCompute_job_data_path: "/tmp/test" -# Comment these lines out as internal node is not well supported in yaml now. -# node7: # internal command node with properties as a typical implement of internal base node. 
-# type: CommandComponent -# compute: azureml:cpu-cluster -# component: ../internal/helloworld/helloworld_component_command.yml -# inputs: -# training_data: ${{parent.inputs.input_data}} -# max_epochs: 10 -# learning_rate: 0.01 -# properties: -# AZURE_ML_PathOnCompute_job_training_data: "/tmp/test" - node8: # pipeline node with properties type: pipeline inputs: From 895267082eb8489f555eb16c23237d726e66f087 Mon Sep 17 00:00:00 2001 From: zhangxingzhi Date: Tue, 18 Oct 2022 19:56:45 +0800 Subject: [PATCH 2/4] refactor: add some gate logic --- .../entities/_job/pipeline/_load_component.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/pipeline/_load_component.py b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/pipeline/_load_component.py index 59803dd94135..1eae81aace1a 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/pipeline/_load_component.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/pipeline/_load_component.py @@ -173,17 +173,19 @@ def load_from_dict(self, *, data: dict, _type: str = None) -> Union[BaseNode, Au else: data[CommonYamlFields.TYPE] = _type - # parse component - component_key = "component" - if component_key in data and isinstance(data[component_key], dict): - data[component_key] = component_factory.load_from_dict( - data=data[component_key], - context={ - BASE_PATH_CONTEXT_KEY: data[component_key].get(BASE_PATH_CONTEXT_KEY, None), - } - ) + new_instance: Union[BaseNode, AutoMLJob] = self.get_create_instance_func(_type)() + + if isinstance(new_instance, BaseNode): + # parse component + component_key = new_instance._get_component_attr_name() + if component_key in data and isinstance(data[component_key], dict): + data[component_key] = component_factory.load_from_dict( + data=data[component_key], + context={ + BASE_PATH_CONTEXT_KEY: data[component_key].get(BASE_PATH_CONTEXT_KEY, None), + } + ) - new_instance = self.get_create_instance_func(_type)() new_instance.__init__(**data) return new_instance From afd597015dcc9e53e19c511cfe96efe0f3c2ac72 Mon Sep 17 00:00:00 2001 From: zhangxingzhi Date: Tue, 18 Oct 2022 21:08:16 +0800 Subject: [PATCH 3/4] fix: fix test_pipeline_job_create_with_registries --- .../pipeline_jobs/hello_pipeline_job_with_registries.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/sdk/ml/azure-ai-ml/tests/test_configs/pipeline_jobs/hello_pipeline_job_with_registries.yml b/sdk/ml/azure-ai-ml/tests/test_configs/pipeline_jobs/hello_pipeline_job_with_registries.yml index 6fdbeaa2d653..339f57b1d4e4 100644 --- a/sdk/ml/azure-ai-ml/tests/test_configs/pipeline_jobs/hello_pipeline_job_with_registries.yml +++ b/sdk/ml/azure-ai-ml/tests/test_configs/pipeline_jobs/hello_pipeline_job_with_registries.yml @@ -8,7 +8,6 @@ inputs: jobs: a: component: azureml://registries/testFeed/components/my_hello_world_asset_2/versions/1 - command: echo hello ${{inputs.hello_string}} environment: azureml://registries/testFeed/environments/sklearn-10-ubuntu2004-py38-cpu/versions/19.dev6 b: command: echo "world" >> ${{outputs.world_output}}/world.txt From 73612711b267a9b2e205d2f52c7eab6b86c1b59f Mon Sep 17 00:00:00 2001 From: zhangxingzhi Date: Tue, 18 Oct 2022 21:42:10 +0800 Subject: [PATCH 4/4] fix: fix pylint --- sdk/ml/azure-ai-ml/azure/ai/ml/_internal/_schema/node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/_internal/_schema/node.py b/sdk/ml/azure-ai-ml/azure/ai/ml/_internal/_schema/node.py index 39bb649e5890..d198007ad54d 100644 
--- a/sdk/ml/azure-ai-ml/azure/ai/ml/_internal/_schema/node.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/_internal/_schema/node.py @@ -6,7 +6,7 @@ from azure.ai.ml._schema import ArmVersionedStr, NestedField, RegistryStr, StringTransformedEnum, UnionField from azure.ai.ml._schema.pipeline.component_job import BaseNodeSchema, _resolve_inputs_outputs -from azure.ai.ml.constants._common import AzureMLResourceType, BASE_PATH_CONTEXT_KEY +from azure.ai.ml.constants._common import AzureMLResourceType from .component import InternalBaseComponentSchema, NodeType
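
After this series, an internal CommandComponent can be referenced inline from a pipeline job YAML (see node7 in the internal test config above): pipeline_node_factory.load_from_dict now detects a nested component dict on BaseNode instances and resolves it through component_factory before instantiating the node. A minimal sketch of exercising that path, following the new test_job_properties unit test; the YAML path is the repo's internal test config, and _to_dict/_to_rest_object are the same private helpers the test uses rather than public API:

    from azure.ai.ml import load_job

    # Path is relative to the sdk/ml/azure-ai-ml test root, as in the new unit test.
    pipeline_job = load_job(
        source="./tests/test_configs/internal/pipeline_jobs/pipeline_job_with_properties.yml"
    )

    # The internal component referenced by node7's "component" field is parsed into a
    # component object during loading, so the job round-trips through both the entity
    # dict and the REST representation with its properties intact.
    pipeline_dict = pipeline_job._to_dict()
    rest_pipeline_dict = pipeline_job._to_rest_object().as_dict()["properties"]
    assert pipeline_dict["properties"] == {"AZURE_ML_PathOnCompute_input_data": "/tmp/test"}
    assert rest_pipeline_dict["properties"] == pipeline_dict["properties"]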