feat: enable internal components in pipeline yaml #26800

Merged
2 changes: 1 addition & 1 deletion sdk/ml/azure-ai-ml/azure/ai/ml/_internal/_schema/node.py
@@ -40,7 +40,7 @@ def make(self, data, **kwargs):  # pylint: disable=unused-argument, no-self-use
         # dict to node object
         from azure.ai.ml.entities._job.pipeline._load_component import pipeline_node_factory

-        return pipeline_node_factory.load_from_dict(data)  # pylint: disable=E1125, too-many-function-args
+        return pipeline_node_factory.load_from_dict(data=data)

     @pre_dump
     def resolve_inputs_outputs(self, job, **kwargs):  # pylint: disable=unused-argument, no-self-use
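The two pylint suppressions were papering over a real signature mismatch: as the `_load_component.py` hunk below shows, `load_from_dict(self, *, data: dict, _type: str = None)` makes everything after the bare `*` keyword-only, so the positional call was genuinely invalid. A minimal sketch of that rule, using a hypothetical factory class rather than the SDK's:

```python
# Parameters after a bare `*` are keyword-only; positional calls raise TypeError.
class NodeFactory:  # hypothetical stand-in, not the SDK class
    def load_from_dict(self, *, data: dict, _type: str = None):
        return {"resolved_type": _type or data.get("type"), **data}

factory = NodeFactory()
print(factory.load_from_dict(data={"type": "command"}))  # OK
# factory.load_from_dict({"type": "command"})
# -> TypeError: load_from_dict() takes 1 positional argument but 2 were given
```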
@@ -264,9 +264,9 @@ def command(self, value: str) -> None:
         if isinstance(self.component, Component):
             self.component.command = value
         else:
-            msg = "Can't set command property for a registered component {}"
+            msg = "Can't set command property for a registered component {}. Tried to set it to {}."
             raise ValidationException(
-                message=msg.format(self.component),
+                message=msg.format(self.component, value),
                 no_personal_data_message=msg,
                 target=ErrorTarget.COMMAND_JOB,
                 error_category=ErrorCategory.USER_ERROR,
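Echoing the rejected value makes the error self-explanatory, while `no_personal_data_message` still receives the unformatted template, so the user-supplied command stays out of the sanitized variant. A distilled sketch of the guard and the message it now produces (the class and reference string here are hypothetical stand-ins, not the SDK's):

```python
# Distilled sketch of the guard above (hypothetical stand-ins, not the SDK classes).
class Component:
    """Stands in for an inline component, whose command is still editable."""
    command = None

def set_command(node_component, value):
    if isinstance(node_component, Component):
        node_component.command = value
    else:  # anything else is treated as a registered-component reference
        msg = "Can't set command property for a registered component {}. Tried to set it to {}."
        raise ValueError(msg.format(node_component, value))

set_command(Component(), "echo hello")  # fine
try:
    set_command("azureml:my_component:1", "echo hello")  # hypothetical registered reference
except ValueError as exc:
    print(exc)
# -> Can't set command property for a registered component azureml:my_component:1. Tried to set it to echo hello.
```

The registry test config further down drops exactly such a `command:` override from a node that references a registered component, presumably because this path now rejects it.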
@@ -20,6 +20,7 @@
 from azure.ai.ml.entities._builders.do_while import DoWhile
 from azure.ai.ml.entities._builders.pipeline import Pipeline
 from azure.ai.ml.entities._component.component import Component
+from azure.ai.ml.entities._component.component_factory import component_factory
 from azure.ai.ml.entities._job.automl.automl_job import AutoMLJob
 from azure.ai.ml.entities._util import extract_label
 from azure.ai.ml.exceptions import ErrorCategory, ErrorTarget, ValidationException
@@ -172,7 +173,19 @@ def load_from_dict(self, *, data: dict, _type: str = None) -> Union[BaseNode, AutoMLJob]:
         else:
             data[CommonYamlFields.TYPE] = _type

-        new_instance = self.get_create_instance_func(_type)()
+        new_instance: Union[BaseNode, AutoMLJob] = self.get_create_instance_func(_type)()
+
+        if isinstance(new_instance, BaseNode):
+            # parse component
+            component_key = new_instance._get_component_attr_name()
+            if component_key in data and isinstance(data[component_key], dict):
+                data[component_key] = component_factory.load_from_dict(
+                    data=data[component_key],
+                    context={
+                        BASE_PATH_CONTEXT_KEY: data[component_key].get(BASE_PATH_CONTEXT_KEY, None),
+                    }
+                )

         new_instance.__init__(**data)
         return new_instance
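This is the functional core of the PR: when a node's component field deserializes as a mapping (an inline definition) rather than a string reference, it is now materialized through `component_factory.load_from_dict` before `new_instance.__init__(**data)` runs, so the component's own `type` field, such as `CommandComponent`, decides what gets built. A toy sketch of that dispatch idea, with hypothetical names rather than the SDK's internals:

```python
# Toy sketch of inline-component materialization (all names hypothetical).
from typing import Any, Callable, Dict

COMPONENT_LOADERS: Dict[str, Callable[[dict], Any]] = {
    "CommandComponent": lambda d: ("CommandComponent", d.get("command")),
}

def load_node_from_dict(data: dict) -> dict:
    component = data.get("component")
    if isinstance(component, dict):  # inline definition, not an id or file reference
        data["component"] = COMPONENT_LOADERS[component["type"]](component)
    return data  # the real factory would now call new_instance.__init__(**data)

node = load_node_from_dict({
    "type": "CommandComponent",
    "component": {"type": "CommandComponent", "command": "echo hello"},
})
print(node["component"])  # -> ('CommandComponent', 'echo hello')
```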
@@ -8,7 +8,7 @@
 import pytest
 import yaml

-from azure.ai.ml import Input, load_component
+from azure.ai.ml import Input, load_component, load_job
 from azure.ai.ml._internal import (
     AISuperComputerConfiguration,
     AISuperComputerScalePolicy,
@@ -592,3 +592,18 @@ def test_pipeline_with_setting_node_output_directly(self) -> None:
         copy_file.outputs.output_dir.path = "path_on_datastore"
         assert copy_file.outputs.output_dir.path == "path_on_datastore"
         assert copy_file.outputs.output_dir.type == "path"
+
+    def test_job_properties(self):
+        pipeline_job: PipelineJob = load_job(
+            source="./tests/test_configs/internal/pipeline_jobs/pipeline_job_with_properties.yml"
+        )
+        pipeline_dict = pipeline_job._to_dict()
+        rest_pipeline_dict = pipeline_job._to_rest_object().as_dict()["properties"]
+        assert pipeline_dict["properties"] == {"AZURE_ML_PathOnCompute_input_data": "/tmp/test"}
+        assert rest_pipeline_dict["properties"] == pipeline_dict["properties"]
+        for name, node_dict in pipeline_dict["jobs"].items():
+            rest_node_dict = rest_pipeline_dict["jobs"][name]
+            assert len(node_dict["properties"]) == 1
+            assert "AZURE_ML_PathOnCompute_" in list(node_dict["properties"].keys())[0]
+            assert node_dict["properties"] == rest_node_dict["properties"]

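The same round trip can be reproduced interactively. A sketch mirroring the test's calls (run from the sdk/ml/azure-ai-ml directory; `_to_dict` and `_to_rest_object` are private helpers, used here only because the test uses them):

```python
# Mirrors what test_job_properties asserts; paths are relative to sdk/ml/azure-ai-ml.
import azure.ai.ml._internal  # noqa: F401  # the test module imports this too; it appears to register the internal node types
from azure.ai.ml import load_job

job = load_job(source="./tests/test_configs/internal/pipeline_jobs/pipeline_job_with_properties.yml")
print(job._to_dict()["properties"])
# per the test: {'AZURE_ML_PathOnCompute_input_data': '/tmp/test'}
for name, node in job._to_dict()["jobs"].items():
    print(name, node["properties"])  # one AZURE_ML_PathOnCompute_* entry per node
```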
@@ -26,139 +26,13 @@ properties:
   AZURE_ML_PathOnCompute_input_data: "/tmp/test"

 jobs:
-  node0: # inline command job with properties
-    command: echo hello ${{inputs.hello_string}}
-    environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
-    inputs:
-      hello_string: ${{parent.inputs.hello_string}}
-    properties:
-      AZURE_ML_PathOnCompute_hello_string: "/tmp/test"
-
-  node1: # inline parallel job with properties
-    type: parallel
-    compute: "azureml:cpu-cluster"
-    inputs:
-      test1: ${{parent.inputs.input_data}}
-    resources:
-      instance_count: 3
-    mini_batch_size: "100kb"
-    mini_batch_error_threshold: 5
-    logging_level: "DEBUG"
-    input_data: ${{inputs.input_data}}
-    max_concurrency_per_instance: 2
-    task:
-      type: run_function
-      code: "../python"
-      entry_script: pass_through.py
-      append_row_to: ${{outputs.scored_result}} # optional, If Null, equals to summary_only mode in v1.
-      environment: azureml:my-env:1
-    properties:
-      AZURE_ML_PathOnCompute_input_data: "/tmp/test"
-
-  node2: # inline import job with properties
-    type: import
-    source:
-      type: azuresqldb
-      query: >-
-        select * from REGION
-      connection: azureml:my_username_password
-    output:
-      type: mltable
-      path: azureml://datastores/workspaceblobstore/paths/output_dir/
-    properties:
-      AZURE_ML_PathOnCompute_output: "/tmp/test"
-
-  node3: # inline spark job with properties
-    type: spark
-    inputs:
-      test1: ${{parent.inputs.input_data}}
-      file_input2: ${{parent.inputs.input_data}}
-    code: ../dsl_pipeline/spark_job_in_pipeline/src
-    entry:
-      file: entry.py # file path of the entry file relative to the code root folder
-    py_files:
-      - utils.zip
-    jars:
-      - scalaproj.jar
-    files:
-      - my_files.txt
-    args: >-
-      --file_input1 ${{inputs.test1}}
-      --file_input2 ${{inputs.file_input2}}
-      --output ${{outputs.output}}
-    compute: azureml:rezas-synapse-10
-    conf:
-      spark.driver.cores: 2
-      spark.driver.memory: "1g"
-      spark.executor.cores: 1
-      spark.executor.memory: "1g"
-      spark.executor.instances: 1
-    properties:
-      AZURE_ML_PathOnCompute_input_data: "/tmp/test"
-
-  node4: # inline automl job with properties
-    type: automl
-    task: text_ner
-    log_verbosity: info
-    primary_metric: accuracy
-    limits:
-      max_trials: 1
-      timeout_minutes: 60
-    training_data: ${{parent.inputs.text_ner_training_data}}
-    validation_data: ${{parent.inputs.text_ner_validation_data}}
-    properties:
-      AZURE_ML_PathOnCompute_training_data: "/tmp/test"
-
-  node5: # inline sweep job with properties
-    type: sweep
-    search_space:
-      component_in_number:
-        type: choice
-        values:
-          - 25
-          - 35
-    limits:
-      max_total_trials: 3
-    sampling_algorithm: random
-    objective:
-      goal: maximize
-      primary_metric: accuracy
-    trial: azureml:microsoftsamplescommandcomponentbasic_nopaths_test:1
-    properties:
-      AZURE_ML_PathOnCompute_input: "/tmp/test"
-
-  node6: # parallel node with properties as a typical implement of base node.
-    type: parallel
+  node7: # internal command node with properties as a typical implement of internal base node.
+    type: CommandComponent
     compute: azureml:cpu-cluster
-    component: ../components/parallel_component_with_file_input.yml
+    component: file:../helloworld/helloworld_component_command.yml
     inputs:
-      job_data_path: ${{parent.inputs.pipeline_job_data_path}}
-    outputs:
-      job_output_path:
-    mini_batch_size: "1"
-    mini_batch_error_threshold: 1
-    max_concurrency_per_instance: 1
-    properties:
-      AZURE_ML_PathOnCompute_job_data_path: "/tmp/test"
-
-  # Comment these lines out as internal node is not well supported in yaml now.
-  # node7: # internal command node with properties as a typical implement of internal base node.
-  #   type: CommandComponent
-  #   compute: azureml:cpu-cluster
-  #   component: ../internal/helloworld/helloworld_component_command.yml
-  #   inputs:
-  #     training_data: ${{parent.inputs.input_data}}
-  #     max_epochs: 10
-  #     learning_rate: 0.01
-  #   properties:
-  #     AZURE_ML_PathOnCompute_job_training_data: "/tmp/test"
-
-  node8: # pipeline node with properties
-    type: pipeline
-    inputs:
-      component_in_number: 11
-      component_in_path: ${{parent.inputs.input_data}}
-
-    component: ../components/helloworld_pipeline_component.yml
+      training_data: ${{parent.inputs.input_data}}
+      max_epochs: 10
+      learning_rate: 0.01
     properties:
-      AZURE_ML_PathOnCompute_job_component_in_path: "/tmp/test"
+      AZURE_ML_PathOnCompute_job_training_data: "/tmp/test"
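Note the `file:` prefix on node7's component reference, where the old node6 used a bare relative path: a string-valued component may be a local file, a workspace asset, or a registry id, and the prefix disambiguates the first case. A toy illustration of that kind of disambiguation (the heuristics are hypothetical; only the three reference strings are taken from these test configs):

```python
# Toy disambiguation of string component references (hypothetical heuristics).
def classify_component_ref(ref: str) -> str:
    if ref.startswith("file:"):
        return "local file: " + ref[len("file:"):]
    if ref.startswith("azureml://registries/"):
        return "registry asset"
    if ref.startswith("azureml:"):
        return "workspace asset"
    return "relative path (legacy form)"

for ref in (
    "file:../helloworld/helloworld_component_command.yml",
    "azureml://registries/testFeed/components/my_hello_world_asset_2/versions/1",
    "azureml:microsoftsamplescommandcomponentbasic_nopaths_test:1",
):
    print(classify_component_ref(ref))
```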
@@ -8,7 +8,6 @@ inputs:
 jobs:
   a:
     component: azureml://registries/testFeed/components/my_hello_world_asset_2/versions/1
-    command: echo hello ${{inputs.hello_string}}
     environment: azureml://registries/testFeed/environments/sklearn-10-ubuntu2004-py38-cpu/versions/19.dev6
   b:
     command: echo "world" >> ${{outputs.world_output}}/world.txt
@@ -141,18 +141,6 @@ jobs:
     properties:
       AZURE_ML_PathOnCompute_job_data_path: "/tmp/test"

-  # Comment these lines out as internal node is not well supported in yaml now.
-  # node7: # internal command node with properties as a typical implement of internal base node.
-  #   type: CommandComponent
-  #   compute: azureml:cpu-cluster
-  #   component: ../internal/helloworld/helloworld_component_command.yml
-  #   inputs:
-  #     training_data: ${{parent.inputs.input_data}}
-  #     max_epochs: 10
-  #     learning_rate: 0.01
-  #   properties:
-  #     AZURE_ML_PathOnCompute_job_training_data: "/tmp/test"
-
   node8: # pipeline node with properties
     type: pipeline
     inputs: