%YAML 1.2
---
# OpenPAI Job Protocol YAML
protocolVersion: String, required # Protocol version, current version is 2.
name: String, required # String in ^[a-zA-Z0-9_-]+$ format.
type: String, required # Component type, should be "job" here.
version: String, optional # Component version, default is latest.
contributor: String, optional
description: String, optional
prerequisites: # Optional
# Each item is the protocol for data, script, dockerimage, or output type.
- protocolVersion: String, optional # If omitted, follow the protocolVersion in root.
name: String, required
type: String, required # Component type. Must be one of the following: data, script, dockerimage, or output. Prerequisites.type cannot be "job".
version: String, optional # Component version, default is latest.
contributor: String, optional
description: String, optional
auth: Object, optional # Only available when the type is dockerimage.
username: String, optional
password: String, optional # If a password is needed, it should be referenced as a secret.
registryuri: String, optional
uri: String or list, required # Only when the type is data can the uri be a list.
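# For instance (mirroring the cifar10 prerequisite in the example document below),
# a data prerequisite may list its download locations as a uri list:
#   - protocolVersion: 2
#     name: cifar10
#     type: data
#     uri:
#       - https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz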
# If specified, the whole parameters object can be referenced as `$parameters`.
# Scope of reference `$parameters`: the reference is shared among all task roles.
parameters: # Optional, can be omitted.
<param1>: value1 # Specify the name and value of all the referenceable parameters that will be used in the whole job template.
<param2>: value2 # Can be referenced by `<% $parameters.param1 %>`, `<% $parameters.param2 %>`.
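# A minimal sketch of parameter substitution (the parameter name `epochs` and the
# training command are hypothetical, not part of this protocol):
#   parameters:
#     epochs: 10
# A command such as `python train.py --epochs <% $parameters.epochs %>` would be
# rendered to `python train.py --epochs 10` before the container starts.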
# If sensitive information such as a password or an API key is needed in the protocol,
# it should be specified here in the secrets section and referenced as `$secrets`.
# Scope of reference `$secrets`: the reference is shared among all task roles and the docker image's `auth` field.
# A system that supports the PAI protocol should keep the secret information away from
# unauthorized users (how to define an unauthorized user is out of the scope of this protocol).
# For example, the YAML file used for job cloning and the stdout/stderr should hide all information marked as secrets.
secrets: # Optional, can be omitted.
<secret1>: password # Specify name and value of all secrets that will be used in the whole job template.
<secret2>: key # Can be referenced by `<% $secrets.secret1 %>`, `<% $secrets.secret2 %>`.
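# A short sketch of secret substitution (the secret name `registry_password` and its
# value are illustrative; the example document below uses `docker_password` the same way):
#   secrets:
#     registry_password: mypassword
#   prerequisites:
#     - type: dockerimage
#       auth:
#         password: <% $secrets.registry_password %>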
jobRetryCount: Integer, optional # Default is 0.
# Task roles are the different types of tasks in the protocol.
# One job may have one or more task roles, each task role has one or more instances, and each instance runs inside one container.
taskRoles:
<name>: String, required # Name of the taskRole, string in ^[a-zA-Z_][a-zA-Z0-9_]*$ format (valid C variable name).
instances: Integer, optional # Default is 1; number of instances of the taskRole, no less than 1.
completion: # Completion policy for the job, https://github.com/Microsoft/pai/blob/master/subprojects/frameworklauncher/yarn/doc/USERMANUAL.md#ApplicationCompletionPolicy.
# Number of failed tasks that fails the entire job; -1 or no less than 1. If set to -1, the job will always succeed regardless of any task failure.
minFailedInstances: Integer, optional # Default is 1.
# Number of succeeded tasks that succeeds the entire job; -1 or no less than 1. If set to -1, the job will only succeed after all tasks have completed and minFailedInstances is not triggered.
minSucceededInstances: Integer, optional # Default is task instances.
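# For instance, in the example document below the worker role sets
#   completion:
#     minFailedInstances: 1
#     minSucceededInstances: 1
# so a single failed or succeeded worker instance decides the job outcome, while the
# ps_server role sets minSucceededInstances: -1 so that, per the description above,
# ps_server successes alone do not finish the job.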
taskRetryCount: Integer, optional # Default is 0.
dockerImage: String, required # Should reference a dockerimage defined in prerequisites.
# Scope of the references `$data`, `$output`, `$script`: the reference is only valid inside this task role.
# User cannot reference them from another task role. The reference for `$parameters` is global and shared among task roles.
data: String, optional # Select data defined in prerequisites, target can be referenced as `$data` in this task role.
output: String, optional # Select output defined in prerequisites, target can be referenced as `$output` in this task role.
script: String, optional # Select script defined in prerequisites, target can be referenced as `$script` in this task role.
extraContainerOptions:
shmMB: Integer, optional # Configure the size of /dev/shm in the docker container, https://docs.docker.com/compose/compose-file/#shm_size.
infiniband: Boolean, optional # Whether to use InfiniBand devices in the docker container.
resourcePerInstance:
cpu: Integer, required # CPU number, unit is CPU vcore
memoryMB: Integer, required # Memory number, unit is MB
gpu: Integer, required # GPU number, unit is GPU card
ports: # Optional, only for host network, port label is string in ^[a-zA-Z_][a-zA-Z0-9_]*$ format (valid C variable name).
<portLabel>: Integer, required, minimum number is 1 # Port number for the port label.
commands:
- String, required
# To handle the case that a component may interact with different components differently, the user is encouraged to place the code handling such differences in the "deployments" field,
# e.g., a job may get input data through wget, hdfs dfs -cp, copy, or by reading directly from remote storage. This logic can be placed here.
# In summary, the deployments field is responsible for making sure the job runs properly in a deployment-specific runtime environment.
# One could have many deployments, but only one deployment can be activated at runtime by specifying it in "defaults". The user can choose the deployment and specify it in "defaults" at submission time.
deployments:
- name: String, required
taskRoles:
<name>: String, required # Should be in taskRoles
preCommands:
- String, required # Execute before the taskRole's command
postCommands:
- String, required # Execute after the taskRole's command
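# A small sketch of an alternative deployment (the name "debug" and the echo commands
# are hypothetical); only one deployment is activated at runtime via "defaults":
#   deployments:
#     - name: debug
#       taskRoles:
#         worker:
#           preCommands:
#             - echo "preparing worker inputs"
#           postCommands:
#             - echo "worker finished"
#   defaults:
#     deployment: debug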
defaults: # Optional, default cluster-specific settings
virtualCluster: String, optional
deployment: String, optional # Should reference a deployment defined in deployments
extras: # Optional extra field (object); stores any information that plugins may use
submitFrom: String, optional
hivedscheduler: # Optional
jobPriorityClass: String, required
taskRoles:
<name>: String, required
gpuType/reservationId: String, required # Only one allowed
affinityGroupName: String, optional
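# A sketch of the hivedscheduler extras (the priority class "prod", the gpuType "K80",
# and the affinity group name are placeholder values, not ones this protocol defines):
#   extras:
#     hivedscheduler:
#       jobPriorityClass: prod
#       taskRoles:
#         worker:
#           gpuType: K80
#           affinityGroupName: group1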
---
# OpenPAI Job Protocol YAML Example for a Distributed TensorFlow Job
protocolVersion: 2
name: tensorflow_cifar10
type: job
version: 1.0
contributor: Alice
description: image classification, cifar10 dataset, tensorflow, distributed training
prerequisites:
- protocolVersion: 2
name: tf_example
type: dockerimage
version: latest
contributor: Alice
description: python3.5, tensorflow
auth:
username: user
password: <% $secrets.docker_password %>
registryuri: openpai.azurecr.io
uri: openpai/pai.example.tensorflow
- protocolVersion: 2
name: tensorflow_cifar10_model
type: output
version: latest
contributor: Alice
description: cifar10 data output
uri: hdfs://10.151.40.179:9000/core/cifar10_model
- protocolVersion: 2
name: tensorflow_cnnbenchmarks
type: script
version: 84820935288cab696c9c2ac409cbd46a1f24723d
contributor: MaggieQi
description: tensorflow benchmarks
uri: github.com/MaggieQi/benchmarks
- protocolVersion: 2
name: cifar10
type: data
version: latest
contributor: Alice
description: cifar10 dataset, image classification
uri:
- https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
parameters:
model: resnet20
batchsize: 32
secrets:
docker_password: password
github_token: cGFzc3dvcmQ=
jobRetryCount: 1
taskRoles:
worker:
instances: 1
completion:
minFailedInstances: 1
minSucceededInstances: 1
taskRetryCount: 0
dockerImage: tf_example
data: cifar10
output: tensorflow_cifar10_model
script: tensorflow_cnnbenchmarks
extraContainerOptions:
shmMB: 64
resourcePerInstance:
cpu: 2
memoryMB: 16384
gpu: 4
ports:
ssh: 1
http: 1
commands:
- cd script_<% $script.name %>/scripts/tf_cnn_benchmarks
- >
python tf_cnn_benchmarks.py --job_name=worker
--local_parameter_device=gpu
--variable_update=parameter_server
--ps_hosts=$PAI_TASK_ROLE_ps_server_HOST_LIST
--worker_hosts=$PAI_TASK_ROLE_worker_HOST_LIST
--task_index=$PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX
--data_name=<% $data.name %>
--data_dir=$PAI_WORK_DIR/data_<% $data.name %>
--train_dir=$PAI_WORK_DIR/output_<% $output.name %>
--model=<% $parameters.model %>
--batch_size=<% $parameters.batchsize %>
ps_server:
instances: 1
completion:
minFailedInstances: 1
minSucceededInstances: -1
taskRetryCount: 0
dockerImage: tf_example
data: cifar10
output: tensorflow_cifar10_model
script: tensorflow_cnnbenchmarks
extraContainerOptions:
shmMB: 64
resourcePerInstance:
cpu: 2
memoryMB: 8192
gpu: 0
ports:
ssh: 1
http: 1
commands:
- cd script_<% $script.name %>/scripts/tf_cnn_benchmarks
- >
python tf_cnn_benchmarks.py --job_name=ps
--local_parameter_device=gpu
--variable_update=parameter_server
--ps_hosts=$PAI_TASK_ROLE_ps_server_HOST_LIST
--worker_hosts=$PAI_TASK_ROLE_worker_HOST_LIST
--task_index=$PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX
--data_dir=$PAI_WORK_DIR/data_<% $data.name %>
--data_name=<% $data.name %>
--train_dir=$PAI_WORK_DIR/output_<% $output.name %>
--model=<% $parameters.model %>
--batch_size=<% $parameters.batchsize %>
deployments:
- name: prod # This implementation will download the data to local disk; the computed model will be output to local disk first and then copied to HDFS.
version: 1.0
taskRoles:
worker:
preCommands:
- wget <% $data.uri[0] %> -P data_<% $data.name %> # If a local data cache is deployed, one can copy data from the local cache and only wget on a cache miss.
- >
git clone https://<% $script.contributor %>:<% $secrets.github_token %>@<% $script.uri %> script_<% $script.name %> &&
cd script_<% $script.name %> && git checkout <% $script.version %> && cd ..
# Then the system goes on to execute the worker's commands.
ps_server:
preCommands:
- wget <% $data.uri[0] %> -P data_<% $data.name %>
- >
git clone https://<% $script.contributor %>:<% $secrets.github_token %>@<% $script.uri %> script_<% $script.name %> &&
cd script_<% $script.name %> && git checkout <% $script.version %> && cd ..
# Then the system goes on to execute the ps_server's commands.
postCommands:
# After the execution of ps_server's command, the system goes here.
- hdfs dfs -cp output_<% $output.name %> <% $output.uri %>
# The model is assumed to be output locally, and this command copies the local output to HDFS. One could also output to HDFS directly.
# In that case, "--train_dir=$PAI_WORK_DIR/output_<% $output.name %>" would have to be changed.
defaults:
deployment: prod # Use prod deployment in job submission.