# description: train CNN model on CIFAR-10 dataset with distributed PyTorch

# imports
import os
import urllib.request
import tarfile
from pathlib import Path

from azureml.core import Workspace
from azureml.core import ScriptRunConfig, Experiment, Environment, Dataset
from azureml.core.runconfig import PyTorchConfiguration

# get workspace
ws = Workspace.from_config()

# directory containing this script
prefix = Path(__file__).parent

# training script
source_dir = str(prefix.joinpath("src"))
script_name = "train.py"

# azure ml settings
environment_name = "AzureML-PyTorch-1.6-GPU"  # using curated environment
experiment_name = "pytorch-cifar10-distributed-example"
compute_name = "gpu-K80-2"

# get environment
env = Environment.get(ws, name=environment_name)

# download and extract cifar-10 data
url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
filename = "cifar-10-python.tar.gz"
data_root = "cifar-10"
filepath = os.path.join(data_root, filename)
if not os.path.isdir(data_root):
    os.makedirs(data_root, exist_ok=True)
    urllib.request.urlretrieve(url, filepath)
    with tarfile.open(filepath, "r:gz") as tar:
        tar.extractall(path=data_root)
    os.remove(filepath)  # delete tar.gz file after extraction

# create azureml dataset
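# upload_directory uploads the local folder to the workspace's default datastore and returns a
# FileDataset that references the uploaded files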
datastore = ws.get_default_datastore()
dataset = Dataset.File.upload_directory(
    src_dir=data_root, target=(datastore, data_root)
)

# The training script in this example uses native PyTorch distributed training with DistributedDataParallel.
#
# To launch a distributed PyTorch job on Azure ML, you have two options:
# 1) Per-process launch - specify the total number of worker processes (typically one per GPU) you want
#    to run, and Azure ML will handle launching each process.
# 2) Per-node launch with torch.distributed.launch - provide the torch.distributed.launch command you
#    want to run on each node.
#
# Both options are demonstrated below.

###############################
# Option 1 - per-process launch
###############################
# To use the per-process launch option, in which Azure ML handles launching each of the processes that
# run your training script, create a `PyTorchConfiguration` and specify `node_count` and `process_count`.
# The `process_count` is the total number of processes you want to run for the job; this should typically
# equal the number of GPUs available on each node multiplied by the number of nodes.
#
# Azure ML will set the MASTER_ADDR, MASTER_PORT, NODE_RANK, and WORLD_SIZE environment variables on each
# node, in addition to the process-level RANK and LOCAL_RANK environment variables, which are needed for
# distributed PyTorch training.
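#
# For reference, a minimal sketch (not part of this job) of how a training script such as src/train.py
# might consume these environment variables to set up DistributedDataParallel -- the actual contents of
# src/train.py may differ:
"""
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

# init_method="env://" reads MASTER_ADDR, MASTER_PORT, RANK, and WORLD_SIZE from the environment
dist.init_process_group(backend="nccl", init_method="env://")

# pin this process to the GPU that matches its local rank on the node
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)

# wrap the model (hypothetical build_model helper) so gradients are synchronized across processes
model = build_model().to(local_rank)
model = DistributedDataParallel(model, device_ids=[local_rank])
"""
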
# create distributed config
distr_config = PyTorchConfiguration(process_count=4, node_count=2)
# create args
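# `dataset.as_download()` downloads the FileDataset onto the compute target at runtime and is replaced
# with the local download path in the script arguments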
args = ["--data-dir", dataset.as_download(), "--epochs", 25]
# create job config
src = ScriptRunConfig(
    source_directory=source_dir,
    script=script_name,
    arguments=args,
    compute_target=compute_name,
    environment=env,
    distributed_job_config=distr_config,
)

###############################
# Option 2 - per-node launch
###############################
# If you would instead like to use the PyTorch-provided launch utility `torch.distributed.launch` to
# handle launching the worker processes on each node, you can do so as well. Create a
# `PyTorchConfiguration` and specify the `node_count`. You do not need to specify `process_count`;
# by default Azure ML will launch one process per node to run the `command` you provide.
#
# Provide the launch command to the `command` parameter of ScriptRunConfig. For PyTorch jobs, Azure ML
# will set the MASTER_ADDR, MASTER_PORT, and NODE_RANK environment variables on each node, so you can
# simply reference those environment variables in your command.
#
# Uncomment the code below to configure a job with this method.
"""
# create distributed config
distr_config = PyTorchConfiguration(node_count=2)
# define command
launch_cmd = ["python -m torch.distributed.launch --nproc_per_node 2 --nnodes 2 " \
"--node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT --use_env " \
"train.py --data-dir", dataset.as_download(), "--epochs 25"]
# create job config
src = ScriptRunConfig(
source_directory=source_dir,
command=launch_cmd,
compute_target=compute_name,
environment=env,
distributed_job_config=distr_config,
)
"""
# submit job
run = Experiment(ws, experiment_name).submit(src)
run.wait_for_completion(show_output=True)