Add EDL use tutorial. (#67)
gongweibao authored Apr 8, 2020
1 parent ad07ac4 commit ffd1fae
Showing 29 changed files with 142 additions and 44 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -4,3 +4,6 @@ vendor/
.vscode/
*.pyc
build/
*.log
resnet50_pod/
.*.swp
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -4,7 +4,7 @@ set(EDL_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(EDL_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
SET(EDL_INSTALL_DIR ${CMAKE_BINARY_DIR}/output)
SET(CMAKE_INSTALL_RPATH "$ORIGIN" "${CMAKE_INSTALL_RPATH}")
project(paddle-edl)
project(paddle_edl)

include(python)

1 change: 1 addition & 0 deletions README.md
@@ -38,6 +38,7 @@ of EDL on this cluster is
- [Fault-Tolerant Training in PaddlePaddle](./doc/fault_tolerance.md).
- [Elastic Deep Learning Design Doc:compute engine](./doc/edl_collective_design_doc.md).
- [Elastic Deep Learning Design Doc:Scheduler](./doc/edl_design_doc.md).
- [Run Elastic Deep Learning Demo on a single node](./doc/collective_demo.md).

## FAQ

38 changes: 38 additions & 0 deletions doc/collective_demo.md
@@ -0,0 +1,38 @@
# Introduction
This demo is for EDL developers: you can test Paddle EDL functionality without a Kubernetes cluster, and it is simple to run on a single node or on multiple nodes.
Of course, this is also a toy. You can play with it!
Have fun!

# Install
1. Install EDL from source

```
git clone https://github.com/PaddlePaddle/edl
cd edl
mkdir build && cd build
cmake ..
make  # builds the wheel under ./python/dist
pip install ./python/dist/paddle_edl-0.0.0-py2-none-any.whl
```

2. Or install the released package directly with `pip install paddle_edl`.
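
Either way, a quick sanity check that the package is importable (a minimal check, not part of the tutorial itself):

```
python -c "import paddle_edl; print(paddle_edl.__file__)"
```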

# Run the demo on a single node
1. Start a Jobserver on one node.

```
git clone https://github.com/PaddlePaddle/edl
cd edl/python/paddle_edl/demo/collective
./start_job_server.sh
```

2. Start a Jobclient on every node. The Jobclient controls the pod processes.

```
# Set the ImageNet data path
export PADDLE_EDL_IMAGENET_PATH=<your path>
# Set the checkpoint path
export PADDLE_EDL_FLEET_CHECKPOINT_PATH=<your path>
mkdir -p resnet50_pod
./start_job_client.sh
```
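
Both scripts run in the background; assuming they redirect output to job_server.log and job_client.log as the scripts in this commit do, you can follow progress with:

```
tail -f job_server.log job_client.log
```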
37 changes: 37 additions & 0 deletions doc/collective_demo_cn.md
@@ -0,0 +1,37 @@
# Introduction
Setting up EDL on a single node or on multiple nodes (physical machines, virtual machines, Docker containers, and the like) is mainly intended for developers: even without a cluster, you can test EDL against Paddle (the compute engine) in simulation.
It is also fun in its own right: watching the training processes come and go without affecting the final result is quite entertaining.
Have fun!

# Install EDL
1. You can build and install from source:

```
git clone https://github.com/PaddlePaddle/edl
cd edl
mkdir build && cd build
cmake ..
make  # builds the wheel under ./python/dist
pip install ./python/dist/paddle_edl-0.0.0-py2-none-any.whl
```

2. Or install the released version directly with `pip install paddle_edl`.

# Demo setup steps: a single node as an example
1. Start the JobServer demo on one node; it records the Pod information of the training job.

```
git clone https://github.com/PaddlePaddle/edl
cd edl/python/paddle_edl/demo/collective
./start_job_server.sh
```
2. Start a JobClient demo on each node; it manages the training Pod processes.

```
# Set the ImageNet data path
export PADDLE_EDL_IMAGENET_PATH=<your path>
# Set the directory where checkpoints are saved
export PADDLE_EDL_FLEET_CHECKPOINT_PATH=<your path>
mkdir -p resnet50_pod
./start_job_client.sh
```
4 changes: 2 additions & 2 deletions example/collective/resnet50/train_pretrain.sh
@@ -1,7 +1,7 @@
#!/bin/bash
export FLAGS_sync_nccl_allreduce=1
export FLAGS_cudnn_exhaustive_search=1
export FLAGS_conv_workspace_size_limit=4000 #MB
#export FLAGS_conv_workspace_size_limit=4000 #MB
export FLAGS_cudnn_batchnorm_spatial_persistent=1

export GLOG_v=1
@@ -18,7 +18,7 @@ if [[ ${use_dali} == "True" ]]; then
export FLAGS_fraction_of_gpu_memory_to_use=0.8
fi

python -m paddle-edl.launch ${distributed_args} \
python -m paddle_edl.collective.launch ${distributed_args} \
--log_dir log \
--log_level 20 \
./train_with_fleet.py \
6 changes: 3 additions & 3 deletions example/collective/resnet50/train_with_fleet.py
@@ -87,13 +87,13 @@
add_arg('do_test', bool, False, "Whether do test every epoch.")
add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
add_arg('fuse', bool, False, "Whether to use tensor fusion.")
add_arg('fuse_elewise_add_act_ops', bool, True, "Whether to use elementwise_act fusion.")
add_arg('fuse_bn_act_ops', bool, True, "Whether to use bn_act fusion.")
add_arg('fuse_elewise_add_act_ops', bool, False, "Whether to use elementwise_act fusion.")
add_arg('fuse_bn_act_ops', bool, False, "Whether to use bn_act fusion.")
add_arg('nccl_comm_num', int, 1, "nccl comm num")
add_arg("use_hierarchical_allreduce", bool, False, "Use hierarchical allreduce or not.")
add_arg('num_threads', int, 1, "Use num_threads to run the fluid program.")
add_arg('num_iteration_per_drop_scope', int, 100, "Ihe iteration intervals to clean up temporary variables.")
add_arg('benchmark_test', bool, True, "Whether to use print benchmark logs or not.")
add_arg('benchmark_test', bool, False, "Whether to use print benchmark logs or not.")

add_arg('use_dgc', bool, False, "Whether use DGCMomentum Optimizer or not")
add_arg('rampup_begin_step', int, 5008, "The beginning step from which dgc is implemented.")
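
These switches are plain command-line flags; a hypothetical single-node invocation toggling them (assuming the EDL environment variables such as PADDLE_JOBSERVER, PADDLE_JOB_ID, and PADDLE_POD_ID are exported as in the test script later in this commit; paths and values are placeholders):

```
python -m paddle_edl.collective.launch --log_dir log \
    ./train_with_fleet.py \
    --use_gpu True \
    --fuse_bn_act_ops False \
    --benchmark_test False
```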
4 changes: 2 additions & 2 deletions python/CMakeLists.txt
@@ -2,8 +2,8 @@ file(GLOB_RECURSE EDL_FILES collective/*.py demo/*.py demo/*.sh setup.py)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
add_custom_command(
OUTPUT ${EDL_BINARY_DIR}/.timestamp
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/edl ${EDL_BINARY_DIR}/python/
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_edl ${EDL_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} ./setup.py bdist_wheel
DEPENDS ${EDL_FILES})
add_custom_target(edl_python ALL DEPENDS ${EDL_BINARY_DIR}/.timestamp)
add_subdirectory(edl/tests/unittests)
add_subdirectory(paddle_edl/tests/unittests)
2 changes: 0 additions & 2 deletions python/edl/tests/unittests/start_edl_demo.sh

This file was deleted.

File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -16,7 +16,8 @@
import requests
import time
import sys
from utils import Cluster, Pod, Trainer, logger
from utils import Cluster, Pod, Trainer, logger, Hdfs
from http_store import kv_server


class Edlenv(object):
@@ -173,7 +174,7 @@ def edl_barrier(edl_env, hdfs, timeout=-1):
if pod.rank == 0 and not kv_server.is_alive():
kv_server.start("0.0.0.0", pod.port)

ret = edl_utils.barrier(cluster=cluster, pod=pod)
ret = barrier(cluster=cluster, pod=pod)
if ret:
break

@@ -113,7 +113,7 @@ def start(self, host, port):
current_env["PYTHONUNBUFFERED"] = "1"

self._cmd = [
sys.executable, "-m", "paddle.distributed.http_store",
sys.executable, "-m", "paddle_edl.collective.http_store",
"--host={}".format(host), "--port={}".format(port)
]
logger.info("start http store:{}".format(self._cmd))
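
For debugging, the subprocess command above corresponds to this standalone invocation (the module path and flags come from this commit; the host and port values are illustrative):

```
python -m paddle_edl.collective.http_store --host=0.0.0.0 --port=8180
```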
@@ -48,7 +48,7 @@
from contextlib import closing
import socket

from paddle.distributed.utils import *
from utils import *
import edl_utils
from http_store import kv_server

@@ -124,6 +124,7 @@ def _parse_args():
parser.add_argument(
"--log_dir",
type=str,
default=None,
help="The path for each process's log.If it's not set, the log will printed to default pipe."
)

@@ -169,10 +170,15 @@ def launch(args):
assert edl_env.is_under_edl(), "edl launch must run under edl env"

hdfs = edl_utils.get_hdfs_from_args(args)
cluster, pod = edl_barrier(edl_env, hdfs, timeout=15 * 60)
cluster, pod = edl_utils.edl_barrier(edl_env, hdfs, timeout=15 * 60)
logger.info("get cluster from edl:{}".format(cluster))

procs = start_local_trainers(cluster, pod)
procs = start_local_trainers(
cluster,
pod,
args.training_script,
args.training_script_args,
log_dir=args.log_dir)

while True:
cluster2, pod = edl_env.get_cluster(hdfs)
@@ -182,9 +188,15 @@
format(cluster2, cluster))
terminate_local_procs(procs)

cluster, pod = edl_barrier(edl_env, hdfs, timeout=30 * 60)
cluster, pod = edl_utils.edl_barrier(
edl_env, hdfs, timeout=30 * 60)

procs = start_local_trainers(cluster, pod)
procs = start_local_trainers(
cluster,
pod,
args.training_script,
args.training_script_args,
log_dir=args.log_dir)

alive = watch_local_trainers(procs, cluster.trainers_nranks())

@@ -194,7 +206,7 @@

time.sleep(3)

edl_barrier(edl_env, hdfs)
edl_utils.edl_barrier(edl_env, hdfs)


if __name__ == "__main__":
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -14,8 +14,8 @@

import os
import sys
from edl.collective.edl_utils import Edlenv
from edl.collective.utils import logger, get_logger, terminate_local_procs, get_host_name_ip
from paddle_edl.collective.edl_utils import Edlenv
from paddle_edl.collective.utils import logger, get_logger, terminate_local_procs, get_host_name_ip
from argparse import ArgumentParser, REMAINDER
import six
import copy
@@ -9,7 +9,7 @@
import argparse
import copy
import functools
import edl.collective.utils as utils
import paddle_edl.collective.utils as utils

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(utils.add_arguments, argparser=parser)
@@ -22,7 +22,7 @@
add_arg('del_pods_one_step', int, 1, "")
add_arg('add_pods_one_step', int, 1, "")
add_arg('time_interval_to_change', int, 900, "")
add_arg('server_port', int, 6070, "")
add_arg('server_port', int, 8180, "")

random.seed(10)
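
Read together with start_job_server.sh, a minimal standalone run of this job-server demo might look like the following (the module path and flag names come from this commit; the IP and counts are illustrative):

```
python -m paddle_edl.demo.collective.job_server_demo \
    --node_ips 127.0.0.1 \
    --pod_num_of_node 2 \
    --gpu_num_of_node 2
```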

@@ -15,7 +15,7 @@ while true ; do
done


src_dir=../../../example/collective/resnet50
src_dir=../../../../example/collective/resnet50
dst_dir=resnet50_pod/${pod_id}

echo "mkdir resnet50_pod/${pod_id}"
@@ -29,10 +29,9 @@ cp -r ${src_dir}/models ${dst_dir}/models
cp -r ${src_dir}/scripts ${dst_dir}/scripts

if [[ ! -d "${dst_dir}/ImageNet" ]]; then
ln -s /root/go/dataset/ImageNet/ ${dst_dir}/
ln -s ${PADDLE_EDL_IMAGENET_PATH} ${dst_dir}/
fi


if [[ ! -d "${dst_dir}/fleet_checkpoints" ]]; then
ln -s /root/go/checkpoints/resnet50/fleet_checkpoints ${dst_dir}/fleet_checkpoints
ln -s ${PADDLE_EDL_FLEET_CHECKPOINT_PATH} ${dst_dir}/fleet_checkpoints
fi
@@ -10,6 +10,6 @@ echo "${BASEDIR}"

nohup python -u ${BASEDIR}/job_server_demo.py \
--node_ips ${node_ips} \
--pod_num_of_node 2 \
--pod_num_of_node 8 \
--time_interval_to_change 900 \
--gpu_num_of_node 8 > job_server.log 2>&1 &
File renamed without changes.
@@ -13,4 +13,4 @@
# limitations under the License.
import os
pod_id = os.getenv("PADDLE_POD_ID", "")
print(pod_id)
print(pod_id + "__edl_demo__")
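
The appended suffix gives the test script later in this commit a unique string to match in the pod log (see `str="pod_0_0__edl_demo__"` below).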
2 changes: 2 additions & 0 deletions python/paddle_edl/tests/unittests/start_edl_demo.sh
@@ -0,0 +1,2 @@
#!/bin/bash
python -m paddle_edl.collective.launch edl_demo.py
@@ -1,5 +1,5 @@
#!/bin/bash
set -e
#set -x

echo "python_path:${PYTHONPATH}"
unset http_proxy https_proxy
@@ -8,7 +8,9 @@ unset http_proxy https_proxy
BASEDIR=$(dirname $(readlink -f $0))
echo "${BASEDIR}"

nohup python -m edl.demo.collective.job_server_demo --pod_num_of_node 2 \
rm -rf job_server.log job_client.log ./edl_demo_log

nohup python -m paddle_edl.demo.collective.job_server_demo --pod_num_of_node 2 \
--time_interval_to_change 900 \
--gpu_num_of_node 2 \
--pod_num_of_node 2 \
@@ -24,7 +26,7 @@ export PADDLE_JOBSERVER="http://127.0.0.1:8180"
export PADDLE_JOB_ID="test_job_id_1234"
export PADDLE_POD_ID="not set"

nohup python -m edl.demo.collective.job_client_demo \
nohup python -m paddle_edl.demo.collective.job_client_demo \
--log_level 20 \
--log_dir ./edl_demo_log \
./start_edl_demo.sh > job_client.log 2>&1 &
@@ -34,7 +36,7 @@ echo "launcher_pid:${job_client_pid}"
sleep 30s

echo "test request and response"
str="pod_0_0"
str="pod_0_0__edl_demo__"
file=./edl_demo_log/pod_pod_0_0.log

kill ${server_pid} ${job_client_pid}
@@ -47,5 +49,9 @@ else
cat job_server.log
echo "job_client.log"
cat job_client.log
echo "pod pod 0"
cat edl_demo_log/pod_pod_0_0.log
echo "pod pod 1"
cat edl_demo_log/pod_pod_1_0.log
exit -1
fi