From ff0d3a205728164eb944a5fa8ead20b5cd8d74ab Mon Sep 17 00:00:00 2001 From: chengduozh Date: Sun, 5 May 2019 10:17:55 +0800 Subject: [PATCH 1/6] use multi process run rcnn --- Mask-RCNN/paddle/rcnn/dist_util.py | 30 +++++++++++++++++++ Mask-RCNN/paddle/rcnn/run_multi_process.sh | 24 +++++++++++++++ Mask-RCNN/paddle/rcnn/train.py | 34 +++++++++++++++++----- 3 files changed, 80 insertions(+), 8 deletions(-) create mode 100644 Mask-RCNN/paddle/rcnn/dist_util.py create mode 100644 Mask-RCNN/paddle/rcnn/run_multi_process.sh diff --git a/Mask-RCNN/paddle/rcnn/dist_util.py b/Mask-RCNN/paddle/rcnn/dist_util.py new file mode 100644 index 0000000000..447c88cb92 --- /dev/null +++ b/Mask-RCNN/paddle/rcnn/dist_util.py @@ -0,0 +1,30 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import paddle.fluid as fluid
+
+
+def nccl2_prepare(trainer_id, startup_prog, main_prog):
+    config = fluid.DistributeTranspilerConfig()
+    config.mode = "nccl2"
+    t = fluid.DistributeTranspiler(config=config)
+    t.transpile(trainer_id,
+                trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'),
+                current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'),
+                startup_program=startup_prog,
+                program=main_prog)
diff --git a/Mask-RCNN/paddle/rcnn/run_multi_process.sh b/Mask-RCNN/paddle/rcnn/run_multi_process.sh
new file mode 100644
index 0000000000..e2247a7d8f
--- /dev/null
+++ b/Mask-RCNN/paddle/rcnn/run_multi_process.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+set -xe
+
+#export FLAGS_cudnn_deterministic=true
+#export FLAGS_enable_parallel_graph=1
+export FLAGS_eager_delete_tensor_gb=0.0
+export FLAGS_fraction_of_gpu_memory_to_use=0.98
+export FLAGS_memory_fraction_of_eager_deletion=1.0
+export FLAGS_conv_workspace_size_limit=1500
+
+base_batch_size=1
+
+export CUDA_VISIBLE_DEVICES=0,1
+
+device=${CUDA_VISIBLE_DEVICES//,/ }
+arr=($device)
+num_gpu_devices=${#arr[*]}
+
+python -m paddle.distributed.launch --gpus ${num_gpu_devices} train.py \
+    --model_save_dir=output/ \
+    --pretrained_model=../imagenet_resnet50_fusebn/ \
+    --data_dir=./dataset/coco \
+    --im_per_batch=${base_batch_size} \
+    --MASK_ON=True
\ No newline at end of file
diff --git a/Mask-RCNN/paddle/rcnn/train.py b/Mask-RCNN/paddle/rcnn/train.py
index 70f5a897a5..c9a4246c0b 100644
--- a/Mask-RCNN/paddle/rcnn/train.py
+++ b/Mask-RCNN/paddle/rcnn/train.py
@@ -30,7 +30,7 @@
 import models.resnet as resnet
 from learning_rate import exponential_with_warmup_decay
 from config import cfg
-
+import dist_utils
 
 def train():
     learning_rate = cfg.learning_rate
@@ -82,16 +82,14 @@ def train():
             var.persistable = True
 
     #fluid.memory_optimize(fluid.default_main_program(), skip_opt_set=set(fetch_list))
-
-    place = 
fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace() + gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) + place = fluid.CUDAPlace(gpu_id) if cfg.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) + if not cfg.parallel: exe.run(fluid.default_startup_program()) if cfg.pretrained_model: - def if_exist(var): return os.path.exists(os.path.join(cfg.pretrained_model, var.name)) - fluid.io.load_vars(exe, cfg.pretrained_model, predicate=if_exist) if cfg.parallel: @@ -99,10 +97,29 @@ def if_exist(var): build_strategy.memory_optimize = False build_strategy.enable_inplace = False + trainer_id = int(os.environ.get('PADDLE_TRAINER_ID', 0)) + num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) + print("PADDLE_TRAINERS_NUM", num_trainers) + print("PADDLE_TRAINER_ID", trainer_id) + build_strategy.num_trainers = num_trainers + build_strategy.trainer_id = trainer_id + # NOTE(zcd): use multi processes to train the model, + # and each process use one GPU card. 
+ if num_trainers > 1: + dist_utils.nccl2_prepare(trainer_id, + fluid.default_startup_program(), + main_prog=fluid.default_main_program()) + + exe.run(fluid.default_startup_program()) + exec_strategy = fluid.ExecutionStrategy() exec_strategy.use_experimental_executor = True - train_exe = fluid.ParallelExecutor( - use_cuda=bool(cfg.use_gpu), loss_name=loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) + train_exe = fluid.ParallelExecutor(use_cuda=bool(cfg.use_gpu), + loss_name=loss.name, + build_strategy=build_strategy, + exec_strategy=exec_strategy, + num_trainers=num_trainers, + trainer_id=trainer_id) else: train_exe = exe @@ -208,3 +225,4 @@ def train_loop(): args = parse_args() print_arguments(args) train() + From f5f9396eaee7b75f6e6b0d201aad058d0c9356a0 Mon Sep 17 00:00:00 2001 From: chengduozh Date: Mon, 6 May 2019 15:23:15 +0800 Subject: [PATCH 2/6] polish code --- Mask-RCNN/paddle/rcnn/{dist_util.py => dist_utils.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Mask-RCNN/paddle/rcnn/{dist_util.py => dist_utils.py} (100%) diff --git a/Mask-RCNN/paddle/rcnn/dist_util.py b/Mask-RCNN/paddle/rcnn/dist_utils.py similarity index 100% rename from Mask-RCNN/paddle/rcnn/dist_util.py rename to Mask-RCNN/paddle/rcnn/dist_utils.py From 74e7ff497434856e5902682579ffe95cf8e210ec Mon Sep 17 00:00:00 2001 From: chengduozh Date: Wed, 15 May 2019 20:11:00 +0800 Subject: [PATCH 3/6] update --- Mask-RCNN/paddle/rcnn/train.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/Mask-RCNN/paddle/rcnn/train.py b/Mask-RCNN/paddle/rcnn/train.py index c9a4246c0b..420e141b14 100644 --- a/Mask-RCNN/paddle/rcnn/train.py +++ b/Mask-RCNN/paddle/rcnn/train.py @@ -32,6 +32,18 @@ from config import cfg import dist_utils +def get_device_num(): + visible_device = os.getenv('CUDA_VISIBLE_DEVICES') + # NOTE(zcd): use multi processes to train the model, + # and each process use one GPU card. 
+ num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) + if num_trainers > 1 : return 1 + if visible_device: + device_num = len(visible_device.split(',')) + else: + device_num = subprocess.check_output(['nvidia-smi','-L']).decode().count('\n') + return device_num + def train(): learning_rate = cfg.learning_rate image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size] @@ -43,8 +55,7 @@ def train(): random.seed(0) np.random.seed(0) - devices = os.getenv("CUDA_VISIBLE_DEVICES") or "" - devices_num = len(devices.split(",")) + devices_num = get_device_num() total_batch_size = devices_num * cfg.TRAIN.im_per_batch use_random = True From be48eecb40dd2b3bcfc92846e97922d396c8298c Mon Sep 17 00:00:00 2001 From: chengduozh Date: Fri, 17 May 2019 19:35:12 +0800 Subject: [PATCH 4/6] update Mask-RCNN paddle rcnn train.py --- Mask-RCNN/paddle/rcnn/dist_utils.py | 33 +++++++++++++++++++++-------- Mask-RCNN/paddle/rcnn/train.py | 18 ++++++---------- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/Mask-RCNN/paddle/rcnn/dist_utils.py b/Mask-RCNN/paddle/rcnn/dist_utils.py index 447c88cb92..94ceb05e20 100644 --- a/Mask-RCNN/paddle/rcnn/dist_utils.py +++ b/Mask-RCNN/paddle/rcnn/dist_utils.py @@ -18,13 +18,28 @@ import os import paddle.fluid as fluid - def nccl2_prepare(trainer_id, startup_prog, main_prog): - config = fluid.DistributeTranspilerConfig() - config.mode = "nccl2" - t = fluid.DistributeTranspiler(config=config) - t.transpile(trainer_id, - trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'), - current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'), - startup_program=startup_prog, - program=main_prog) + config = fluid.DistributeTranspilerConfig() + config.mode = "nccl2" + t = fluid.DistributeTranspiler(config=config) + t.transpile(trainer_id, + trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'), + current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'), + startup_program=startup_prog, + program=main_prog) + +def prepare_for_multi_process(exe, 
build_strategy, train_prog, startup_prog): + # prepare for multi-process + trainer_id = int(os.environ.get('PADDLE_TRAINER_ID', 0)) + num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) + print("PADDLE_TRAINERS_NUM", num_trainers) + print("PADDLE_TRAINER_ID", trainer_id) + build_strategy.num_trainers = num_trainers + build_strategy.trainer_id = trainer_id + # NOTE(zcd): use multi processes to train the model, + # and each process use one GPU card. + if num_trainers > 1: + nccl2_prepare(trainer_id, + startup_prog, train_prog) + # the startup_prog are run two times, but it doesn't matter. + exe.run(startup_prog) \ No newline at end of file diff --git a/Mask-RCNN/paddle/rcnn/train.py b/Mask-RCNN/paddle/rcnn/train.py index 420e141b14..09fac1fd0b 100644 --- a/Mask-RCNN/paddle/rcnn/train.py +++ b/Mask-RCNN/paddle/rcnn/train.py @@ -108,20 +108,14 @@ def if_exist(var): build_strategy.memory_optimize = False build_strategy.enable_inplace = False + dist_utils.prepare_for_multi_process( + exe, + build_strategy, + fluid.default_main_program(), + fluid.default_startup_program()) + trainer_id = int(os.environ.get('PADDLE_TRAINER_ID', 0)) num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) - print("PADDLE_TRAINERS_NUM", num_trainers) - print("PADDLE_TRAINER_ID", trainer_id) - build_strategy.num_trainers = num_trainers - build_strategy.trainer_id = trainer_id - # NOTE(zcd): use multi processes to train the model, - # and each process use one GPU card. 
-        if num_trainers > 1:
-            dist_utils.nccl2_prepare(trainer_id,
-                                     fluid.default_startup_program(),
-                                     main_prog=fluid.default_main_program())
-
-        exe.run(fluid.default_startup_program())
 
         exec_strategy = fluid.ExecutionStrategy()
         exec_strategy.use_experimental_executor = True

From cb819cf1e19896d4167f3c5fec734b0c9ecb469d Mon Sep 17 00:00:00 2001
From: chengduozh
Date: Tue, 21 May 2019 16:57:36 +0800
Subject: [PATCH 5/6] update learning rate

---
 Mask-RCNN/paddle/rcnn/train.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Mask-RCNN/paddle/rcnn/train.py b/Mask-RCNN/paddle/rcnn/train.py
index 01e822d0bc..e87b82e65a 100644
--- a/Mask-RCNN/paddle/rcnn/train.py
+++ b/Mask-RCNN/paddle/rcnn/train.py
@@ -44,7 +44,12 @@ def get_device_num():
         device_num = subprocess.check_output(['nvidia-smi','-L']).decode().count('\n')
     return device_num
 
+def update_lr(args):
+    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
+    args.learning_rate = args.learning_rate / num_trainers
+
 def train():
+    update_lr(cfg)
     learning_rate = cfg.learning_rate
     image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]

From f14fe81b497b258c82d2557689534109ca9830c5 Mon Sep 17 00:00:00 2001
From: chengduozh
Date: Tue, 21 May 2019 17:37:37 +0800
Subject: [PATCH 6/6] fix learning rate

---
 Mask-RCNN/paddle/rcnn/train.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Mask-RCNN/paddle/rcnn/train.py b/Mask-RCNN/paddle/rcnn/train.py
index e87b82e65a..7740323b4b 100644
--- a/Mask-RCNN/paddle/rcnn/train.py
+++ b/Mask-RCNN/paddle/rcnn/train.py
@@ -47,6 +47,10 @@ def get_device_num():
 def update_lr(args):
     num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
     args.learning_rate = args.learning_rate / num_trainers
+    # TODO(zcd): The loss_cls or loss may be NaN, so we decrease the learning rate here.
+    # The reasons for this should be analyzed in depth.
+    if num_trainers > 1:
+        args.learning_rate = args.learning_rate / 10