From e417d542ea402f7f14dac00e13f70e8a578329fc Mon Sep 17 00:00:00 2001 From: YundongYe Date: Thu, 31 Jan 2019 16:21:01 +0800 Subject: [PATCH] [Example] Add an exmaple of horovod with rdma & intel mpi (#2112) --- .../DOCKER.md | 36 +++++ ....example.horovod-with-azure-rdma-intel-mpi | 129 ++++++++++++++++++ .../README.md | 75 ++++++++++ .../silent.cfg | 49 +++++++ 4 files changed, 289 insertions(+) create mode 100644 examples/horovod-with-azure-rdma-intel-mpi/DOCKER.md create mode 100644 examples/horovod-with-azure-rdma-intel-mpi/Dockerfile.example.horovod-with-azure-rdma-intel-mpi create mode 100644 examples/horovod-with-azure-rdma-intel-mpi/README.md create mode 100644 examples/horovod-with-azure-rdma-intel-mpi/silent.cfg diff --git a/examples/horovod-with-azure-rdma-intel-mpi/DOCKER.md b/examples/horovod-with-azure-rdma-intel-mpi/DOCKER.md new file mode 100644 index 0000000000..54f2ea1501 --- /dev/null +++ b/examples/horovod-with-azure-rdma-intel-mpi/DOCKER.md @@ -0,0 +1,36 @@ +## Basic environment + +First of all, PAI runs all jobs in Docker containers. + +1: Install Docker CE + +2: Create a Docker Hub account to store your image. + +# horovod-with-azure-rdma-intel-mpi on PAI docker env + +## Contents + + +We need to build a horovod-with-azure-rdma-intel-mpi image to run horovod with rdma&&mpi on OpenPAI@Azure, and this can be done with the following steps: + + +- Get a license for your Intel MPI, then set ```ACTIVATION_TYPE``` in [silent.cfg](./silent.cfg) accordingly. + +- Write a horovod-with-azure-rdma-intel-mpi Dockerfile and save it to `Dockerfile.example.horovod-with-azure-rdma-intel-mpi`: + + - You could refer to this [Dockerfile](./Dockerfile.example.horovod-with-azure-rdma-intel-mpi) + - If your Intel MPI is activated by a license file, you should copy it into the Docker image when building it. + - You should keep the image in a private registry, because the license is built into the image. 
+ +- Build the Docker image from `Dockerfile.example.horovod-with-azure-rdma-intel-mpi`: + +```bash +$ sudo docker build -f Dockerfile.example.horovod-with-azure-rdma-intel-mpi -t USER/pai.example.horovod-with-azure-rdma-intel-mpi . +``` + +- Push the Docker image to a Docker registry: + +```bash +$ sudo docker push USER/pai.example.horovod-with-azure-rdma-intel-mpi +``` +Note: Replace USER with your registered Docker Hub username; you will be required to log in before pushing the Docker image. diff --git a/examples/horovod-with-azure-rdma-intel-mpi/Dockerfile.example.horovod-with-azure-rdma-intel-mpi b/examples/horovod-with-azure-rdma-intel-mpi/Dockerfile.example.horovod-with-azure-rdma-intel-mpi new file mode 100644 index 0000000000..7f37cdcd88 --- /dev/null +++ b/examples/horovod-with-azure-rdma-intel-mpi/Dockerfile.example.horovod-with-azure-rdma-intel-mpi @@ -0,0 +1,129 @@ +# Tag: nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 +# Label: com.nvidia.cuda.version: 9.0.176 +# Label: com.nvidia.cudnn.version: 7.1.2.21 +# Label: com.nvidia.volumes.needed: nvidia_driver +# Label: maintainer: NVIDIA CORPORATION +# Ubuntu 16.04 +FROM nvidia/cuda@sha256:40db1c98b66e133f54197ba1a66312b9c29842635c8cba5ae66fb56ded695b7c + +ENV TENSORFLOW_VERSION=1.12.0 +ENV HADOOP_VERSION=2.7.2 +LABEL HADOOP_VERSION=2.7.2 + +RUN apt-get -y update && \ + apt-get -y install \ + nano \ + vim \ + joe \ + wget \ + curl \ + jq \ + gawk \ + psmisc \ + python \ + python-dev \ + python-pip \ + python3 \ + python3-dev \ + python3-pip \ + python-six \ + build-essential \ + automake \ + openjdk-8-jdk \ + lsof \ + libcupti-dev \ + # SSH library is necessary for mpi workload. 
+ openssh-server \ + openssh-client \ + build-essential \ + autotools-dev \ + cmake \ + git \ + bash-completion \ + ca-certificates \ + inotify-tools \ + rsync \ + realpath \ + libjpeg-dev \ + libpng-dev \ + net-tools \ + libsm6 \ + libxext6 \ + rpm \ + #For Azure RDMA and intel MPI installation + cpio \ + net-tools \ + libdapl2 \ + dapl2-utils \ + libmlx4-1 \ + libmlx5-1 \ + ibutils \ + librdmacm1 \ + libibverbs1 \ + libmthca1 \ + ibverbs-utils \ + rdmacm-utils \ + perftest \ + kmod + + +# Prepare Hadoop 2.7.2 (fetched over HTTPS and unpacked straight into /usr/local) +RUN wget -qO- https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \ + tar xz -C /usr/local && \ + mv /usr/local/hadoop-${HADOOP_VERSION} /usr/local/hadoop + +ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ + HADOOP_INSTALL=/usr/local/hadoop \ + NVIDIA_VISIBLE_DEVICES=all + +ENV HADOOP_PREFIX=${HADOOP_INSTALL} \ + HADOOP_BIN_DIR=${HADOOP_INSTALL}/bin \ + HADOOP_SBIN_DIR=${HADOOP_INSTALL}/sbin \ + HADOOP_HDFS_HOME=${HADOOP_INSTALL} \ + HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_INSTALL}/lib/native \ + HADOOP_OPTS="-Djava.library.path=${HADOOP_INSTALL}/lib/native" + +ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${HADOOP_BIN_DIR}:${HADOOP_SBIN_DIR} \ + LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib/stubs:${JAVA_HOME}/jre/lib/amd64/server + + +# Install NCCL v2.3.7, for CUDA 9.0 (refresh the package index after registering the NVIDIA repo; -y keeps apt-get non-interactive during the build) +RUN wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64/nvidia-machine-learning-repo-ubuntu1604_1.0.0-1_amd64.deb && \ + dpkg -i nvidia-machine-learning-repo-ubuntu1604_1.0.0-1_amd64.deb && \ + apt-get update && apt-get install -y libnccl2=2.3.7-1+cuda9.0 libnccl-dev=2.3.7-1+cuda9.0 + + +# Install intel MPI with the version which azure suggests. 
+COPY silent.cfg /silent.cfg +ENV MANPATH=/usr/share/man:/usr/local/man \ + COMPILERVARS_ARCHITECTURE=intel64 \ + COMPILERVARS_PLATFORM=linux \ + INTEL_MPI_PATH=/opt/intel/compilers_and_libraries/linux/mpi + +# Install Intel MPI in the Docker Image. +# You should prepare your own Intel MPI license to activate your Intel MPI, and modify the file silent.cfg to set the configuration of activation type. +RUN wget http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/9278/l_mpi_p_5.1.3.223.tgz && \ + tar -xvf /l_mpi_p_5.1.3.223.tgz && \ + cd /l_mpi_p_5.1.3.223 && \ + ./install.sh -s /silent.cfg && \ + . /opt/intel/bin/compilervars.sh && \ + . /opt/intel/compilers_and_libraries/linux/mpi/bin64/mpivars.sh && \ + echo "source /opt/intel/compilers_and_libraries/linux/mpi/bin64/mpivars.sh" >> /root/.bashrc && \ + echo LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:'$LD_LIBRARY_PATH' >> /root/.bashrc + +ENV PATH=$PATH:/opt/intel/compilers_and_libraries/linux/mpi/bin64 + +# Install TensorFlow +RUN pip3 install --no-cache-dir tensorflow-gpu==${TENSORFLOW_VERSION} h5py && \ + pip install --no-cache-dir tensorflow-gpu==${TENSORFLOW_VERSION} h5py + +# Install Dependencies +RUN pip3 install --no-cache-dir scipy jupyter ipykernel numpy toolz pandas scikit-learn pillow && \ + pip install --no-cache-dir scipy numpy toolz pandas scikit-learn pillow + +# Install Horovod, temporarily using CUDA stubs; mpivars.sh is dot-sourced in this RUN's own shell (a `bash -c "source ..."` child shell would discard the environment) so the Intel MPI settings apply to the pip builds below +RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \ + . /opt/intel/compilers_and_libraries/linux/mpi/bin64/mpivars.sh && \ + pip3 install --no-cache-dir horovod==0.15.2 && \ + pip install --no-cache-dir horovod==0.15.2 && \ + ldconfig \ No newline at end of file diff --git a/examples/horovod-with-azure-rdma-intel-mpi/README.md b/examples/horovod-with-azure-rdma-intel-mpi/README.md new file mode 100644 index 0000000000..a097f0d391 --- /dev/null +++ b/examples/horovod-with-azure-rdma-intel-mpi/README.md @@ -0,0 +1,75 @@ + + +## horovod in OpenPAI based on Azure's RDMA Capable 
machines. + +#### Prerequisites + +Please contact your cluster admin to check whether Azure RDMA is enabled in your cluster. + +#### Prepare an image with the necessary libraries. + +To run horovod-with-azure-rdma-intel-mpi in OpenPAI, you need to prepare a job configuration file and submit it through webportal. + +Users should prepare their own Intel MPI licences to build the docker image. When building the docker image, please refer to [DOCKER.md](./DOCKER.md) + +###### +```json +{ + "jobName": "horovod-job", + "image": "your/images/url", + "virtualCluster": "default", + "taskRoles": [ + { + "name": "master", + "taskNumber": 1, + "cpuNumber": 2, + "memoryMB": 10240, + "shmMB": 256, + "gpuNumber": 1, + "minFailedTaskCount": 1, + "minSucceededTaskCount": 1, + "command": "sleep 1000 && Your code!" + }, + { + "name": "worker", + "taskNumber": 1, + "cpuNumber": 2, + "memoryMB": 10240, + "shmMB": 256, + "gpuNumber": 1, + "minFailedTaskCount": 1, + "minSucceededTaskCount": 1, + "command": "sleep infinity" + } + ], + "jobEnvs": { + "paiAzRDMA": true + } +} +``` + +- For more details on how to write a job configuration file, please refer to [job tutorial](../../docs/job_tutorial.md#json-config-file-for-job-submission). +- Sleep 1000 in ```master-0``` is a hack to ensure that all worker containers are ready. You could optimize it in a better way. For example, try to ssh to all the worker containers with the hostname ```${taskname}-${taskid}``` until successful. +- If you want an AZ-RDMA capable container, the following parameter is necessary. 
+```json +"jobEnvs": { + "paiAzRDMA": true + } +``` \ No newline at end of file diff --git a/examples/horovod-with-azure-rdma-intel-mpi/silent.cfg b/examples/horovod-with-azure-rdma-intel-mpi/silent.cfg new file mode 100644 index 0000000000..a3133979c6 --- /dev/null +++ b/examples/horovod-with-azure-rdma-intel-mpi/silent.cfg @@ -0,0 +1,49 @@ +# Patterns used to check silent configuration file +# +# anythingpat - any string +# filepat - the file location pattern (/file/location/to/license.lic) +# lspat - the license server address pattern (0123@hostname) +# snpat - the serial number pattern (ABCD-01234567) + +# Accept EULA, valid values are: {accept, decline} +ACCEPT_EULA=accept + +# Optional error behavior, valid values are: {yes, no} +CONTINUE_WITH_OPTIONAL_ERROR=yes + +# Install location, valid values are: {/opt/intel, filepat} +PSET_INSTALL_DIR=/opt/intel + +# Continue with overwrite of existing installation directory, valid values are: {yes, no} +CONTINUE_WITH_INSTALLDIR_OVERWRITE=yes + +# List of components to install, valid values are: {ALL, anythingpat} +COMPONENTS=ALL + +# Installation mode, valid values are: {install, modify, repair, uninstall} +PSET_MODE=install + +# Directory for non-RPM database, valid values are: {filepat} +#NONRPM_DB_DIR=filepat + +# Serial number, valid values are: {snpat} +#ACTIVATION_SERIAL_NUMBER=snpat + +# License file or license server, valid values are: {lspat, filepat} +#ACTIVATION_LICENSE_FILE= + +# Activation type, valid values are: {exist_lic, license_server, license_file, trial_lic, serial_number} +ACTIVATION_TYPE=trial_lic + +# Path to the cluster description file, valid values are: {filepat} +#CLUSTER_INSTALL_MACHINES_FILE=filepat + +# Intel(R) Software Improvement Program opt-in, valid values are: {yes, no} +PHONEHOME_SEND_USAGE_DATA=no + +# Perform validation of digital signatures of RPM files, valid values are: {yes, no} +SIGNING_ENABLED=yes + +# Select target architecture of your applications, valid values are: 
{IA32, INTEL64, ALL} +ARCH_SELECTED=INTEL64 +