[Example] Add an example of horovod with rdma & intel mpi (#2112)

microsoft · Jan 31, 2019 · e417d54 · e417d54
1 parent 1e6c84d
commit e417d54
Show file tree

Hide file tree

Showing 4 changed files with 289 additions and 0 deletions.
diff --git a/examples/horovod-with-azure-rdma-intel-mpi/DOCKER.md b/examples/horovod-with-azure-rdma-intel-mpi/DOCKER.md
@@ -0,0 +1,36 @@
+## Basic environment
+
+First of all, PAI runs all jobs in Docker containers.
+
+1: Install Docker CE
+
+2: Get a Docker Hub account to store your image.
+
+# horovod-with-azure-rdma-intel-mpi on PAI docker env
+
+## Contents
+
+
+We need to build a horovod-with-azure-rdma-intel-mpi image to run Horovod with RDMA and Intel MPI on OpenPAI@Azure. This can be done with the following steps:
+
+
+- Get a license for Intel MPI, and then modify the ```ACTIVATION_TYPE``` in [silent.cfg](./silent.cfg) to match it.
+
+- Write a horovod-with-azure-rdma-intel-mpi Dockerfile and save it to `Dockerfile.example.horovod-with-azure-rdma-intel-mpi`:
+
+    - You could refer to this [Dockerfile](./Dockerfile.example.horovod-with-azure-rdma-intel-mpi)
+    - If your Intel MPI is activated by a license file, you should copy that file into the Docker image when building it.
+    - You should keep the image in a private registry, because the license is built into the image.
+
+- Build the Docker image from `Dockerfile.example.horovod-with-azure-rdma-intel-mpi`:
+
+```bash
+$ sudo docker build -f Dockerfile.example.horovod-with-azure-rdma-intel-mpi -t USER/pai.example.horovod-with-azure-rdma-intel-mpi .
+```
+
+- Push the Docker image to a Docker registry:
+
+```bash
+$ sudo docker push USER/pai.example.horovod-with-azure-rdma-intel-mpi
+```
+Note: Replace USER with the Docker Hub username you registered. You will be required to log in before pushing the Docker image.
diff --git a/...es/horovod-with-azure-rdma-intel-mpi/Dockerfile.example.horovod-with-azure-rdma-intel-mpi b/...es/horovod-with-azure-rdma-intel-mpi/Dockerfile.example.horovod-with-azure-rdma-intel-mpi
@@ -0,0 +1,129 @@
+# Tag: nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+# Label: com.nvidia.cuda.version: 9.0.176
+# Label: com.nvidia.cudnn.version: 7.1.2.21
+# Label: com.nvidia.volumes.needed: nvidia_driver
+# Label: maintainer: NVIDIA CORPORATION <cudatools@nvidia.com>
+# Ubuntu 16.04
+# Base image, pinned by digest; the tag and labels it corresponds to are listed in the comments above.
+FROM nvidia/cuda@sha256:40db1c98b66e133f54197ba1a66312b9c29842635c8cba5ae66fb56ded695b7c
+
+# Versions of the frameworks installed further down in this file.
+ENV TENSORFLOW_VERSION=1.12.0 \
+    HADOOP_VERSION=2.7.2
+# The Hadoop version is also recorded as image metadata.
+LABEL HADOOP_VERSION="2.7.2"
+
+# System packages, one per line and alphabetised within each group.
+# The package set is unchanged; entries that were listed twice appear once.
+RUN apt-get update -y && \
+    apt-get install -y \
+      # General tooling, language runtimes and build dependencies.
+      # The SSH client/server are necessary for MPI workloads.
+      automake \
+      autotools-dev \
+      bash-completion \
+      build-essential \
+      ca-certificates \
+      cmake \
+      curl \
+      gawk \
+      git \
+      inotify-tools \
+      joe \
+      jq \
+      libcupti-dev \
+      libjpeg-dev \
+      libpng-dev \
+      libsm6 \
+      libxext6 \
+      lsof \
+      nano \
+      net-tools \
+      openjdk-8-jdk \
+      openssh-client \
+      openssh-server \
+      psmisc \
+      python \
+      python-dev \
+      python-pip \
+      python-six \
+      python3 \
+      python3-dev \
+      python3-pip \
+      realpath \
+      rpm \
+      rsync \
+      vim \
+      wget \
+      # Needed for Azure RDMA and the Intel MPI installation.
+      cpio \
+      dapl2-utils \
+      ibutils \
+      ibverbs-utils \
+      kmod \
+      libdapl2 \
+      libibverbs1 \
+      libmlx4-1 \
+      libmlx5-1 \
+      libmthca1 \
+      librdmacm1 \
+      perftest \
+      rdmacm-utils
+
+
+# Prepare Hadoop 2.7.2
+# Download the Hadoop release to a temporary file instead of streaming it into tar:
+# with a plain `wget | tar` pipe the default /bin/sh reports only tar's exit status,
+# so a failed download is never reported as such. HTTPS protects the archive in transit.
+# The archive is removed in the same RUN so it does not stay in the image layer.
+RUN wget -q -O /tmp/hadoop.tar.gz https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz && \
+    tar xzf /tmp/hadoop.tar.gz -C /usr/local && \
+    rm -f /tmp/hadoop.tar.gz && \
+    mv /usr/local/hadoop-${HADOOP_VERSION} /usr/local/hadoop
+
+# Runtime environment for Java, Hadoop and the NVIDIA container runtime.
+# The variables are split over three ENV instructions on purpose: each later
+# instruction references values (${HADOOP_INSTALL}, ${HADOOP_BIN_DIR}, ${JAVA_HOME}, ...)
+# defined by an earlier one, so the order of these instructions must be kept.
+ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \
+    HADOOP_INSTALL=/usr/local/hadoop \
+    NVIDIA_VISIBLE_DEVICES=all
+
+# Hadoop locations derived from HADOOP_INSTALL (the directory the Hadoop tarball is moved to above).
+ENV HADOOP_PREFIX=${HADOOP_INSTALL} \
+    HADOOP_BIN_DIR=${HADOOP_INSTALL}/bin \
+    HADOOP_SBIN_DIR=${HADOOP_INSTALL}/sbin \
+    HADOOP_HDFS_HOME=${HADOOP_INSTALL} \
+    HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_INSTALL}/lib/native \
+    HADOOP_OPTS="-Djava.library.path=${HADOOP_INSTALL}/lib/native"
+
+# PATH gains the NVIDIA/CUDA and Hadoop bin/sbin directories; LD_LIBRARY_PATH gains the
+# CUPTI, NVIDIA, CUDA (including the stubs directory) and JVM server library directories.
+ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${HADOOP_BIN_DIR}:${HADOOP_SBIN_DIR} \
+    LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib/stubs:${JAVA_HOME}/jre/lib/amd64/server
+
+
+# Install NCCL v2.3.7, for CUDA 9.0
+# Register NVIDIA's machine-learning apt repository, refresh the package index so the
+# newly added repository is visible, then install the pinned NCCL packages.
+# `-y` is required: without it apt aborts at any confirmation prompt because the build
+# has no interactive terminal. --allow-downgrades / --allow-change-held-packages let apt
+# switch to the pinned version in case the base image already ships (or holds) another
+# NCCL build. The repository .deb is deleted once it has been installed.
+RUN wget -q https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64/nvidia-machine-learning-repo-ubuntu1604_1.0.0-1_amd64.deb && \
+    dpkg -i nvidia-machine-learning-repo-ubuntu1604_1.0.0-1_amd64.deb && \
+    rm -f nvidia-machine-learning-repo-ubuntu1604_1.0.0-1_amd64.deb && \
+    apt-get -y update && \
+    apt-get -y install --allow-downgrades --allow-change-held-packages \
+      libnccl2=2.3.7-1+cuda9.0 \
+      libnccl-dev=2.3.7-1+cuda9.0
+
+
+# Install intel MPI with the version which azure suggests.
+# silent.cfg drives the unattended Intel MPI installer run below.
+COPY silent.cfg /silent.cfg
+ENV MANPATH=/usr/share/man:/usr/local/man \
+    COMPILERVARS_ARCHITECTURE=intel64 \
+    COMPILERVARS_PLATFORM=linux \
+    INTEL_MPI_PATH=/opt/intel/compilers_and_libraries/linux/mpi
+
+# Install Intel MPI in the Docker image.
+# You must provide your own Intel MPI license to activate Intel MPI, and edit silent.cfg to set the matching activation type.
+# The archive is downloaded to and unpacked under / explicitly (rather than relying on the
+# current directory), and both the archive and the unpacked installer are deleted at the
+# end of this same RUN so they do not remain in the image layer.
+RUN wget -O /l_mpi_p_5.1.3.223.tgz http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/9278/l_mpi_p_5.1.3.223.tgz && \
+    tar -xf /l_mpi_p_5.1.3.223.tgz -C / && \
+    cd /l_mpi_p_5.1.3.223 && \
+    ./install.sh -s /silent.cfg && \
+    . /opt/intel/bin/compilervars.sh && \
+    . /opt/intel/compilers_and_libraries/linux/mpi/bin64/mpivars.sh && \
+    # Make interactive root shells load the MPI environment, and record the build-time
+    # LD_LIBRARY_PATH in front of whatever value the shell already has.
+    echo "source /opt/intel/compilers_and_libraries/linux/mpi/bin64/mpivars.sh" >> /root/.bashrc && \
+    echo LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:'$LD_LIBRARY_PATH' >> /root/.bashrc && \
+    cd / && \
+    rm -rf /l_mpi_p_5.1.3.223 /l_mpi_p_5.1.3.223.tgz
+
+# Put the Intel MPI bin64 directory on PATH for every later build step and for the running container.
+ENV PATH=${PATH}:/opt/intel/compilers_and_libraries/linux/mpi/bin64
+
+# Install TensorFlow
+# Install the pinned TensorFlow GPU build (plus h5py) with both pip3 and pip.
+# --no-cache-dir keeps pip's download cache out of the image layer, matching the
+# other pip steps in this file.
+RUN pip3 install --no-cache-dir tensorflow-gpu==${TENSORFLOW_VERSION} h5py && \
+    pip install --no-cache-dir tensorflow-gpu==${TENSORFLOW_VERSION} h5py
+
+# Install Dependencies
+# Common scientific/data packages, installed with both pip3 and pip.
+# jupyter and ipykernel are installed through pip3 only.
+RUN pip3 install --no-cache-dir \
+      scipy \
+      jupyter \
+      ipykernel \
+      numpy \
+      toolz \
+      pandas \
+      scikit-learn \
+      pillow \
+    && pip install --no-cache-dir \
+      scipy \
+      numpy \
+      toolz \
+      pandas \
+      scikit-learn \
+      pillow
+
+# Install Horovod, temporarily using CUDA stubs
+# The CUDA stub libraries are registered with ldconfig only while Horovod is being built;
+# the trailing `ldconfig` rebuilds the linker cache without them.
+# mpivars.sh is sourced with `.` in this RUN's own shell. Running it through
+# `/bin/bash -c "source ..."` would only set the variables in a child process that exits
+# immediately, so the Intel MPI environment would never reach the pip commands.
+RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
+    . /opt/intel/compilers_and_libraries/linux/mpi/intel64/bin/mpivars.sh && \
+    pip3 install --no-cache-dir horovod==0.15.2 && \
+    pip install --no-cache-dir horovod==0.15.2 && \
+    ldconfig
diff --git a/examples/horovod-with-azure-rdma-intel-mpi/README.md b/examples/horovod-with-azure-rdma-intel-mpi/README.md
@@ -0,0 +1,75 @@
+<!--
+  Copyright (c) Microsoft Corporation
+  All rights reserved.
+
+  MIT License
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+  documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+  the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+  to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+  The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+  BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+  DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+-->
+
+## horovod in OpenPAI based on Azure's RDMA Capable machines.
+
+#### Prerequisites
+
+Please contact your cluster admin to check whether Azure RDMA is enabled in your cluster.
+
+#### Build an image with the necessary libraries.
+
+To run horovod-with-azure-rdma-intel-mpi in OpenPAI, you need to prepare a job configuration file and submit it through webportal.
+
+Users should prepare their own Intel MPI licenses to build the Docker image. For instructions on building the Docker image, please refer to [DOCKER.md](./DOCKER.md)
+
+###### Example job configuration file
+```json
+{
+  "jobName": "horovod-job",
+  "image": "your/images/url",
+  "virtualCluster": "default",
+  "taskRoles": [
+    {
+      "name": "master",
+      "taskNumber": 1,
+      "cpuNumber": 2,
+      "memoryMB": 10240,
+      "shmMB": 256,
+      "gpuNumber": 1,
+      "minFailedTaskCount": 1,
+      "minSucceededTaskCount": 1,
+      "command": "sleep 1000 &&  Your code！"
+    },
+    {
+      "name": "worker",
+      "taskNumber": 1,
+      "cpuNumber": 2,
+      "memoryMB": 10240,
+      "shmMB": 256,
+      "gpuNumber": 1,
+      "minFailedTaskCount": 1,
+      "minSucceededTaskCount": 1,
+      "command": "sleep infinity"
+    }
+  ],
+  "jobEnvs": {
+    "paiAzRDMA": true
+  }
+}
+```  
+
+- For more details on how to write a job configuration file, please refer to [job tutorial](../../docs/job_tutorial.md#json-config-file-for-job-submission).
+- The `sleep 1000` in ```master-0``` is a hack to ensure that all worker containers are ready. You could replace it with something better, for example by trying to ssh to every worker container using the hostname ```${taskname}-${taskid}``` until it succeeds.
+- If you want an Azure-RDMA-capable container, the following parameter is required:
+```bash
+"jobEnvs": {
+    "paiAzRDMA": true
+  }
+```
diff --git a/examples/horovod-with-azure-rdma-intel-mpi/silent.cfg b/examples/horovod-with-azure-rdma-intel-mpi/silent.cfg
@@ -0,0 +1,49 @@
+# Silent (unattended) installation settings for the Intel MPI installer.
+# The example Dockerfile copies this file to /silent.cfg and runs `./install.sh -s /silent.cfg`.
+#
+# Patterns used to check silent configuration file
+#
+# anythingpat - any string
+# filepat     - the file location pattern (/file/location/to/license.lic)
+# lspat       - the license server address pattern (0123@hostname)
+# snpat       - the serial number pattern (ABCD-01234567)
+
+# Accept EULA, valid values are: {accept, decline}
+ACCEPT_EULA=accept
+
+# Optional error behavior, valid values are: {yes, no}
+CONTINUE_WITH_OPTIONAL_ERROR=yes
+
+# Install location, valid values are: {/opt/intel, filepat}
+# NOTE: the example Dockerfile sources scripts and extends PATH under /opt/intel,
+# so keep the two in sync if this value is changed.
+PSET_INSTALL_DIR=/opt/intel
+
+# Continue with overwrite of existing installation directory, valid values are: {yes, no}
+CONTINUE_WITH_INSTALLDIR_OVERWRITE=yes
+
+# List of components to install, valid values are: {ALL, anythingpat}
+COMPONENTS=ALL
+
+# Installation mode, valid values are: {install, modify, repair, uninstall}
+PSET_MODE=install
+
+# Directory for non-RPM database, valid values are: {filepat}
+#NONRPM_DB_DIR=filepat
+
+# Serial number, valid values are: {snpat}
+#ACTIVATION_SERIAL_NUMBER=snpat
+
+# License file or license server, valid values are: {lspat, filepat}
+#ACTIVATION_LICENSE_FILE=
+
+# Activation type, valid values are: {exist_lic, license_server, license_file, trial_lic, serial_number}
+# NOTE: trial_lic is the value shipped with this example; change it to match your own
+# Intel MPI license (see DOCKER.md), together with the serial number / license file
+# settings above where applicable.
+ACTIVATION_TYPE=trial_lic
+
+# Path to the cluster description file, valid values are: {filepat}
+#CLUSTER_INSTALL_MACHINES_FILE=filepat
+
+# Intel(R) Software Improvement Program opt-in, valid values are: {yes, no}
+PHONEHOME_SEND_USAGE_DATA=no
+
+# Perform validation of digital signatures of RPM files, valid values are: {yes, no}
+SIGNING_ENABLED=yes
+
+# Select target architecture of your applications, valid values are: {IA32, INTEL64, ALL}
+ARCH_SELECTED=INTEL64
+