diff --git a/deepray/core/base_trainer.py b/deepray/core/base_trainer.py index 183e4b1..ac1cf69 100644 --- a/deepray/core/base_trainer.py +++ b/deepray/core/base_trainer.py @@ -256,7 +256,7 @@ def __init__( self.global_batch_size *= get_world_size() learning_rate *= get_world_size() - # TODO: fuhailin + # TODO: fuhailin # if isinstance(optimizer, optimizers.Optimizer): self.optimizer = optimizer # else: diff --git a/tools/build_base_container.sh b/tools/build_base_container.sh index d106555..f068aed 100755 --- a/tools/build_base_container.sh +++ b/tools/build_base_container.sh @@ -4,11 +4,13 @@ set -x -e PY_VERSION=${1:-"3.8"} TF_VERSION=${2:-"2.9.3"} +CUDA_VERSION=${3:-"11.6.2"} docker build \ -f tools/docker/base_container.Dockerfile \ - --build-arg TF_VERSION=2.9.3 \ - --build-arg TF_PACKAGE=tensorflow \ - --build-arg PY_VERSION=$PY_VERSION \ + --build-arg CUDA_VERSION=${CUDA_VERSION} \ + --build-arg TF_VERSION=${TF_VERSION} \ + --build-arg TF_PACKAGE=tensorflow-gpu \ + --build-arg PY_VERSION=${PY_VERSION} \ --target base_container \ - -t hailinfufu/deepray-release:latest-py${PY_VERSION}-tf${TF_VERSION}-cu116-ubuntu20.04 ./ + -t hailinfufu/deepray-release:latest-py${PY_VERSION}-tf${TF_VERSION}-cu${CUDA_VERSION}-ubuntu20.04 ./ diff --git a/tools/build_dev_container.sh b/tools/build_dev_container.sh index 45f5d99..031f6f0 100755 --- a/tools/build_dev_container.sh +++ b/tools/build_dev_container.sh @@ -4,11 +4,13 @@ set -x -e PY_VERSION=${1:-"3.8"} TF_VERSION=${2:-"2.9.3"} +CUDA_VERSION=${3:-"11.6.2"} docker build \ -f tools/docker/dev_container.Dockerfile \ - --build-arg TF_VERSION=2.9.3 \ + --build-arg CUDA_VERSION=${CUDA_VERSION} \ + --build-arg TF_VERSION=${TF_VERSION} \ --build-arg TF_PACKAGE=tensorflow \ --build-arg PY_VERSION=$PY_VERSION \ --target dev_container \ - -t hailinfufu/deepray-dev:latest-py${PY_VERSION}-tf${TF_VERSION}-cu116-ubuntu20.04 ./ + -t hailinfufu/deepray-dev:latest-py${PY_VERSION}-tf${TF_VERSION}-cu${CUDA_VERSION}-ubuntu20.04 ./ diff --git a/tools/docker/base_container.Dockerfile b/tools/docker/base_container.Dockerfile index 68171d2..270d707 100644 --- a/tools/docker/base_container.Dockerfile +++ b/tools/docker/base_container.Dockerfile @@ -1,11 +1,9 @@ #syntax=docker/dockerfile:1.1.5-experimental -ARG IMAGE_TYPE - +ARG CUDA_VERSION=11.6.2 # Currenly all of our dev images are GPU capable but at a cost of being quite large. # See https://github.com/tensorflow/build/pull/47 -ARG CUDA_DOCKER_VERSION=11.6.2-cudnn8-devel-ubuntu20.04 -FROM nvidia/cuda:${CUDA_DOCKER_VERSION} as base_container -ARG TF_PACKAGE +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04 as base_container +ARG TF_PACKAGE=tensorflow-gpu ARG TF_VERSION=2.9.3 ARG PY_VERSION=3.8 @@ -41,6 +39,8 @@ RUN bash /install_deps/install_cmake.sh RUN bash /install_deps/install_openmpi.sh RUN pip install --default-timeout=1000 $TF_PACKAGE==$TF_VERSION +# RUN mkdir -p /usr/local/lib/python${PY_VERSION}/dist-packages/tensorflow/include/third_party/gpus/cuda && \ +# ln -s /usr/local/cuda/include /usr/local/lib/python${PY_VERSION}/dist-packages/tensorflow/include/third_party/gpus/cuda COPY requirements.txt /tmp/requirements.txt RUN pip install -r /install_deps/yapf.txt \ diff --git a/tools/docker/dev_container.Dockerfile b/tools/docker/dev_container.Dockerfile index 6432a55..99c6aed 100644 --- a/tools/docker/dev_container.Dockerfile +++ b/tools/docker/dev_container.Dockerfile @@ -1,11 +1,11 @@ #syntax=docker/dockerfile:1.1.5-experimental -ARG IMAGE_TYPE +ARG CUDA_VERSION=11.6.2 ARG TF_VERSION=2.9.3 ARG PY_VERSION=3.8 # Currenly all of our dev images are GPU capable but at a cost of being quite large. # See https://github.com/tensorflow/build/pull/47 -ARG CUDA_DOCKER_VERSION=latest-py${PY_VERSION}-tf${TF_VERSION}-cu116-ubuntu20.04 +ARG CUDA_DOCKER_VERSION=latest-py${PY_VERSION}-tf${TF_VERSION}-cu${CUDA_VERSION}-ubuntu20.04 FROM hailinfufu/deepray-release:${CUDA_DOCKER_VERSION} as dev_container COPY tools/install_deps /install_deps