Retry tests #3229

Merged: 13 commits, May 23, 2024
8 changes: 4 additions & 4 deletions .github/workflows/gpu-hvd-tests.yml
@@ -22,7 +22,7 @@ jobs:
gpu-hvd-tests:
strategy:
matrix:
-pytorch-channel: [pytorch, ]
+pytorch-channel: [pytorch]
fail-fast: false
env:
DOCKER_IMAGE: "pytorch/conda-builder:cuda12.1"
@@ -128,8 +128,8 @@ jobs:
# Can't build Horovod with recent pytorch due to pytorch required C++17 standard
# and horovod is still using C++14
# HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]
# Using a similar hack as described here:
# https://github.com/horovod/horovod/issues/3941#issuecomment-1732505345
git clone --recursive https://github.com/horovod/horovod.git /horovod
cd /horovod
sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" CMakeLists.txt
@@ -152,7 +152,7 @@ jobs:
set -xe

bash tests/run_gpu_tests.sh 2 hvd
-CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd
+CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ignite -m distributed -k hvd

EOF
)
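The retry pattern introduced across the workflows in this PR is the same everywhere: the first attempt runs the full suite, and each retry sets USE_LAST_FAILED=1 so that only previously failed tests are rerun. A minimal sketch of that control flow, using a toy stand-in for the test suite (the function and loop below are illustrative, not the internals of the nick-fields/retry action):

```shell
#!/bin/bash
# Toy stand-in for a test suite: fails on the first attempt and passes
# once USE_LAST_FAILED=1 is exported, as happens on CI retries.
run_suite() {
    if [ "${USE_LAST_FAILED:-0}" = "1" ]; then
        return 0
    fi
    return 1
}

attempt=1
max_attempts=5
until run_suite; do
    attempt=$((attempt + 1))
    if [ "${attempt}" -gt "${max_attempts}" ]; then
        echo "tests failed after ${max_attempts} attempts"
        exit 1
    fi
    # Retries rerun only the tests that failed previously
    export USE_LAST_FAILED=1
done
echo "passed on attempt ${attempt}"
```

In the actual workflows this loop is provided by nick-fields/retry, with `new_command_on_retry` supplying the USE_LAST_FAILED=1 variant of the command.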
21 changes: 8 additions & 13 deletions .github/workflows/gpu-tests.yml
@@ -29,7 +29,7 @@ jobs:
REPOSITORY: ${{ github.repository }}
PR_NUMBER: ${{ github.event.pull_request.number }}
runs-on: linux.8xlarge.nvidia.gpu
-timeout-minutes: 45
+timeout-minutes: 85

steps:
- name: Clean workspace
@@ -121,18 +121,13 @@ jobs:

- name: Run GPU Unit Tests
continue-on-error: false
-run: |
-
-  script=$(cat << EOF
-
-  set -xe
-
-  bash tests/run_gpu_tests.sh 2
-
-  EOF
-  )
-
-  docker exec -t pthd /bin/bash -c "${script}"
+uses: nick-fields/retry@v3
+with:
+  max_attempts: 5
+  timeout_minutes: 25
+  shell: bash
+  command: docker exec -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'
+  new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
10 changes: 7 additions & 3 deletions .github/workflows/hvd-tests.yml
@@ -75,9 +75,13 @@ jobs:
target_dir: /tmp

- name: Run Tests
-shell: bash -l {0}
-run: |
-  bash tests/run_cpu_tests.sh
+uses: nick-fields/retry@v3
+with:
+  max_attempts: 5
+  timeout_minutes: 15
+  shell: bash
+  command: bash tests/run_cpu_tests.sh
+  new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
16 changes: 10 additions & 6 deletions .github/workflows/pytorch-version-tests.yml
@@ -10,15 +10,15 @@ on:
jobs:
build:
runs-on: ubuntu-latest
-timeout-minutes: 45
+timeout-minutes: 85
strategy:
max-parallel: 5
fail-fast: false
matrix:
python-version: [3.8, 3.9, "3.10"]
pytorch-version:
[2.1.2, 2.0.1, 1.13.1, 1.12.1, 1.11.0, 1.10.0, 1.9.1, 1.8.1, 1.5.1]
exclude:
- pytorch-version: 1.5.1
python-version: 3.9
- pytorch-version: 1.5.1
@@ -78,7 +78,7 @@ jobs:
pip install -r requirements-dev.txt
python setup.py install

# pytorch>=1.9.0,<1.11.0 is using "from setuptools import distutils; distutils.version.LooseVersion" anti-pattern
# which raises the error: AttributeError: module 'distutils' has no attribute 'version' for setuptools>59
bad_pth_version=$(python -c "import torch; print('.'.join(torch.__version__.split('.')[:2]) in ['1.9', '1.10'])")
if [ "${bad_pth_version}" == "True" ]; then
@@ -92,9 +92,13 @@ jobs:
target_dir: /tmp

- name: Run Tests
-shell: bash -l {0}
-run: |
-  bash tests/run_cpu_tests.sh "not test_time_profilers"
+uses: nick-fields/retry@v3
+with:
+  max_attempts: 5
+  timeout_minutes: 15
+  shell: bash
+  command: bash tests/run_cpu_tests.sh "not test_time_profilers"
+  new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh "not test_time_profilers"

# create-issue:
# runs-on: ubuntu-latest
20 changes: 13 additions & 7 deletions .github/workflows/tpu-tests.yml
@@ -89,13 +89,19 @@ jobs:
target_dir: /tmp

- name: Run Tests
-run: |
-  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${Python_ROOT_DIR}/lib
-  export XRT_DEVICE_MAP="CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"
-  export XRT_WORKERS="localservice:0;grpc://localhost:40934"
-
-  python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
-  bash tests/run_tpu_tests.sh
+uses: nick-fields/retry@v3
+with:
+  max_attempts: 5
+  timeout_minutes: 25
+  shell: bash
+  command: |
+    python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
+    bash tests/run_tpu_tests.sh
+  new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_tpu_tests.sh
+env:
+  LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}:${{ env.Python_ROOT_DIR }}/lib
+  XRT_DEVICE_MAP: "CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"
+  XRT_WORKERS: "localservice:0;grpc://localhost:40934"

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
15 changes: 10 additions & 5 deletions .github/workflows/unit-tests.yml
@@ -31,7 +31,7 @@ concurrency:
jobs:
cpu-tests:
runs-on: ${{ matrix.os }}
-timeout-minutes: 45
+timeout-minutes: 85
defaults:
run:
shell: bash
@@ -40,7 +40,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest]
-python-version: ["3.8", "3.9", "3.10", "3.11","3.12"]
+python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
pytorch-channel: [pytorch, pytorch-nightly]
include:
# includes a single build on windows
@@ -102,7 +102,7 @@ jobs:

- name: Run Mypy
# https://github.com/pytorch/ignite/pull/2780
#
if: ${{ matrix.os == 'ubuntu-latest' && matrix.pytorch-channel == 'pytorch-nightly'}}
run: |
bash ./tests/run_code_style.sh mypy
@@ -120,8 +120,13 @@ jobs:
cp -R /tmp/MNIST .

- name: Run Tests
-run: |
-  SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
+uses: nick-fields/retry@v3
+with:
+  max_attempts: 5
+  timeout_minutes: 15
+  shell: bash
+  command: SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
+  new_command_on_retry: USE_LAST_FAILED=1 SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
102 changes: 102 additions & 0 deletions tests/common-test-functionality.sh
@@ -0,0 +1,102 @@
#!/bin/bash
Review discussion:

Collaborator: I wonder whether it would be simpler to write this script in Python for later maintenance instead of a bash script, and how much effort that would take? If you think this is feasible we can do it in a follow-up PR.

Author (leej3): I think it would be easy enough: it is largely a matter of providing a simple CLI, setting environment variables, and assembling commands to run as a subprocess. A few hours, probably, perhaps a few more to iron out issues and add some tests for it.

Collaborator: OK, sounds good. Let's make a Python script instead of this bash script in a follow-up PR.

# Will catch exit code 5 when tests are deselected from a previous passing run
# (relevant for --last-failed-no-failures none)
last_failed_no_failures_code=5

# functions shared across test files
run_tests() {
    # Set defaults
    local core_args="-vvv tests/ignite"
    local cache_dir=".unknown-cache"
    local skip_distrib_tests=1
    local match_tests_expression=""
    local trap_deselected_exit_code=1
    local use_last_failed=0
    local use_coverage=0
    local world_size=0
    # Always clean up pytest.ini
    trap 'rm -f pytest.ini' RETURN
    # Parse arguments
    while [[ $# -gt 0 ]]
    do
        key="$1"
        case $key in
            --core_args)
                core_args="$2"
                shift
                shift
                ;;
            --cache_dir)
                cache_dir="$2"
                shift
                shift
                ;;
            --skip_distrib_tests)
                skip_distrib_tests="$2"
                shift
                shift
                ;;
            --match_tests_expression)
                match_tests_expression="$2"
                shift
                shift
                ;;
            --trap_deselected_exit_code)
                trap_deselected_exit_code="$2"
                shift
                shift
                ;;
            --use_last_failed)
                use_last_failed="$2"
                shift
                shift
                ;;
            --use_coverage)
                use_coverage="$2"
                shift
                shift
                ;;
            --world_size)
                world_size="$2"
                shift
                shift
                ;;
            *)
                echo "Error: Unknown argument $key"
                exit 1
                ;;
        esac
    done

    if [ "${skip_distrib_tests}" -eq "1" ]; then
        # can be overwritten by core_args
        skip_distrib_opt="-m 'not distributed and not tpu and not multinode_distributed'"
    else
        skip_distrib_opt=""
    fi

    echo [pytest] > pytest.ini ; echo "cache_dir=${cache_dir}" >> pytest.ini

    # Assemble options for the pytest command
    pytest_args="${skip_distrib_opt} ${core_args} --treat-unrun-as-failed -k '${match_tests_expression}'"
    if [ "${use_last_failed:-0}" -eq "1" ] && [ -d "${cache_dir}" ]; then
        pytest_args="--last-failed --last-failed-no-failures none ${pytest_args}"
    fi
    if [ "${use_coverage}" -eq "1" ]; then
        pytest_args="--cov ignite --cov-append --cov-report term-missing --cov-report xml ${pytest_args}"
    fi
    if [ ! "${world_size}" -eq "0" ]; then
        export WORLD_SIZE="${world_size}"
        pytest_args="--dist=each --tx ${WORLD_SIZE}*popen//python=python ${pytest_args}"
    fi

    # Run the command
    if [ "$trap_deselected_exit_code" -eq "1" ]; then
        CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}" || { exit_code=$?; if [ "$exit_code" -eq ${last_failed_no_failures_code} ]; then echo "All tests deselected"; else exit $exit_code; fi; }
Review discussion:

Collaborator: Why do we need the eval call here? Can't we make the call without eval?

Author (leej3, May 23, 2024): I added an eval here because of some bugs I was running into where things like "-k ''" end up as "-k" in the final command. The horrors of using eval are somewhat mitigated by the bash "-x" flag, so that bugs in the command can be spotted more quickly. I think consistently using arrays for assembling commands in bash is a better alternative, but using Python is the best long-term solution.

    else
        CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}"
    fi
}
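The exit-code handling above can be checked locally: after a fully passing run, `--last-failed --last-failed-no-failures none` deselects every test, and pytest exits with code 5 (no tests collected), which `run_tests` treats as success rather than failure. A quick demonstration in a throwaway directory (assumes pytest is installed):

```shell
#!/bin/bash
workdir=$(mktemp -d)
cd "${workdir}"
printf 'def test_ok():\n    assert True\n' > test_demo.py

# First run: the test passes and pytest caches the outcome in .pytest_cache
python -m pytest -q test_demo.py

# Second run: no cached failures, so every test is deselected
rc=0
python -m pytest -q --last-failed --last-failed-no-failures none test_demo.py || rc=$?
echo "exit code: ${rc}"
```

The second run reports "no tests ran" and exits with the code that `last_failed_no_failures_code` traps.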
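As a follow-up to the review discussion about porting common-test-functionality.sh to Python, a rough sketch of what that CLI could look like. Flag names mirror the bash version; everything else (function names, structure, defaults) is illustrative and not part of this PR:

```python
#!/usr/bin/env python3
"""Illustrative sketch of a Python port of run_tests() from
common-test-functionality.sh. Flag names mirror the bash helper."""
import argparse
import shlex

# pytest exits with 5 when all tests are deselected / none are collected
LAST_FAILED_NO_FAILURES_CODE = 5


def build_pytest_args(opts):
    """Assemble the pytest argument list the same way the bash helper does."""
    args = []
    if opts.use_coverage:
        args += ["--cov", "ignite", "--cov-append",
                 "--cov-report", "term-missing", "--cov-report", "xml"]
    if opts.use_last_failed:
        args += ["--last-failed", "--last-failed-no-failures", "none"]
    if opts.skip_distrib_tests:
        args += ["-m", "not distributed and not tpu and not multinode_distributed"]
    args += shlex.split(opts.core_args)
    args += ["-k", opts.match_tests_expression]
    return args


def parse_args(argv=None):
    parser = argparse.ArgumentParser(description="Run the ignite test suite")
    parser.add_argument("--core_args", default="-vvv tests/ignite")
    parser.add_argument("--match_tests_expression", default="")
    parser.add_argument("--skip_distrib_tests", type=int, default=1)
    parser.add_argument("--use_last_failed", type=int, default=0)
    parser.add_argument("--use_coverage", type=int, default=0)
    return parser.parse_args(argv)


if __name__ == "__main__":
    opts = parse_args()
    cmd = ["pytest"] + build_pytest_args(opts)
    # A real port would run this with subprocess.run(cmd) and treat
    # LAST_FAILED_NO_FAILURES_CODE as "all tests deselected", not a failure.
    print(shlex.join(cmd))
```

Building the command as a list sidesteps the quoting bugs that motivated `eval` in the bash version, as noted in the review thread.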