From 698af4b24801c3f061692f255aec5a5d4c8323b3 Mon Sep 17 00:00:00 2001
From: allaffa
Date: Sat, 28 Sep 2024 00:12:59 -0400
Subject: [PATCH] remove job submission script previously added by mistake

---
 job-frontier-ogb-deepspeed.sh | 46 --------------------------------
 job-frontier-preonly-nvme.sh  | 50 -----------------------------------
 2 files changed, 96 deletions(-)
 delete mode 100644 job-frontier-ogb-deepspeed.sh
 delete mode 100755 job-frontier-preonly-nvme.sh

diff --git a/job-frontier-ogb-deepspeed.sh b/job-frontier-ogb-deepspeed.sh
deleted file mode 100644
index 4b464ae13..000000000
--- a/job-frontier-ogb-deepspeed.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-#SBATCH -A LRN031
-#SBATCH -J HydraGNN
-#SBATCH -o job-%j.out
-#SBATCH -e job-%j.out
-#SBATCH -t 01:00:00
-#SBATCH -p batch
-#SBATCH -q debug
-#SBATCH -N 8
-#SBATCH -S 1
-
-ulimit -n 65536
-
-export MPICH_ENV_DISPLAY=0
-export MPICH_VERSION_DISPLAY=0
-export MIOPEN_DISABLE_CACHE=1
-export NCCL_PROTO=Simple
-
-export OMP_NUM_THREADS=7
-export HYDRAGNN_NUM_WORKERS=0
-export HYDRAGNN_USE_VARIABLE_GRAPH_SIZE=1
-export HYDRAGNN_AGGR_BACKEND=mpi
-export HYDRAGNN_VALTEST=0
-export NCCL_P2P_LEVEL=NVL
-export NCCL_P2P_DISABLE=1
-
-export MPICH_GPU_SUPPORT_ENABLED=1
-export MPICH_GPU_MANAGED_MEMORY_SUPPORT_ENABLED=1
-export MPICH_OFI_NIC_POLICY=GPU
-
-
-source /lustre/orion/cph161/world-shared/mlupopa/module-to-load-frontier.sh
-
-source /lustre/orion/cph161/world-shared/mlupopa/max_conda_envs_frontier/bin/activate
-conda activate hydragnn
-
-export PYTHONPATH=/lustre/orion/cph161/world-shared/mlupopa/ADIOS_frontier/install/lib/python3.8/site-packages/:$PYTHONPATH
-
-export PYTHONPATH=$PWD:$PYTHONPATH
-
-
-# both commands should work
-srun -N$SLURM_JOB_NUM_NODES -n$((SLURM_JOB_NUM_NODES*8)) -c7 --gres=gpu:8 \
-    python -u ./examples/ogb/train_gap.py gap --adios --use_deepspeed
-# srun -N$SLURM_JOB_NUM_NODES -n$((SLURM_JOB_NUM_NODES*8)) -c7 --gpus-per-task=1 --gpu-bind=closest \
-#     python -u ./examples/ogb/train_gap.py gap --adios --use_deepspeed
diff --git a/job-frontier-preonly-nvme.sh b/job-frontier-preonly-nvme.sh
deleted file mode 100755
index c9085d225..000000000
--- a/job-frontier-preonly-nvme.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/bin/bash
-#SBATCH -A CPH161
-#SBATCH -J HydraGNN
-#SBATCH -o job-%j.out
-#SBATCH -e job-%j.out
-#SBATCH -t 02:00:00
-#SBATCH -p batch
-#SBATCH -N 5
-#SBATCH -q debug
-#SBATCH -S 0
-#SBATCH -C nvme
-
-export MPICH_ENV_DISPLAY=1
-export MPICH_VERSION_DISPLAY=1
-export MPICH_GPU_SUPPORT_ENABLED=1
-export MPICH_GPU_MANAGED_MEMORY_SUPPORT_ENABLED=1
-export MPICH_OFI_NIC_POLICY=GPU
-export MIOPEN_DISABLE_CACHE=1
-export NCCL_PROTO=Simple
-
-export OMP_NUM_THREADS=7
-export HYDRAGNN_AGGR_BACKEND=mpi
-
-source /lustre/orion/cph161/world-shared/mlupopa/module-to-load-frontier.sh
-
-source /lustre/orion/cph161/world-shared/mlupopa/max_conda_envs_frontier/bin/activate
-conda activate hydragnn
-
-export PYTHONPATH=/lustre/orion/cph161/world-shared/mlupopa/ADIOS_frontier/install/lib/python3.8/site-packages/:$PYTHONPATH
-
-export PYTHONPATH=$PWD:$PYTHONPATH
-cd examples/mptrj/
-
-# SBCAST file from Orion to NVMe -- NOTE: ``-C nvme`` is required to use the NVMe drive
-sbcast -pf dataset/MPtrj_2022.9_full.json /mnt/bb/$USER/MPtrj_2022.9_full.json
-if [ ! "$?" == "0" ]; then
-    # CHECK EXIT CODE. When SBCAST fails, it may leave partial files on the compute nodes, and if you continue to launch srun,
-    # your application may pick up partially complete shared library files, which would give you confusing errors.
-    echo "SBCAST failed!"
-    exit 1
-fi
-
-echo
-# Showing the file on the current node -- this will be the same on all other nodes in the allocation
-echo "*****SBCAST FILE ON CURRENT NODE******"
-ls /mnt/bb/$USER/
-echo "**************************************"
-
-srun -n$((SLURM_JOB_NUM_NODES*4)) python -u train.py --preonly --pickle --tmpfs "/mnt/bb/$USER/"
-#python -u train.py --preonly