Merge pull request #43 from JimCircadian/38_dawn_hpc
Dawn HPC improvements to pipeline
JimCircadian authored Aug 21, 2024
2 parents 35ee3c4 + aadfc50 commit 4f98699
Showing 17 changed files with 351 additions and 21 deletions.
3 changes: 1 addition & 2 deletions .gitignore
@@ -30,9 +30,8 @@ loader.*.json
*.out
tmp.*
*.swp
*test*
*.png

!ENVS.example
ENVS.*

/tensorboard
23 changes: 23 additions & 0 deletions check_and_move_data.sh
@@ -0,0 +1,23 @@
#!/usr/bin/env bash

HEMI=$1
SPLIT=$2
DATASET="$3_$HEMI"
ERROR_FOLDER=network_datasets/${DATASET}/${HEMI}/${SPLIT}.data_errors
CHECK_LOG=logs/check.${DATASET}.${SPLIT}.log

if [ ! -f $CHECK_LOG ]; then
    icenet_dataset_check -s $SPLIT dataset_config.${DATASET}.json 2>&1 | tee $CHECK_LOG
fi

mkdir $ERROR_FOLDER

for FILENAME in $( grep 'WARNING' $CHECK_LOG | sed -r \
    -e 's/^.+([0-9]{8}\.tfrecord).+$/\1/' \
    | uniq ); do
    if [ -f network_datasets/${DATASET}/${HEMI}/${SPLIT}/$FILENAME ]; then
        echo mv -v network_datasets/${DATASET}/${HEMI}/${SPLIT}/$FILENAME $ERROR_FOLDER;
    fi
done

mv -v $CHECK_LOG $ERROR_FOLDER
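
For context, a hypothetical invocation of this helper (the positional arguments map to HEMI, SPLIT and the dataset name prefix; the example values are illustrative only):

    # Check the 'train' split of a hypothetical 'exp23_south' dataset and
    # quarantine any tfrecords the checker flags with WARNINGs.
    ./check_and_move_data.sh south train exp23

Note that the mv inside the loop is only echoed, so flagged files are listed rather than moved; the check log itself is moved into the errors folder.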
12 changes: 6 additions & 6 deletions ensemble/predict.tmpl.yaml
@@ -13,7 +13,12 @@ ensemble:
- ../../../network_datasets
- ../../../processed
- ../../../results
mem: 224gb
cluster: short
email: someone@example.com
length: 00:30:00
mem: 128gb
nodes: 1
ntasks: 2

pre_process: []
post_process: []
@@ -22,12 +27,7 @@ ensemble:
templatedir: ../template
templates:
- icenet_predict.sh.j2
email: someone@example.com
job_file: icenet_predict.sh
cluster: short
nodes: 1
ntasks: 8
length: 00:30:00
maxruns: 100
maxjobs: 10

8 changes: 8 additions & 0 deletions ensemble/template/dawn.sh
@@ -0,0 +1,8 @@
#!/usr/bin/env bash

source $HOME/.bashrc

module purge
module load default-dawn
module load dawn-env/2024-04-15 intel-oneapi-ccl intel-oneapi-compilers intel-oneapi-dnn intel-oneapi-dpct intel-oneapi-dpl intel-oneapi-inspector intel-oneapi-mkl intel-oneapi-mpi intel-oneapi-tbb
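
After sourcing this template, a quick sanity check on a Dawn node (assuming the oneAPI compiler bundle provides sycl-ls, as recent releases do) is to enumerate the visible SYCL devices:

    # List SYCL devices; on a PVC node this should show the Intel XPUs.
    sycl-ls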

10 changes: 5 additions & 5 deletions ensemble/train.tmpl.yaml
@@ -15,8 +15,13 @@ ensemble:
- ../../../network_datasets
- ../../../processed
- ../../../results
cluster: long
email: someone@example.com
gpus: 1
length: 4-00:00:00
mem: 128gb
nodes: 1
ntasks: NTASKS

pre_process:
- name: execute
@@ -29,12 +34,7 @@ ensemble:
templatedir: ../template
templates:
- icenet_train.sh.j2
email: someone@example.com
job_file: icenet_train.sh
cluster: gpu
nodes: 1
ntasks: NTASKS
length: 4-00:00:00
maxruns: 5
maxjobs: MAXJOBS

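NTASKS and MAXJOBS above are placeholder tokens rather than valid values. A minimal sketch of how such tokens could be filled in before submission (illustrative only; the run scripts may substitute them differently):

    # Replace the placeholders with concrete values for this run.
    sed -e 's/NTASKS/4/' -e 's/MAXJOBS/2/' ensemble/train.tmpl.yaml > train.yaml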
11 changes: 11 additions & 0 deletions environment.dawn.yml
@@ -0,0 +1,11 @@
channels:
- conda-forge
- defaults
dependencies:
- cartopy
- eccodes
- ffmpeg
- hdf5
- netcdf4
- openh264
- xarray
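
A minimal way to materialise this environment (the name icenet is an assumption, chosen to match the conda activate icenet calls in the batch scripts below):

    # Create and activate the Dawn conda environment.
    conda env create -n icenet -f environment.dawn.yml
    conda activate icenet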
16 changes: 16 additions & 0 deletions quick_diag.sh
@@ -0,0 +1,16 @@
#!/usr/bin/env bash

# set -u -o pipefail
STRATEGY=${1:-mirrored}
GPUS=${2:-4}

# srun --gres=gpu:4 --job-name=icenet-test --partition=pvc --nodes=1 --time=01:00:00 --pty bash -i
LOGNAME="logs/$STRATEGY.$GPUS.`uuidgen`.log"

{
    . ENVS
    conda activate $ICENET_CONDA
    echo "START: `date +%s`"
    icenet_train -b 4 -e 1 -f 1 -n $FILTER_FACTOR -s $STRATEGY --gpus $GPUS -nw --lr 25e-5 -v exp23_south test_south1 42
    echo "END: `date +%s`"
} 2>&1 | tee $LOGNAME
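
A hypothetical smoke-test run using the script's defaults (mirrored strategy, 4 GPUs), which relies on ENVS defining ICENET_CONDA and FILTER_FACTOR:

    # One-epoch training diagnostic; output lands in logs/mirrored.4.<uuid>.log
    ./quick_diag.sh mirrored 4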
12 changes: 6 additions & 6 deletions run_data.sh
@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/bash -l

. ENVS

@@ -17,11 +17,6 @@ BATCH_SIZE=${2:-2}
WORKERS=${3:-8}

if [ ! -f loader.${DATANAME}_${HEMI}.json ]; then
[ ! -z "$PROC_ARGS_ERA5" ] && icenet_process_era5 -v -l $LAG \
$PROC_ARGS_ERA5 \
-ns $TRAIN_START -ne $TRAIN_END -vs $VAL_START -ve $VAL_END -ts $TEST_START -te $TEST_END \
${DATANAME}_${HEMI} $HEMI

[ ! -z "$PROC_ARGS_ORAS5" ] && icenet_process_oras5 -v -l $LAG \
$PROC_ARGS_ORAS5 \
-ns $TRAIN_START -ne $TRAIN_END -vs $VAL_START -ve $VAL_END -ts $TEST_START -te $TEST_END \
@@ -32,6 +27,11 @@ if [ ! -f loader.${DATANAME}_${HEMI}.json ]; then
-ns $TRAIN_START -ne $TRAIN_END -vs $VAL_START -ve $VAL_END -ts $TEST_START -te $TEST_END \
${DATANAME}_${HEMI} $HEMI

[ ! -z "$PROC_ARGS_ERA5" ] && icenet_process_era5 -v -l $LAG \
$PROC_ARGS_ERA5 \
-ns $TRAIN_START -ne $TRAIN_END -vs $VAL_START -ve $VAL_END -ts $TEST_START -te $TEST_END \
${DATANAME}_${HEMI} $HEMI

icenet_process_metadata ${DATANAME}_${HEMI} $HEMI
else
echo "Skipping preprocessing as loader.${DATANAME}_${HEMI}.json already exists..."
3 changes: 2 additions & 1 deletion run_predict_ensemble.sh
@@ -16,9 +16,10 @@ ENSEMBLE_ARGS=""
TRAIN_IDENT=""
ENSEMBLE_SEEDS_DEFAULT=42,46,45

while getopts ":b:df:i:lm:p:r:x" opt; do
while getopts ":b:c:df:i:lm:p:r:x" opt; do
case "$opt" in
b) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_batch=$OPTARG ";;
c) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}cluster=$OPTARG ";;
d) ENSEMBLE_TARGET="dummy";;
f) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_filter_factor=$OPTARG ";;
i) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_ident=$OPTARG ";;
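The new -c flag lets the scheduler cluster be chosen at submission time; a hypothetical fragment (remaining options and arguments as for any other run of this script):

    # Target the 'short' cluster for a prediction ensemble.
    ./run_predict_ensemble.sh -c short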
3 changes: 2 additions & 1 deletion run_train_ensemble.sh
@@ -17,7 +17,7 @@ ENSEMBLE_JOBS=1
ENSEMBLE_NTASKS=4
ENSEMBLE_SEEDS_DEFAULT=42,46,45,17,24,84,83,16,5,3

while getopts ":b:c:de:f:g:j:l:m:n:p:q:r:s:t:" opt; do
while getopts ":b:c:de:f:g:j:l:m:n:o:p:q:r:s:t:" opt; do
case "$opt" in
b) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_batch=$OPTARG ";;
c) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}cluster=$OPTARG ";;
@@ -29,6 +29,7 @@ while getopts ":b:c:de:f:g:j:l:m:n:p:q:r:s:t:" opt; do
    l) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_preload=$OPTARG ";;
    m) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}mem=$OPTARG ";;
    n) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}nodelist=$OPTARG ";;
    o) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}nodes=$OPTARG ";;
    p) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_prep=$OPTARG ";;
    q) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_queue=$OPTARG ";;
    r) ENSEMBLE_RUNS=$OPTARG ;; # Ensemble member run seed values
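Alongside the existing -c (cluster) and -n (nodelist) options, the new -o flag exposes the node count; a hypothetical fragment:

    # Request two nodes on the 'pvc' cluster for a training ensemble.
    ./run_train_ensemble.sh -c pvc -o 2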
21 changes: 21 additions & 0 deletions scripts/horovod_north.sh
@@ -0,0 +1,21 @@
#!/bin/bash
#SBATCH --job-name=hnorth1
#SBATCH --partition=pvc
#SBATCH --nodes=2
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=8
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=12 # split from 96 cores
#SBATCH --time=1-00:00:00 # job length
#SBATCH --output=logs/train.north.%j.out
#SBATCH --error=logs/train.north.%j.err

source $HOME/.bashrc

module purge
module load default-dawn
module load dawn-env/2024-04-15 intel-oneapi-ccl intel-oneapi-compilers intel-oneapi-dnn intel-oneapi-dpct intel-oneapi-dpl intel-oneapi-inspector intel-oneapi-mkl intel-oneapi-mpi intel-oneapi-tbb

conda activate icenet

mpirun -np 16 icenet_train_horovod --device-type XPU -v --early-stopping 5 -wp test -wu jambyr --shuffle-train -e 100 -b 4 -n 1.44 dataset_config.full_train_north.json hv_north1 42
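
The rank count is consistent with the header above: 2 nodes × 8 tasks per node = 16 MPI ranks, matching mpirun -np 16. With gres=gpu:4, the 8 ranks per node plausibly map one-to-one onto XPU tiles, assuming the usual two-tiles-per-card exposure of Intel PVC GPUs (4 cards presenting 8 devices).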
20 changes: 20 additions & 0 deletions scripts/horovod_slurm_sbatch.sh
@@ -0,0 +1,20 @@
#!/bin/bash
#SBATCH --job-name=icy_test
#SBATCH --partition=pvc
#SBATCH --ntasks=8
#SBATCH --ntasks-per-node=4
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=24 # split from 96 cores
#SBATCH --time=08:00:00 # job length
#SBATCH --output=train.%j.out
#SBATCH --error=train.%j.err

source $HOME/.bashrc

module purge
module load default-dawn
module load dawn-env/2024-04-15 intel-oneapi-ccl intel-oneapi-compilers intel-oneapi-dnn intel-oneapi-dpct intel-oneapi-dpl intel-oneapi-inspector intel-oneapi-mkl intel-oneapi-mpi intel-oneapi-tbb

conda activate icenet

mpirun -np 8 python scripts/horovod_test.py
22 changes: 22 additions & 0 deletions scripts/horovod_small_test.sh
@@ -0,0 +1,22 @@
#!/bin/bash
#SBATCH --job-name=hsm1
#SBATCH --partition=pvc
#SBATCH --nodes=1
#SBATCH --ntasks=2
#SBATCH --ntasks-per-node=2
#SBATCH --gres=gpu:4
#SBATCH --exclusive
#SBATCH --cpus-per-task=24 # split from 96 cores
#SBATCH --time=12:00:00 # job length
#SBATCH --output=logs/train.small_north_test.%j.out
#SBATCH --error=logs/train.small_north_test.%j.err

source $HOME/.bashrc

module purge
module load default-dawn
module load dawn-env/2024-04-15 intel-oneapi-ccl intel-oneapi-compilers intel-oneapi-dnn intel-oneapi-dpct intel-oneapi-dpl intel-oneapi-inspector intel-oneapi-mkl intel-oneapi-mpi intel-oneapi-tbb

conda activate icenet

mpirun -np 2 icenet_train_horovod --device-type XPU -v --early-stopping 5 -wp test -wu jambyr --shuffle-train -e 3 -b 4 -n 1.44 dataset_config.full_train_north.json hv_small_test1 42
21 changes: 21 additions & 0 deletions scripts/horovod_south.sh
@@ -0,0 +1,21 @@
#!/bin/bash
#SBATCH --job-name=hsouth1
#SBATCH --partition=pvc
#SBATCH --nodes=2
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=8
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=12 # split from 96 cores
#SBATCH --time=1-00:00:00 # job length
#SBATCH --output=logs/train.south.%j.out
#SBATCH --error=logs/train.south.%j.err

source $HOME/.bashrc

module purge
module load default-dawn
module load dawn-env/2024-04-15 intel-oneapi-ccl intel-oneapi-compilers intel-oneapi-dnn intel-oneapi-dpct intel-oneapi-dpl intel-oneapi-inspector intel-oneapi-mkl intel-oneapi-mpi intel-oneapi-tbb

conda activate icenet

mpirun -np 16 icenet_train_horovod --device-type XPU -v --early-stopping 5 -wp test -wu jambyr --shuffle-train -e 100 -b 4 -n 1.44 dataset_config.full_train_south.json hv_south1 42
61 changes: 61 additions & 0 deletions scripts/horovod_test.py
@@ -0,0 +1,61 @@
import logging
logging.basicConfig(level=logging.DEBUG)
import tensorflow as tf
import horovod.tensorflow.keras as hvd
from tensorflow.keras.optimizers import Adam

hvd.init()

# https://www.tensorflow.org/guide/keras/distributed_training

# Pin each Horovod rank to a single XPU device (cf. the distributed training guide above).
gpus = tf.config.list_physical_devices('XPU')
print("XPU count is {}".format(len(gpus)))
gpu_ids = []
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'XPU')

from icenet.data.dataset import IceNetDataSet
import icenet.model.losses as losses
import icenet.model.metrics as metrics
import icenet.model.networks.tensorflow as models

batch_size = 1
dataset = IceNetDataSet("dataset_config.full_train_north.json", batch_size=batch_size, shuffling=True)
input_shape = (*dataset.shape, dataset.num_channels)

loss = losses.WeightedMSE()
metrics_list = [
    metrics.WeightedBinaryAccuracy(),
    metrics.WeightedMAE(),
    metrics.WeightedRMSE(),
    losses.WeightedMSE()
]
network = models.unet_batchnorm(
    custom_optimizer=hvd.DistributedOptimizer(Adam(0.001)),
    experimental_run_tf_function=False,
    input_shape=input_shape,
    loss=loss,
    metrics=metrics_list,
    filter_size=3,
    n_filters_factor=1.44,
    n_forecast_days=dataset.n_forecast_days,
)

network.summary()
train_ds, val_ds, test_ds = dataset.get_split_datasets(ratio=1.0)

model_history = network.fit(
    #strategy.experimental_distribute_dataset(train_ds),
    train_ds,
    epochs=100,
    steps_per_epoch=dataset.counts["train"] // (batch_size * hvd.size()),
    verbose=1 if hvd.rank() == 0 else 0,
    callbacks=[
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ],
    validation_data=val_ds)
#validation_data=strategy.experimental_distribute_dataset(val_ds),
#max_queue_size=10)