-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #43 from JimCircadian/38_dawn_hpc
Dawn HPC improvements to pipeline
- Loading branch information
Showing
17 changed files
with
351 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,9 +30,8 @@ loader.*.json | |
*.out | ||
tmp.* | ||
*.swp | ||
*test* | ||
*.png | ||
|
||
!ENVS.example | ||
ENVS.* | ||
|
||
/tensorboard |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
#!/usr/bin/env bash
#
# Quarantine corrupt tfrecord files flagged by icenet_dataset_check.
#
# Usage: <script> HEMI SPLIT DATASET_PREFIX
#   HEMI           - hemisphere identifier (e.g. north / south)
#   SPLIT          - dataset split to check (e.g. train / val / test)
#   DATASET_PREFIX - dataset name prefix; combined with HEMI as "<prefix>_<hemi>"
#
# The check log and any affected record files end up in a
# "<split>.data_errors" folder alongside the split's data.

# Fail on use of unset positional args; propagate pipeline failures.
set -u -o pipefail

HEMI="$1"
SPLIT="$2"
DATASET="$3_${HEMI}"
ERROR_FOLDER="network_datasets/${DATASET}/${HEMI}/${SPLIT}.data_errors"
CHECK_LOG="logs/check.${DATASET}.${SPLIT}.log"

# Run the (slow) dataset check only once; reuse the log on subsequent runs.
if [ ! -f "$CHECK_LOG" ]; then
    icenet_dataset_check -s "$SPLIT" "dataset_config.${DATASET}.json" 2>&1 | tee "$CHECK_LOG"
fi

# -p so a re-run against an existing quarantine folder does not error out.
mkdir -p "$ERROR_FOLDER"

# Pull the affected tfrecord filenames out of WARNING lines.
# sort -u (rather than bare uniq) deduplicates non-adjacent repeats too.
for FILENAME in $( grep 'WARNING' "$CHECK_LOG" | sed -r \
    -e 's/^.+([0-9]{8}\.tfrecord).+$/\1/' \
    | sort -u ); do
    if [ -f "network_datasets/${DATASET}/${HEMI}/${SPLIT}/$FILENAME" ]; then
        # NOTE(review): leading 'echo' makes this a dry run, as in the
        # original — remove it to actually quarantine the files.
        echo mv -v "network_datasets/${DATASET}/${HEMI}/${SPLIT}/$FILENAME" "$ERROR_FOLDER";
    fi
done

mv -v "$CHECK_LOG" "$ERROR_FOLDER"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
#!/usr/bin/env bash
#
# Load the pinned Intel oneAPI software stack on the Dawn HPC cluster.
# Intended to be sourced (or run) before any IceNet work on Dawn.

# Pick up the user's shell configuration (conda init, PATH, etc.).
source $HOME/.bashrc

# Start from a clean module state so only the modules pinned below are
# active; dawn-env/2024-04-15 fixes the toolchain snapshot date.
module purge
module load default-dawn
module load dawn-env/2024-04-15 intel-oneapi-ccl intel-oneapi-compilers intel-oneapi-dnn intel-oneapi-dpct intel-oneapi-dpl intel-oneapi-inspector intel-oneapi-mkl intel-oneapi-mpi intel-oneapi-tbb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Conda environment specification: geospatial / IO dependencies for IceNet.
channels:
  - conda-forge
  - defaults
dependencies:
  - cartopy    # cartographic projections and map plotting
  - eccodes    # GRIB/BUFR decoding library
  - ffmpeg     # video encoding (e.g. forecast animations)
  - hdf5       # HDF5 storage backend
  - netcdf4    # NetCDF-4 file access
  - openh264   # H.264 codec used by ffmpeg output
  - xarray     # labelled N-d arrays over NetCDF/HDF5
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/usr/bin/env bash
#
# Smoke-test icenet_train under a chosen TensorFlow distribution strategy.
#
# Usage: <script> [STRATEGY] [GPUS]
#   STRATEGY - distribution strategy name (default: mirrored)
#   GPUS     - number of GPUs to request (default: 4)
#
# Relies on the ENVS file providing ICENET_CONDA and FILTER_FACTOR.

# set -u -o pipefail
STRATEGY=${1:-mirrored}
GPUS=${2:-4}

# Interactive allocation example for manual runs:
# srun --gres=gpu:4 --job-name=icenet-test --partition=pvc --nodes=1 --time=01:00:00 --pty bash -i

# Unique log file per run; make sure the logs directory exists so tee
# does not fail on a fresh checkout.
mkdir -p logs
LOGNAME="logs/$STRATEGY.$GPUS.$(uuidgen).log"

# Capture stdout+stderr of the whole run (env setup + training) in one log,
# with START/END epoch timestamps for rough wall-clock measurement.
{
    . ENVS
    conda activate "$ICENET_CONDA"
    echo "START: $(date +%s)"
    icenet_train -b 4 -e 1 -f 1 -n "$FILTER_FACTOR" -s "$STRATEGY" --gpus "$GPUS" -nw --lr 25e-5 -v exp23_south test_south1 42
    echo "END: $(date +%s)"
} 2>&1 | tee "$LOGNAME"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
#!/bin/bash
#SBATCH --job-name=hnorth1
#SBATCH --partition=pvc
#SBATCH --nodes=2
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=8
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=12 # split from 96 cores
#SBATCH --time=1-00:00:00 # job length
#SBATCH --output=logs/train.north.%j.out
#SBATCH --error=logs/train.north.%j.err

# SLURM batch job: Horovod-distributed training of the northern-hemisphere
# IceNet model on Dawn PVC nodes — 2 nodes x 8 MPI ranks = 16 workers.

# Pick up the user's shell configuration (conda init, PATH, etc.).
source $HOME/.bashrc

# Load the pinned Dawn oneAPI environment from a clean module state.
module purge
module load default-dawn
module load dawn-env/2024-04-15 intel-oneapi-ccl intel-oneapi-compilers intel-oneapi-dnn intel-oneapi-dpct intel-oneapi-dpl intel-oneapi-inspector intel-oneapi-mkl intel-oneapi-mpi intel-oneapi-tbb

conda activate icenet

# One MPI rank per SLURM task; Horovod averages gradients across ranks.
# -np 16 must match --ntasks above.
mpirun -np 16 icenet_train_horovod --device-type XPU -v --early-stopping 5 -wp test -wu jambyr --shuffle-train -e 100 -b 4 -n 1.44 dataset_config.full_train_north.json hv_north1 42
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
#!/bin/bash
#SBATCH --job-name=icy_test
#SBATCH --partition=pvc
#SBATCH --ntasks=8
#SBATCH --ntasks-per-node=4
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=24 # split from 96 cores
#SBATCH --time=08:00:00 # job length
#SBATCH --output=train.%j.out
#SBATCH --error=train.%j.err

# SLURM batch job: run the standalone Horovod smoke-test script across
# 8 MPI ranks (4 per node) on Dawn PVC nodes.

# Pick up the user's shell configuration (conda init, PATH, etc.).
source $HOME/.bashrc

# Load the pinned Dawn oneAPI environment from a clean module state.
module purge
module load default-dawn
module load dawn-env/2024-04-15 intel-oneapi-ccl intel-oneapi-compilers intel-oneapi-dnn intel-oneapi-dpct intel-oneapi-dpl intel-oneapi-inspector intel-oneapi-mkl intel-oneapi-mpi intel-oneapi-tbb

conda activate icenet

# -np 8 must match --ntasks above.
mpirun -np 8 python scripts/horovod_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/bin/bash
#SBATCH --job-name=hsm1
#SBATCH --partition=pvc
#SBATCH --nodes=1
#SBATCH --ntasks=2
#SBATCH --ntasks-per-node=2
#SBATCH --gres=gpu:4
#SBATCH --exclusive
#SBATCH --cpus-per-task=24 # split from 96 cores
#SBATCH --time=12:00:00 # job length
#SBATCH --output=logs/train.small_north_test.%j.out
#SBATCH --error=logs/train.small_north_test.%j.err

# SLURM batch job: small-scale Horovod training test — a single exclusive
# Dawn PVC node with 2 MPI ranks and only 3 epochs, for quick validation
# before submitting the full north/south training jobs.

# Pick up the user's shell configuration (conda init, PATH, etc.).
source $HOME/.bashrc

# Load the pinned Dawn oneAPI environment from a clean module state.
module purge
module load default-dawn
module load dawn-env/2024-04-15 intel-oneapi-ccl intel-oneapi-compilers intel-oneapi-dnn intel-oneapi-dpct intel-oneapi-dpl intel-oneapi-inspector intel-oneapi-mkl intel-oneapi-mpi intel-oneapi-tbb

conda activate icenet

# -np 2 must match --ntasks above; -e 3 keeps the run short.
mpirun -np 2 icenet_train_horovod --device-type XPU -v --early-stopping 5 -wp test -wu jambyr --shuffle-train -e 3 -b 4 -n 1.44 dataset_config.full_train_north.json hv_small_test1 42
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
#!/bin/bash
#SBATCH --job-name=hsouth1
#SBATCH --partition=pvc
#SBATCH --nodes=2
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=8
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=12 # split from 96 cores
#SBATCH --time=1-00:00:00 # job length
#SBATCH --output=logs/train.south.%j.out
#SBATCH --error=logs/train.south.%j.err

# SLURM batch job: Horovod-distributed training of the southern-hemisphere
# IceNet model on Dawn PVC nodes — 2 nodes x 8 MPI ranks = 16 workers.
# Mirrors the north job except for dataset config and run name.

# Pick up the user's shell configuration (conda init, PATH, etc.).
source $HOME/.bashrc

# Load the pinned Dawn oneAPI environment from a clean module state.
module purge
module load default-dawn
module load dawn-env/2024-04-15 intel-oneapi-ccl intel-oneapi-compilers intel-oneapi-dnn intel-oneapi-dpct intel-oneapi-dpl intel-oneapi-inspector intel-oneapi-mkl intel-oneapi-mpi intel-oneapi-tbb

conda activate icenet

# One MPI rank per SLURM task; -np 16 must match --ntasks above.
mpirun -np 16 icenet_train_horovod --device-type XPU -v --early-stopping 5 -wp test -wu jambyr --shuffle-train -e 100 -b 4 -n 1.44 dataset_config.full_train_south.json hv_south1 42
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
"""Horovod multi-worker smoke test for IceNet training on Intel XPUs.

Launch under MPI, e.g.: mpirun -np 8 python scripts/horovod_test.py
"""
import logging
logging.basicConfig(level=logging.DEBUG)
import tensorflow as tf
import horovod.tensorflow.keras as hvd
from tensorflow.keras.optimizers import Adam

hvd.init()

# https://www.tensorflow.org/guide/keras/distributed_training

# Pin each Horovod worker to a single local XPU device so workers do not
# contend for the same accelerator.
xpus = tf.config.list_physical_devices('XPU')
print("XPU count is {}".format(len(xpus)))
for xpu in xpus:
    # Allocate device memory on demand instead of reserving it all up front.
    tf.config.experimental.set_memory_growth(xpu, True)
if xpus:
    tf.config.experimental.set_visible_devices(xpus[hvd.local_rank()], 'XPU')

# Imported after device configuration so IceNet's TensorFlow usage sees the
# per-worker device visibility established above.
from icenet.data.dataset import IceNetDataSet
import icenet.model.losses as losses
import icenet.model.metrics as metrics
import icenet.model.networks.tensorflow as models

batch_size = 1
dataset = IceNetDataSet("dataset_config.full_train_north.json", batch_size=batch_size, shuffling=True)
input_shape = (*dataset.shape, dataset.num_channels)

# Weighted loss/metrics as used elsewhere in the IceNet pipeline.
loss = losses.WeightedMSE()
metrics_list = [
    metrics.WeightedBinaryAccuracy(),
    metrics.WeightedMAE(),
    metrics.WeightedRMSE(),
    losses.WeightedMSE()
]
network = models.unet_batchnorm(
    # DistributedOptimizer averages gradients across all Horovod workers.
    custom_optimizer=hvd.DistributedOptimizer(Adam(0.001)),
    experimental_run_tf_function=False,
    input_shape=input_shape,
    loss=loss,
    metrics=metrics_list,
    filter_size=3,
    n_filters_factor=1.44,
    n_forecast_days=dataset.n_forecast_days,
)

network.summary()
train_ds, val_ds, test_ds = dataset.get_split_datasets(ratio=1.0)

model_history = network.fit(
    train_ds,
    epochs=100,
    # Split the epoch between workers: each rank sees 1/hvd.size() of the
    # training batches.
    steps_per_epoch=dataset.counts["train"] // (batch_size * hvd.size()),
    # Only rank 0 prints progress, keeping multi-rank logs readable.
    verbose=1 if hvd.rank() == 0 else 0,
    callbacks=[
        # Broadcast rank 0's initial weights so all workers start identically.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ],
    validation_data=val_ds)
Oops, something went wrong.