Commit 5cf9448
Experiments to run
Erotemic committed Jul 6, 2024
1 parent 53487d1 commit 5cf9448
Showing 8 changed files with 960 additions and 10 deletions.
20 changes: 20 additions & 0 deletions experiments/models.yaml
@@ -0,0 +1,20 @@

- /home/joncrall/data/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs/shitspotter_fromv28_newdata_20240615_v1/lightning_logs/version_0/checkpoints/epoch=0000-step=000043-val_loss=0.008.ckpt.pt
- /home/joncrall/data/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs/shitspotter_fromv28_newdata_20240615_v1/lightning_logs/version_0/checkpoints/epoch=0002-step=000129-val_loss=0.008.ckpt.pt
- /home/joncrall/data/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs/shitspotter_fromv28_newdata_20240615_v1/lightning_logs/version_2/checkpoints/epoch=0004-step=000215-val_loss=0.017.ckpt.pt
- /home/joncrall/data/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs/shitspotter_fromv28_newdata_20240615_v1/lightning_logs/version_2/checkpoints/epoch=0005-step=000258-val_loss=0.016.ckpt.pt
- /home/joncrall/data/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs/shitspotter_fromv28_newdata_20240615_v1/lightning_logs/version_2/checkpoints/epoch=0012-step=000559-val_loss=0.017.ckpt.pt
- /home/joncrall/data/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs/shitspotter_fromv28_newdata_20240615_v1/lightning_logs/version_3/checkpoints/epoch=0017-step=012294-val_loss=0.017.ckpt.pt
- /home/joncrall/data/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs/shitspotter_fromv28_newdata_20240615_v1/lightning_logs/version_3/checkpoints/epoch=0024-step=017075-val_loss=0.016.ckpt.pt
- /home/joncrall/data/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs/shitspotter_fromv28_newdata_20240615_v1/lightning_logs/version_3/checkpoints/epoch=0026-step=018441-val_loss=0.017.ckpt.pt
- /home/joncrall/data/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs/shitspotter_fromv28_newdata_20240615_v1/lightning_logs/version_3/checkpoints/epoch=0034-step=023905-val_loss=0.018.ckpt.pt
- /home/joncrall/data/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs/shitspotter_fromv28_newdata_20240615_v1/lightning_logs/version_3/checkpoints/epoch=0037-step=025954-val_loss=0.017.ckpt.pt
- /home/joncrall/data/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs/shitspotter_scratch_20240618_noboxes_v2/lightning_logs/version_0/checkpoints/epoch=0041-step=057372-val_loss=0.022.ckpt.pt
- /home/joncrall/data/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs/shitspotter_scratch_20240618_noboxes_v2/lightning_logs/version_0/checkpoints/epoch=0062-step=086058-val_loss=0.022.ckpt.pt
- /home/joncrall/data/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs/shitspotter_scratch_20240618_noboxes_v2/lightning_logs/version_0/checkpoints/epoch=0063-step=087424-val_loss=0.021.ckpt.pt
- /home/joncrall/data/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs/shitspotter_scratch_20240618_noboxes_v2/lightning_logs/version_0/checkpoints/epoch=0065-step=090156-val_loss=0.022.ckpt.pt
- /home/joncrall/data/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs/shitspotter_scratch_20240618_noboxes_v2/lightning_logs/version_0/checkpoints/epoch=0072-step=099718-val_loss=0.022.ckpt.pt
- /home/joncrall/data/dvc-repos/shitspotter_dvc/models/shitspotter_from_v027_halfres_v028-epoch=0121-step=000488-val_loss=0.005.ckpt.pt
- /home/joncrall/data/dvc-repos/shitspotter_dvc/models/shitspotter_from_v027_halfres_v028-epoch=0179-step=000720-val_loss=0.005.ckpt.pt
- /home/joncrall/data/dvc-repos/shitspotter_dvc/models/shitspotter_scratch_v025-version_2-epoch=1277-step=005112-val_loss=0.600.ckpt.pt
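The file above is just a flat YAML list of candidate checkpoint packages. As a rough illustration of how such a list can be consumed, here is a minimal Python sketch; the loop body is a hypothetical placeholder (this commit does not include the evaluation code):

import pathlib
import yaml

# models.yaml is a flat YAML list of checkpoint/package paths.
model_paths = yaml.safe_load(pathlib.Path('experiments/models.yaml').read_text())

for fpath in map(pathlib.Path, model_paths):
    if not fpath.exists():
        print(f'missing: {fpath}')
        continue
    # Hypothetical placeholder: hand each package to whatever
    # prediction / evaluation pipeline consumes these models.
    print(f'would evaluate: {fpath}')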

15 changes: 15 additions & 0 deletions experiments/train_toothbrush_scratch_noboxes_schedule.sh
@@ -0,0 +1,15 @@
cmd_queue new "shitspotter_train_queue"

cmd_queue submit "shitspotter_train_queue" -- bash ~/code/shitspotter/experiments/train_toothbrush_scratch_noboxes_v3.sh
cmd_queue submit "shitspotter_train_queue" -- bash ~/code/shitspotter/experiments/train_toothbrush_scratch_noboxes_v4.sh
cmd_queue submit "shitspotter_train_queue" -- bash ~/code/shitspotter/experiments/train_toothbrush_scratch_noboxes_v5.sh
cmd_queue submit "shitspotter_train_queue" -- bash ~/code/shitspotter/experiments/train_toothbrush_scratch_noboxes_v6.sh
cmd_queue submit "shitspotter_train_queue" -- bash ~/code/shitspotter/experiments/train_toothbrush_scratch_noboxes_v7.sh
cmd_queue submit "shitspotter_train_queue" -- bash ~/code/shitspotter/experiments/train_toothbrush_scratch_noboxes_v8.sh


cmd_queue show "shitspotter_train_queue"


# Execute your queue.
cmd_queue run "shitspotter_train_queue" --backend=tmux --workers=2 --gpus="0,1"
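For reference, cmd_queue also has a Python API that mirrors these CLI calls. The sketch below is a rough equivalent; the exact keyword arguments to Queue.create (backend, size, name) are assumptions that should be checked against the cmd_queue documentation, and the GPU pinning done by --gpus="0,1" above is omitted.

import cmd_queue

# Assumed API surface: Queue.create / submit / print_commands / run.
queue = cmd_queue.Queue.create(backend='tmux', size=2, name='shitspotter_train_queue')
for version in range(3, 9):
    script = f'~/code/shitspotter/experiments/train_toothbrush_scratch_noboxes_v{version}.sh'
    queue.submit(f'bash {script}')
queue.print_commands()  # roughly equivalent to `cmd_queue show`
queue.run()             # roughly equivalent to `cmd_queue run`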
11 changes: 1 addition & 10 deletions experiments/train_toothbrush_scratch_noboxes_v3.sh
@@ -1,17 +1,8 @@
#!/bin/bash
__doc__="
With a batch size of 24 and lr=3e-4, training only the saliency head.
Observations:
* dark vs light separation by step 684 on training data
* some structure segmentation by step 2050 on training data
* validation progress by epoch 3 (5000 steps)
* training results by epoch 16 (21,856 steps) have significant mistakes, but also some strong true positives
* validation progress from epoch 8 (12294 steps) to epoch 15 (21,856 steps) has surprisingly strong heatmaps, but many mistakes
"

export CUDA_VISIBLE_DEVICES=1
#export CUDA_VISIBLE_DEVICES=0
DVC_DATA_DPATH=$HOME/data/dvc-repos/shitspotter_dvc
DVC_EXPT_DPATH=$HOME/data/dvc-repos/shitspotter_expt_dvc
WORKDIR=$DVC_EXPT_DPATH/training/$HOSTNAME/$USER
(remaining unchanged lines of this file not shown)
185 changes: 185 additions & 0 deletions experiments/train_toothbrush_scratch_noboxes_v4.sh
@@ -0,0 +1,185 @@
#!/bin/bash
__doc__="
"

#export CUDA_VISIBLE_DEVICES=0
DVC_DATA_DPATH=$HOME/data/dvc-repos/shitspotter_dvc
DVC_EXPT_DPATH=$HOME/data/dvc-repos/shitspotter_expt_dvc
WORKDIR=$DVC_EXPT_DPATH/training/$HOSTNAME/$USER

DATASET_CODE=ShitSpotter
KWCOCO_BUNDLE_DPATH=$DVC_DATA_DPATH


TRAIN_FPATH=$KWCOCO_BUNDLE_DPATH/train_imgs5747_1e73d54f.kwcoco.zip
VALI_FPATH=$KWCOCO_BUNDLE_DPATH/vali_imgs691_99b22ad0.kwcoco.zip

inspect_kwcoco_files(){
kwcoco stats "$TRAIN_FPATH" "$VALI_FPATH"
kwcoco info "$VALI_FPATH" -g 1
kwcoco info "$VALI_FPATH" -v 1
#kwcoco info "$VALI_FPATH" -a 1
#geowatch stats "$TRAIN_FPATH" "$VALI_FPATH"
}
#inspect_kwcoco_files
EXPERIMENT_NAME="shitspotter_scratch_20240618_noboxes_v4"

CHANNELS="phone:(red|green|blue)"
DEFAULT_ROOT_DIR=$WORKDIR/$DATASET_CODE/runs/$EXPERIMENT_NAME
TARGET_LR=1e-4
WEIGHT_DECAY=$(python -c "print($TARGET_LR * 0.01)")
PERTERB_SCALE=$(python -c "print($TARGET_LR * 0.003)")
DEVICES=$(python -c "if 1:
    import os
    n = len(os.environ.get('CUDA_VISIBLE_DEVICES', '').split(','))
    print(','.join(list(map(str, range(n)))) + ',')
")
ACCELERATOR=gpu
STRATEGY=$(python -c "if 1:
    import os
    n = len(os.environ.get('CUDA_VISIBLE_DEVICES', '').split(','))
    print('ddp' if n > 1 else 'auto')
")
DDP_WORKAROUND=$(python -c "if 1:
    import os
    n = len(os.environ.get('CUDA_VISIBLE_DEVICES', '').split(','))
    print(int(n > 1))
")
echo "DEVICES = $DEVICES"
echo "DDP_WORKAROUND = $DDP_WORKAROUND"
echo "WEIGHT_DECAY = $WEIGHT_DECAY"


MAX_STEPS=163840
MAX_EPOCHS=120
TRAIN_BATCHES_PER_EPOCH=16384
VALI_BATCHES_PER_EPOCH=4096
ACCUMULATE_GRAD_BATCHES=12
BATCH_SIZE=2
TRAIN_ITEMS_PER_EPOCH=$(python -c "print($TRAIN_BATCHES_PER_EPOCH * $BATCH_SIZE)")
echo "TRAIN_ITEMS_PER_EPOCH = $TRAIN_ITEMS_PER_EPOCH"

python -m geowatch.cli.experimental.recommend_size_adjustments \
--MAX_STEPS=$MAX_STEPS \
--MAX_EPOCHS=$MAX_EPOCHS \
--BATCH_SIZE=$BATCH_SIZE \
--ACCUMULATE_GRAD_BATCHES=$ACCUMULATE_GRAD_BATCHES \
--TRAIN_BATCHES_PER_EPOCH="$TRAIN_BATCHES_PER_EPOCH" \
--TRAIN_ITEMS_PER_EPOCH="$TRAIN_ITEMS_PER_EPOCH"


# Find the most recent checkpoint (TODO add utility for this)
PREV_CHECKPOINT_TEXT=$(python -m geowatch.cli.experimental.find_recent_checkpoint --default_root_dir="$DEFAULT_ROOT_DIR")
echo "PREV_CHECKPOINT_TEXT = $PREV_CHECKPOINT_TEXT"
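# If a prior checkpoint was found, pass it to the trainer via --ckpt_path so the
# run resumes from it; otherwise the argument array stays empty and training
# starts from scratch.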
if [[ "$PREV_CHECKPOINT_TEXT" == "None" ]]; then
    PREV_CHECKPOINT_ARGS=()
else
    PREV_CHECKPOINT_ARGS=(--ckpt_path "$PREV_CHECKPOINT_TEXT")
fi
echo "${PREV_CHECKPOINT_ARGS[@]}"

#export TORCH_DISTRIBUTED_DEBUG=DETAIL



DDP_WORKAROUND=$DDP_WORKAROUND python -m geowatch.tasks.fusion fit --config "
data:
    select_videos : $SELECT_VIDEOS  # note: SELECT_VIDEOS is not set in this script, so this expands to an empty (null) value
    num_workers : 8
    train_dataset : $TRAIN_FPATH
    vali_dataset : $VALI_FPATH
    window_dims : '416,416'
    time_steps : 1
    time_sampling : uniform
    #time_kernel : '[0.0s,]'
    window_resolution : 0.5
    input_resolution : 0.5
    output_resolution : 0.5
    neg_to_pos_ratio : 1.0
    batch_size : $BATCH_SIZE
    normalize_perframe : false
    normalize_peritem : false
    max_items_per_epoch : $TRAIN_ITEMS_PER_EPOCH
    channels : '$CHANNELS'
    min_spacetime_weight : 0.6
    temporal_dropout_rate : 0.5
    channel_dropout_rate : 0.5
    modality_dropout_rate : 0.5
    temporal_dropout : 0.0
    channel_dropout : 0.05
    modality_dropout : 0.05
    mask_low_quality : False
    mask_samecolor_method : None
    observable_threshold : 0.0
    quality_threshold : 0.0
    weight_dilate : 5
    dist_weights : False
    use_centered_positives : True
    use_grid_positives : True
    use_grid_negatives : True
    normalize_inputs : 80960
    balance_areas : false
model:
    class_path: MultimodalTransformer
    init_args:
        saliency_weights : '{fg: 1.0, bg: 1.0}'
        class_weights : 'auto'
        tokenizer : linconv
        arch_name : smt_it_stm_s12
        decoder : mlp
        positive_change_weight : 1
        negative_change_weight : 0.01
        stream_channels : 16
        class_loss : 'dicefocal'
        saliency_loss : 'focal'
        saliency_head_hidden : 4
        change_head_hidden : 6
        class_head_hidden : 6
        global_change_weight : 0.00
        global_class_weight : 0.00
        global_box_weight : 0.00
        global_saliency_weight : 1.00
        multimodal_reduce : max
        continual_learning : false
        perterb_scale : $PERTERB_SCALE
optimizer:
    class_path: torch.optim.AdamW
    init_args:
        lr : $TARGET_LR
        weight_decay : $WEIGHT_DECAY
lr_scheduler:
    class_path: torch.optim.lr_scheduler.OneCycleLR
    init_args:
        max_lr: $TARGET_LR
        total_steps: $MAX_STEPS
        anneal_strategy: cos
        pct_start: 0.3
trainer:
    accumulate_grad_batches: $ACCUMULATE_GRAD_BATCHES
    default_root_dir : $DEFAULT_ROOT_DIR
    accelerator : $ACCELERATOR
    devices : $DEVICES
    strategy : $STRATEGY
    limit_train_batches : $TRAIN_BATCHES_PER_EPOCH
    limit_val_batches : $VALI_BATCHES_PER_EPOCH
    log_every_n_steps : 1
    check_val_every_n_epoch: 1
    enable_checkpointing: true
    enable_model_summary: true
    num_sanity_val_steps : 0
    max_epochs: $MAX_EPOCHS
    callbacks:
        - class_path: pytorch_lightning.callbacks.ModelCheckpoint
          init_args:
              monitor: val_loss
              mode: min
              save_top_k: 5
              filename: '{epoch:04d}-{step:06d}-{val_loss:.3f}.ckpt'
              save_last: true
torch_globals:
    float32_matmul_precision: auto
initializer:
    init: noop
" "${PREV_CHECKPOINT_ARGS[@]}"
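The DEVICES, STRATEGY, and DDP_WORKAROUND settings in the script above are each derived from CUDA_VISIBLE_DEVICES by a separate inline python -c snippet. For readability, here is the same logic consolidated into a single Python helper (the function name is only for illustration):

import os

def derive_device_settings():
    """Reproduce the DEVICES / STRATEGY / DDP_WORKAROUND derivation from the script."""
    # Note: splitting an unset or empty CUDA_VISIBLE_DEVICES still yields [''],
    # so the count falls back to 1 (single device, no DDP).
    n = len(os.environ.get('CUDA_VISIBLE_DEVICES', '').split(','))
    devices = ','.join(map(str, range(n))) + ','   # e.g. '0,1,' for two visible GPUs
    strategy = 'ddp' if n > 1 else 'auto'
    ddp_workaround = int(n > 1)
    return devices, strategy, ddp_workaround

print(derive_device_settings())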

(diffs for the remaining 4 changed files are not shown)