Commit: New table entry
Erotemic committed Jun 16, 2024
1 parent c254d2a commit 440fe7e
Showing 7 changed files with 239 additions and 3 deletions.
7 changes: 5 additions & 2 deletions README.rst
@@ -53,10 +53,10 @@ The `IPFS CID <https://docs.ipfs.tech/concepts/content-addressing/>`_ (Content I

.. code::
bafybeibw5xqmdiycd7vw5qqdf3ceidjbq3cv4taalkc3ruu3qeqmqdy6sm
bafybeia44hiextgcpjfvglib66gxziaf7jkvno63p7h7fsqkxi5vpgpvay
The dataset can be viewed in a web browser through an IPFS gateway:
https://ipfs.io/ipfs/bafybeibw5xqmdiycd7vw5qqdf3ceidjbq3cv4taalkc3ruu3qeqmqdy6sm
https://ipfs.io/ipfs/bafybeia44hiextgcpjfvglib66gxziaf7jkvno63p7h7fsqkxi5vpgpvay

If you have an IPFS node, please help keep this dataset alive and available by pinning it.
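
A minimal example of pinning (a sketch, assuming a local IPFS daemon is running; this is the current release CID):

.. code::

    ipfs pin add --progress bafybeia44hiextgcpjfvglib66gxziaf7jkvno63p7h7fsqkxi5vpgpvay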

@@ -81,6 +81,7 @@ Recent Updates
Check back for updates, but because this is a personal project, it might take
some time for it to fully drop.

* 2024-06-15 - Small image drop. Working on writeup. Training new models.
* 2024-05-21 - Slowing down release cycles. Still collecting images at roughly the same rate. CIDs for recent and previous releases are now in the CID table.
* 2024-03-30 - This includes recent models that have been performing reasonably well.
* 2024-02-29 - Going to change this year to be 1/3 validation, next update will have a new split. Will also rework this README eventually.
@@ -313,6 +314,8 @@ registration via the SIFT+RANSAC algorithm.
+-------------+----------+---------------------+-----------------------+-----------------------+--------------------------------------------------------------+
| 2024-05-21 | 6373 | ~2255 | 1640 | 2252 | bafybeidle54us5cdwpzzis4h52wjmtsk643gprx7nvvtd6g26mxq76kfjm |
+-------------+----------+---------------------+-----------------------+-----------------------+--------------------------------------------------------------+
| 2024-06-15 | 6545 | ~2313 | 1684 | 2311 | bafybeia44hiextgcpjfvglib66gxziaf7jkvno63p7h7fsqkxi5vpgpvay |
+-------------+----------+---------------------+-----------------------+-----------------------+--------------------------------------------------------------+


For further details, see the `Datasheet <DATASHEET.md>`_.
178 changes: 178 additions & 0 deletions experiments/train_toothbrush.sh
@@ -0,0 +1,178 @@
#!/bin/bash
#
export CUDA_VISIBLE_DEVICES=0,1
DVC_DATA_DPATH=$HOME/data/dvc-repos/shitspotter_dvc
DVC_EXPT_DPATH=$HOME/data/dvc-repos/shitspotter_expt_dvc
WORKDIR=$DVC_EXPT_DPATH/training/$HOSTNAME/$USER

DATASET_CODE=ShitSpotter
KWCOCO_BUNDLE_DPATH=$DVC_DATA_DPATH


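# Train/validation kwcoco splits (produced by shitspotter.make_splits; see update.sh)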
TRAIN_FPATH=$KWCOCO_BUNDLE_DPATH/train_imgs5747_1e73d54f.kwcoco.zip
VALI_FPATH=$KWCOCO_BUNDLE_DPATH/vali_imgs691_99b22ad0.kwcoco.zip

inspect_kwcoco_files(){
    kwcoco stats "$TRAIN_FPATH" "$VALI_FPATH"
    kwcoco info "$VALI_FPATH" -g 1
    kwcoco info "$VALI_FPATH" -v 1
    #kwcoco info "$VALI_FPATH" -a 1
    #geowatch stats "$TRAIN_FPATH" "$VALI_FPATH"
}
#inspect_kwcoco_files
EXPERIMENT_NAME="shitspotter_fromv28_newdata_20240615_v1"

CHANNELS="phone:(red|green|blue)"
DEFAULT_ROOT_DIR=$WORKDIR/$DATASET_CODE/runs/$EXPERIMENT_NAME
TARGET_LR=3e-4
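# Regularization and scheduler terms derived as fixed fractions of the target LR (note: ETA_MIN is computed here but not referenced later in this script)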
WEIGHT_DECAY=$(python -c "print($TARGET_LR * 0.01)")
PERTERB_SCALE=$(python -c "print($TARGET_LR * 0.003)")
ETA_MIN=$(python -c "print($TARGET_LR * 0.0001)")
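# Infer the device list, Lightning strategy, and DDP workaround flag from the number of visible CUDA devices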
DEVICES=$(python -c "if 1:
    import os
    n = len(os.environ.get('CUDA_VISIBLE_DEVICES', '').split(','))
    print(','.join(list(map(str, range(n)))) + ',')
")
ACCELERATOR=gpu
STRATEGY=$(python -c "if 1:
    import os
    n = len(os.environ.get('CUDA_VISIBLE_DEVICES', '').split(','))
    print('ddp' if n > 1 else 'auto')
")
DDP_WORKAROUND=$(python -c "if 1:
    import os
    n = len(os.environ.get('CUDA_VISIBLE_DEVICES', '').split(','))
    print(int(n > 1))
")
echo "DEVICES = $DEVICES"
echo "DDP_WORKAROUND = $DDP_WORKAROUND"
echo "WEIGHT_DECAY = $WEIGHT_DECAY"


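# Schedule and budget knobs. With BATCH_SIZE=2 and ACCUMULATE_GRAD_BATCHES=12, each effective optimizer step sees 24 images.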
MAX_STEPS=163840
MAX_EPOCHS=120
TRAIN_BATCHES_PER_EPOCH=16384
VALI_BATCHES_PER_EPOCH=4096
ACCUMULATE_GRAD_BATCHES=12
BATCH_SIZE=2
TRAIN_ITEMS_PER_EPOCH=$(python -c "print($TRAIN_BATCHES_PER_EPOCH * $BATCH_SIZE)")
echo "TRAIN_ITEMS_PER_EPOCH = $TRAIN_ITEMS_PER_EPOCH"

python -m geowatch.cli.experimental.recommend_size_adjustments \
--MAX_STEPS=$MAX_STEPS \
--MAX_EPOCHS=$MAX_EPOCHS \
--BATCH_SIZE=$BATCH_SIZE \
--ACCUMULATE_GRAD_BATCHES=$ACCUMULATE_GRAD_BATCHES \
--TRAIN_BATCHES_PER_EPOCH="$TRAIN_BATCHES_PER_EPOCH" \
--TRAIN_ITEMS_PER_EPOCH="$TRAIN_ITEMS_PER_EPOCH"


# Find the most recent checkpoint (TODO add utility for this)
PREV_CHECKPOINT=$(python -m geowatch.cli.experimental.find_recent_checkpoint --default_root_dir="$DEFAULT_ROOT_DIR")
echo "PREV_CHECKPOINT = $PREV_CHECKPOINT"

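# Warm-start initialization weights (the best checkpoint of the previous v028 run, per the filename)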
INITIALIZER=$DVC_DATA_DPATH/models/shitspotter_from_v027_halfres_v028-epoch=0179-step=000720-val_loss=0.005.ckpt.pt


DDP_WORKAROUND=$DDP_WORKAROUND python -m geowatch.tasks.fusion fit --config "
data:
    # select_videos : $SELECT_VIDEOS  # NOTE: SELECT_VIDEOS is never defined in this script; commented out to avoid an empty value
    num_workers : 0
    train_dataset : $TRAIN_FPATH
    vali_dataset : $VALI_FPATH
    window_dims : '416,416'
    time_steps : 1
    time_sampling : uniform
    #time_kernel : '[0.0s,]'
    window_resolution : 0.5
    input_resolution : 0.5
    output_resolution : 0.5
    neg_to_pos_ratio : 1.0
    batch_size : $BATCH_SIZE
    normalize_perframe : false
    normalize_peritem : false
    max_items_per_epoch : $TRAIN_ITEMS_PER_EPOCH
    channels : '$CHANNELS'
    min_spacetime_weight : 0.6
    temporal_dropout_rate : 0.5
    channel_dropout_rate : 0.5
    modality_dropout_rate : 0.5
    temporal_dropout : 0.0
    channel_dropout : 0.05
    modality_dropout : 0.05
    mask_low_quality : False
    mask_samecolor_method : None
    observable_threshold : 0.0
    quality_threshold : 0.0
    weight_dilate : 5
    dist_weights : False
    use_centered_positives : True
    use_grid_positives : True
    use_grid_negatives : True
    normalize_inputs : 80960
    balance_areas : false
model:
    class_path: MultimodalTransformer
    init_args:
        saliency_weights : '{fg: 1.0, bg: 1.0}'
        class_weights : 'auto'
        tokenizer : linconv
        arch_name : smt_it_stm_s24
        decoder : mlp
        positive_change_weight : 1
        negative_change_weight : 0.01
        stream_channels : 16
        class_loss : 'dicefocal'
        saliency_loss : 'focal'
        saliency_head_hidden : 4
        change_head_hidden : 6
        class_head_hidden : 6
        global_change_weight : 0.00
        global_class_weight : 0.00
        global_box_weight : 0.00
        global_saliency_weight : 1.00
        multimodal_reduce : max
        continual_learning : false
        perterb_scale : $PERTERB_SCALE
optimizer:
    class_path: torch.optim.AdamW
    init_args:
        lr : $TARGET_LR
        weight_decay : $WEIGHT_DECAY
lr_scheduler:
    class_path: torch.optim.lr_scheduler.OneCycleLR
    init_args:
        max_lr: $TARGET_LR
        total_steps: $MAX_STEPS
        anneal_strategy: cos
        pct_start: 0.3
trainer:
    accumulate_grad_batches: $ACCUMULATE_GRAD_BATCHES
    default_root_dir : $DEFAULT_ROOT_DIR
    accelerator : $ACCELERATOR
    devices : $DEVICES
    strategy : $STRATEGY
    limit_train_batches : $TRAIN_BATCHES_PER_EPOCH
    limit_val_batches : $VALI_BATCHES_PER_EPOCH
    log_every_n_steps : 1
    check_val_every_n_epoch: 1
    enable_checkpointing: true
    enable_model_summary: true
    num_sanity_val_steps : 0
    max_epochs: $MAX_EPOCHS
    callbacks:
        - class_path: pytorch_lightning.callbacks.ModelCheckpoint
          init_args:
              monitor: val_loss
              mode: min
              save_top_k: 5
              filename: '{epoch:04d}-{step:06d}-{val_loss:.3f}.ckpt'
              save_last: true
torch_globals:
    float32_matmul_precision: auto
initializer:
    init: $INITIALIZER
"
#--ckpt_path="$PREV_CHECKPOINT"
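# (Uncomment the line above and append it to the fit invocation to resume from the most recent checkpoint instead of warm-starting.)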
15 changes: 15 additions & 0 deletions papers/application-2024/main.tex
@@ -118,6 +118,21 @@ \subsection{Dataset Distribution}
Discuss distributing the dataset via IPFS versus centralized distribution
systems.

Decentralized Method - IPFS
Centralized Method - Girder

Observations:
* IPFS over HTTPS via public gateways does not always work well.
* IPFS usually works well when used via the CLI (see the sketch below).
* IPFS is easier to update.
* IPFS rehashes every file on update, which induces an O(N) scalability constraint.
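
As a concrete sketch of the CLI path (assumes a local IPFS daemon; the CID is the current dataset root from the README):

\begin{verbatim}
# Fetch the dataset root via the local daemon instead of an HTTPS gateway:
ipfs get bafybeia44hiextgcpjfvglib66gxziaf7jkvno63p7h7fsqkxi5vpgpvay

# Help keep the data available by pinning it:
ipfs pin add --progress bafybeia44hiextgcpjfvglib66gxziaf7jkvno63p7h7fsqkxi5vpgpvay
\end{verbatim}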


IPFS vs BitTorrent:
https://gist.github.com/liamzebedee/224494052fb6037d07a4293ceca9d6e7


\subsection{Experiments}

Measure the performance of our algorithm versus a baseline.
1 change: 1 addition & 0 deletions shitspotter/cid_revisions.txt
@@ -26,3 +26,4 @@ bafybeia2gphecs3pbrccwopg63aka7lxy5vj6btcwyazf47q6jlqjgagru
bafybeibw5xqmdiycd7vw5qqdf3ceidjbq3cv4taalkc3ruu3qeqmqdy6sm
bafybeidle54us5cdwpzzis4h52wjmtsk643gprx7nvvtd6g26mxq76kfjm
bafybeidle54us5cdwpzzis4h52wjmtsk643gprx7nvvtd6g26mxq76kfjm
bafybeia44hiextgcpjfvglib66gxziaf7jkvno63p7h7fsqkxi5vpgpvay
4 changes: 3 additions & 1 deletion shitspotter/phone_manager.py
@@ -58,7 +58,7 @@ def main(TransferConfig, cmdline=True, **kwargs):
    prepared_transfer_fpath = cache_dpath / 'prepared_transfer.pkl'
    if lock_fpath.exists():
        raise Exception(
            f'Previous transfer lockfile exists: {prepared_transfer_fpath}. '
            f'Previous transfer lockfile exists: {lock_fpath}. '
            'Needs to implement resume or cleanup dirty state')
    lock_fpath.touch()

@@ -362,6 +362,8 @@ class CopyManager:
"""
TODO: wrap some super fast protocol like rsync.
Progress bars like with dvc would be neat.
TODO: see kwutil CopyManager
"""
pass

3 changes: 3 additions & 0 deletions shitspotter/pin_table.txt
@@ -44,3 +44,6 @@ ipfs pin ls --type="recursive" --names | grep bafybeiczi4pn4na2iw7c66bpbf5rdr3ua
#ipfs pin add --progress --name shitspotter-2023-01-01-draft bafybeif2yoidrnrzbpofcdlvl33em5e6eoslk4ryb7pe6ployl7najdi7q



ipfs pin add --name shitspotter-assets-poop-2024-06-15-T163943 --progress -- bafybeie2sfu46vhjnjtamf6a2fsep64anoasqsi4yp5wc3r5trcaf6zkke
ipfs pin add --name shitspotter-2024-06-16 --progress bafybeia44hiextgcpjfvglib66gxziaf7jkvno63p7h7fsqkxi5vpgpvay
34 changes: 34 additions & 0 deletions update.sh
@@ -0,0 +1,34 @@
#!/bin/bash
__doc__="
This script updates the main kwcoco files based on labelme annotations and
produces the data splits.
"

echo "
To add a new cohort of data use
* Plug phone into computer.
* In USB preferences, enable 'File trasfer / Android Auto'.
* Run code to transfer and organize new images
.. code::
python -m shitspotter.phone_manager
* Add manual annotations with labelme
"

# Gather images and labelme annotations into the main kwcoco file
python -m shitspotter.gather

# Produce the train/vali splits
python -m shitspotter.make_splits

# Find likely before/after image pairs
python -m shitspotter.matching autofind_pair_hueristic

# Update the analysis plots
python -m shitspotter.plots update_analysis_plots
