paper is starting to look better
Erotemic committed Oct 22, 2024
1 parent 9f93c5c commit c537cfa
Showing 6 changed files with 179 additions and 58 deletions.
46 changes: 33 additions & 13 deletions dev/poc/estimate_train_resources.py
@@ -3,6 +3,33 @@
import ubelt as ub


def estimate_training_duration(checkpoint_fpaths):
    """
    Given a list of checkpoints, estimate how long training took.
    """
    import kwutil
    ckpt_times = []
    for ckpt_fpath in checkpoint_fpaths:
        ckpt_time = kwutil.datetime.coerce(ckpt_fpath.stat().st_mtime)
        ckpt_times.append(ckpt_time)

    if len(ckpt_times):
        # Use the spread of checkpoint modification times as a proxy for
        # how long training was running.
        min_dtime = min(ckpt_times)
        max_dtime = max(ckpt_times)
        duration = max_dtime - min_dtime
    else:
        min_dtime = None
        max_dtime = None
        duration = 0

    info = {
        'min_dtime': min_dtime,
        'max_dtime': max_dtime,
        'duration': duration,
    }
    return info
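
# A minimal usage sketch (the directory below is hypothetical; the helper only
# reads file modification times, so any folder of checkpoint files works):
#
#     import ubelt as ub
#     ckpts = list(ub.Path('/tmp/example_run/checkpoints').glob('*.ckpt'))
#     info = estimate_training_duration(ckpts)
#     print(info['duration'])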


class EstimateTrainResourcesCLI(scfg.DataConfig):
    run_dpath = '/data/joncrall/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs/'
    # param1 = scfg.Value(None, help='param1')
@@ -43,27 +70,20 @@ def main(cls, cmdline=1, **kwargs):

checkpoint_fpaths = list(ckpt_dpath.glob('*.ckpt'))
print(f'checkpoint_fpaths = {ub.urepr(checkpoint_fpaths, nl=1)}')

hparams_time = kwutil.datetime.coerce(hparams_fpath.stat().st_mtime)

ckpt_times = []
for cpkt_fpath in checkpoint_fpaths:
ckpt_time = kwutil.datetime.coerce(cpkt_fpath.stat().st_mtime)
ckpt_times.append(ckpt_time)

min_dtime = min(hparams_time, *ckpt_times)
max_dtime = max(hparams_time, *ckpt_times)
duration = max_dtime - min_dtime
# hparams_time = kwutil.datetime.coerce(hparams_fpath.stat().st_mtime)
info = estimate_training_duration(checkpoint_fpaths)
# min_dtime = min(hparams_time, *ckpt_times)
# max_dtime = max(hparams_time, *ckpt_times)
# duration = max_dtime - min_dtime
row = {
'expt_name': expt_name,
'dpath': dpath,
'duration': duration,
**info,
}
rows.append(row)

# Infer more information from each training directory
# including lineage
import kwutil
for row in ub.ProgIter(rows, desc='Loading more train info'):
dpath = row['dpath']

33 changes: 17 additions & 16 deletions papers/application-2024/appendix.tex
@@ -133,17 +133,17 @@ \section{Extra Dataset Comparison}
\Cref{fig:combo_polygon_centroid_relative_distribution} shows the distribution of centroid positions (relative to the image size).


\begin{figure*}[ht]
\centering
\includegraphics[width=1.0\textwidth]{plots/appendix/dataset_compare/combo_all_polygons.png.png}
\caption[]{
A comparison of all of the annotations for different datasets including ours.
All polygon annotations drawn in a single plot with 0.8 opacity to
demonstrate the distribution in annotation location, shape, and size with
respect to image coordinates.
}
\label{fig:compare_allannots}
\end{figure*}
%\begin{figure*}[ht]
%\centering
%\includegraphics[width=1.0\textwidth]{plots/appendix/dataset_compare/combo_all_polygons.png.png}
%\caption[]{
% A comparison of all of the annotations for different datasets including ours.
% All polygon annotations drawn in a single plot with 0.8 opacity to
% demonstrate the distribution in annotation location, shape, and size with
% respect to image coordinates.
%}
%\label{fig:compare_allannots}
%\end{figure*}


\begin{figure*}[ht]
@@ -197,13 +197,14 @@ \section{Extra Dataset Comparison}
\end{figure*}


\section{GeoWATCH Models}
\section{VIT-sseg Models}
\label{sec:models}

This section provides more details about the training of GeoWATCH models.
This section provides more details about the training of VIT-sseg models.

We use the training, prediction, and evaluation system presented in \cite{Greenwell_2024_WACV,
crall_geowatch_2024}, which utilizes polygon annotations to train a pixelwise binary segmentation model.
To train VIT-sseg models we use the training, prediction, and evaluation system presented in
\cite{Greenwell_2024_WACV, crall_geowatch_2024}, which utilizes polygon annotations to train a pixelwise
binary segmentation model.
%It is important to note that this baseline is limited in that it only considers a single VIT-based
% \cite{dosovitskiy_image_2021} architecture, and does not attempt to explore all state-of-the-art methods.
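
As a rough sketch of the idea (illustrative only, not the actual GeoWATCH data loader; the image size and polygon coordinates below are made up), a polygon annotation can be rasterized into the binary mask that supervises the pixelwise model:

import numpy as np
import cv2

# Hypothetical polygon in (x, y) pixel coordinates for a 256x256 image.
polygon = np.array([[120, 80], [200, 90], [210, 160], [130, 170]], dtype=np.int32)

# Rasterize the polygon into a binary target mask; a per-pixel loss (e.g.
# binary cross entropy) is then computed between the model heatmap and this mask.
mask = np.zeros((256, 256), dtype=np.uint8)
cv2.fillPoly(mask, [polygon], 1)
print(mask.sum(), 'foreground pixels')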

@@ -237,7 +238,7 @@ \section{GeoWATCH Models}
Our effective batch size is 24, with a real batch size of 2 and 12 gradient accumulation steps.
This setup consumes approximately 20 GB of GPU RAM during training.
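
For readers unfamiliar with gradient accumulation, here is a minimal sketch of the mechanism in plain PyTorch (the model, data, and learning rate are placeholders; the real training loop is handled by the framework described above):

import torch

model = torch.nn.Linear(16, 1)                      # placeholder model
optim = torch.optim.AdamW(model.parameters(), lr=1e-4)
accum_steps = 12                                    # accumulate 12 gradient steps
loader = [(torch.randn(2, 16), torch.randn(2, 1)) for _ in range(24)]  # real batch size of 2

optim.zero_grad()
for step, (x, y) in enumerate(loader, start=1):
    loss = torch.nn.functional.mse_loss(model(x), y)
    (loss / accum_steps).backward()                 # average gradients over the effective batch
    if step % accum_steps == 0:                     # effective batch size = 2 * 12 = 24
        optim.step()
        optim.zero_grad()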

\subsection{GeoWATCH Model Experiments}
\subsection{VIT-sseg Model Experiments}

To establish a strong baseline, we evaluated 35 training runs where we varied input resolutions, window
sizes, model depth, and other parameters.
92 changes: 69 additions & 23 deletions papers/application-2024/main.tex
@@ -14,7 +14,7 @@
\nonanonymoustrue % comment out to be anonymous

\newif\ifuseappendix
%\useappendixtrue % comment out to remove appendix
\useappendixtrue % comment out to remove appendix

\newif\ifuseacknowledgement
\useacknowledgementtrue % comment out to remove acknowledgements
@@ -102,6 +102,7 @@
}
\maketitle


%%%%%%%%% ABSTRACT
\begin{abstract}

@@ -186,13 +187,27 @@ \section{Introduction}
Image $W \times H$ is the pixel width and height of the image with the median area. Annot Area$^{0.5}$ is the median sqrt(area) in pixels of the annotation polygon or box. The Size column refers to the amount of information in gigabytes needed to download the entire dataset.
Annot Type refers to whether the dataset is annotated with bounding boxes,
image-level classification labels, or polygon segmentations.
%Of the datasets in this table, ours has the highest image resolution
\Cref{fig:compare_allannots} provides a visual gist of the distribution of annotation shape, size, and position in each dataset.
%Of the datasets in this table, ours has the highest image resolution.
%and the smallest annotation size relative to that resolution.
% Of the waste related datasets, ours is among the largest, and of the poop related datasets, it is the largest.
%Of the waste related datasets and in terms of number of images, ours is among the largest, and of the poop related datasets, it is the largest.
}
\label{tab:related_datasets}
\end{table*}

\begin{figure*}[ht]
\centering
\includegraphics[width=1.0\textwidth]{plots/appendix/dataset_compare/combo_all_polygons.png.png}
\caption[]{
A comparison of all of the annotations for different datasets including ours.
All polygon annotations are drawn in a single plot with $0.8$ opacity to
demonstrate the distribution of annotation location, shape, and size with
respect to image coordinates.
}
\label{fig:compare_allannots}
\end{figure*}


In addition to enabling several applications, poop detection is an interesting benchmark problem.
It is relatively simple, with a narrow focus on a single class, making it suitable for exploring the
capabilities of object detection models that target a single labeled class.
@@ -287,7 +302,7 @@ \section{Related Work}
In \Cref{sec:distribution} we will discuss the logistics and tradeoffs between different mechanisms to
distribute datasets with a focus on comparing centralized and decentralized methods.
IPFS~\cite{benet_ipfs_2014} and BitTorrent~\cite{cohen_incentives_2003} are the decentralized distribution
mechanism we evaluate, but there are others such as Secure Scuttlebut \cite{tarr_secure_2019} and Hypercore
mechanisms we evaluate, but there are others such as Secure Scuttlebutt \cite{tarr_secure_2019} and Hypercore
\cite{frazee_dep-0002_nodate}, which we did not test.

% Is hypercore dat? https://dat-ecosystem.org/ https://datproject.org/
@@ -340,7 +355,8 @@ \subsection{Dataset Collection}

In addition to the primary dataset, we also received 84 images from contributors.
Most of these images do not follow the B/A/N protocol and are marked for use only in testing and are
\emph{not} included in the following analysis unless explicitly noted.
\emph{not} included in the following analysis.
%unless explicitly noted.


\subsection{Dataset Annotation}
@@ -373,11 +389,13 @@ \subsection{Dataset Annotation}
\centering
\includegraphics[width=.4\textwidth]{figures/umap-screenshot-edited.png}%
\caption[]{
Example images from the dataset based on UMAP \cite{mcinnes_umap_2020}
clusters over a 200 image subset of the dataset. Each row corresponds to a
selection from a 2D UMap projection shown on the left. The highlighted
nodes circled in blue in the cluster visualization in each row correspond
to the images shown on the right.
Example images from the dataset based on UMAP \cite{mcinnes_umap_2020} clusters over a 200 image subset
of the dataset.
Each row corresponds to a selection from a 2D UMAP projection shown on the left.
The highlighted nodes circled in blue in the cluster visualization in each row correspond to the images
with annotations (drawn in green) shown on the right.
An interesting observation is that the clear separation into two UMAP blobs represents snowy versus
non-snowy images.
}
\label{fig:umap_dataset_viz}
\end{figure}
@@ -485,9 +503,9 @@ \section{Baseline Models}
\centering
\begin{tabular}{ll|rrrr|rrrr}
\toprule
dataset split: & {} & \multicolumn{4}{c}{Test} & \multicolumn{4}{c}{Validation} \\
evaluation type: & {} & \multicolumn{2}{c}{Box} & \multicolumn{2}{c}{Pixel} & \multicolumn{2}{c}{Box} & \multicolumn{2}{c}{Pixel} \\
{} & \# params & AP & AUC & AP & AUC & AP & AUC & AP & AUC \\
\multicolumn{2}{c}{Dataset split:} & \multicolumn{4}{c}{Test} & \multicolumn{4}{c}{Validation} \\
\multicolumn{2}{c}{Evaluation type:} & \multicolumn{2}{c}{Box} & \multicolumn{2}{c}{Pixel} & \multicolumn{2}{c}{Box} & \multicolumn{2}{c}{Pixel} \\
Model type & \# Params & AP & AUC & AP & AUC & AP & AUC & AP & AUC \\
\midrule
MaskRCNN-pretrained & 43.9e6 & 0.661 & 0.692 & 0.847 & 0.858 & 0.612 & 0.721 & 0.858 & 0.905 \\
MaskRCNN-scratch & 43.9e6 & 0.384 & 0.573 & 0.581 & 0.804 & 0.255 & 0.576 & 0.434 & 0.891 \\
@@ -507,7 +525,8 @@ \section{Baseline Models}
All metrics were computed using scikit-learn \cite{scikit-learn}.
We note an important limitation of our results: much more time was spent
tuning the VIT-sseg model. It is likely that MaskRCNN results could be
improved with further tuning. But these are baseline models; our core contribution is the dataset.
improved with further tuning.
%But these are baseline models; our core contribution is the dataset.
}
\label{tab:model_results}
\end{table*}
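
As a sketch of how the AP and AUC columns can be computed with scikit-learn (the labels and scores below are placeholders rather than actual model outputs):

import numpy as np
from sklearn.metrics import average_precision_score, roc_auc_score

y_true = np.array([0, 0, 1, 1, 0, 1])                 # placeholder binary labels
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.9])   # placeholder predicted scores
ap = average_precision_score(y_true, y_score)         # area under the precision-recall curve
auc = roc_auc_score(y_true, y_score)                  # area under the ROC curve
print(f'AP={ap:.3f} AUC={auc:.3f}')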
Expand Down Expand Up @@ -537,6 +556,9 @@ \section{Baseline Models}
The second row in each subfigure is the predicted heatmap, illustrating the model's output before
binarization.
The threshold for binarization was set to $0.5$ in all cases.
All three methods show clear responses to objects of interest, but close-up and partially deteriorated
objects appear to be a common failure mode.

}
\label{fig:test_results_all_models}
\end{figure*}
@@ -559,6 +581,11 @@ \section{Baseline Models}
\caption[]{
Qualitative results using the top-performing model on the validation set, applied to a selection of
images from the validation set. See \Cref{fig:test_results_all_models} for an explanation of the visualizations.
Each model was selected based on its performance on this dataset, which may cause spurious agreement with
the truth labels; however, because this dataset was never used to compute a gradient, these remain valuable
results for assessing generalizability.
Notably, the models were able to pick out the camouflaged cases on the left.
}
\label{fig:vali_results_all_models}
\end{figure*}
Expand Down Expand Up @@ -642,16 +669,34 @@ \section{Baseline Models}
%\label{fig:test_heatmaps_with_best_vali_model}
%\end{figure*}

All models were trained on a single machine with an 11900k CPU and a single 3090 GPU.

Overall, prediction and evaluation on trained models took 15.6 days with prediction consuming 109.6.3 kWh of
electricity and causing an estimated emissions of 23.0 \cotwo kg as measured by CodeCarbon
\cite{lacoste2019codecarbon}.
All models were trained on a single machine with an Intel Core i9-11900K CPU and an NVIDIA GeForce RTX 3090
GPU.
The total time spent on prediction and evaluation across all experiments was 15.6 days, with prediction
consuming 109.6 kWh of electricity and causing estimated emissions of 23.0 \cotwo kg as measured by
CodeCarbon \cite{lacoste2019codecarbon}.
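
A minimal sketch of how CodeCarbon can wrap a prediction run to produce such measurements (the project name and the wrapped workload are placeholders):

from codecarbon import EmissionsTracker

tracker = EmissionsTracker(project_name='shitspotter-predict')
tracker.start()
# ... run the prediction / evaluation workload here ...
emissions_kg = tracker.stop()   # estimated kg of CO2-equivalent for the tracked block
print(f'estimated emissions: {emissions_kg:.3f} kg CO2eq')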

todo: train time resource usage for maskrcnn and vit, reacnknowledge
limitation, break down results over each.
We estimated training resource usage indirectly, assuming a constant power draw of 345 W from the RTX 3090
GPU.
Electricity consumption was approximated accordingly, while emissions were calculated using a conversion
ratio of 0.21 $\frac{\textrm{kg}\cotwo{}}{\textrm{kWh}}$ derived from our prediction-time measurements.
Based on file timestamps, we estimated that the 44 training runs took approximately 159.66 days, resulting
in an estimated electricity usage and emissions of 1321.99 kWh and 277.612 $\cotwo$ kg, respectively.
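
These numbers follow from a simple back-of-envelope calculation (a sketch, assuming the 345 W draw applies for the full wall-clock duration):

train_days = 159.66              # total training wall-clock time estimated from file timestamps
gpu_kw = 0.345                   # assumed constant RTX 3090 power draw in kW
kwh = train_days * 24 * gpu_kw   # ~1322 kWh, consistent with the reported 1321.99 kWh
co2_kg = kwh * 0.21              # ~277.6 kg CO2, consistent with the reported 277.612 kg
print(f'{kwh:.2f} kWh, {co2_kg:.2f} kg CO2')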

A key limitation of these results is the imbalance between model types: 42 of the 44 trained models were
VIT-ssegs, and only two were MaskRCNN models, each of which took approximately 8 hours to train.
More details on the VIT-sseg experiments can be found in the supplemental materials.


Report training time, energy usage, and carbon footprint with details in supplemental materials.
%train$^{*}$ & time & 158.95 days & 3.78 days & 42 \\
%train$^{*}$ & electricity & 1,316.07 kWh & 31.34 kWh & 42 \\
%train$^{*}$ & emissions & 276.37 \cotwo kg & 6.58 \cotwo kg & 42 \\

%todo: train time resource usage for maskrcnn and vit, reacnknowledge
%limitation, break down results over each.

%Report training time, energy usage, and carbon footprint with details in supplemental materials.



@@ -744,8 +789,9 @@ \section{Open Data Distribution}
% BitTorrent can be vulnerable to MITM:
% https://www.reddit.com/r/technology/comments/1dpinuw/south_korean_telecom_company_attacks_torrent/

The reproducibility "crisis" in science has raised concerns across various disciplines
\cite{baker_reproducibility_2016}.

Empirical evidence suggests that a substantial proportion of scientific studies cannot be reproduced, which
has raised concerns across various disciplines \cite{baker_reproducibility_2016}.
Ideally, all scientific research should be independently reproducible.
Despite higher success rates in computer science (up to 60\%) compared to other fields, there is still room for improvement
\cite{NEURIPS2019_c429429b, collberg2016repeatability, desai_what_2024}.
3 changes: 3 additions & 0 deletions papers/application-2024/scripts/build_v2_result_table.py
@@ -715,6 +715,9 @@ def main():
grouped['co2_kg'].sum()

detectron = deduped[deduped['dname'].str.contains('detectron')]
print('time', kwutil.timedelta.coerce(detectron['duration'].sum()).to('pint').to('days'))
print('kwh', detectron['kwh'].sum())
print('co2', detectron['co2_kg'].sum())

for key, group in list(deduped.groupby(['node_type', 'dname', 'dataset_name'])):
print(key)
49 changes: 49 additions & 0 deletions papers/application-2024/scripts/estimate_training_resources.py
@@ -3,6 +3,9 @@
python ~/code/shitspotter/dev/poc/estimate_train_resources.py
"""
import kwutil.util_units
import ubelt as ub
import sys
sys.path.append(ub.expandpath('~/code/shitspotter/dev/poc'))

reg = kwutil.util_units.unit_registry()
# gpu_power = 350 * reg.watt
@@ -24,3 +27,49 @@

cost_to_offset = dollar_per_kg * co2_kg
print(f'cost_to_offset = ${cost_to_offset:4.2f}')


# Detectron training results
runs_dpath = ub.Path('$HOME/data/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs').expand()
detectron_dpaths = [
    runs_dpath / 'train_baseline_maskrcnn_scratch_v4',
    runs_dpath / 'train_baseline_maskrcnn_v3',
]
from estimate_train_resources import estimate_training_duration, find_offset_cost  # NOQA

rows = []
for train_dpath in detectron_dpaths:
    for dpath in train_dpath.ls():
        if dpath.is_dir():
            checkpoint_paths = list(dpath.glob('*.pth'))
            info = estimate_training_duration(checkpoint_paths)
            info['duration_human'] = kwutil.timedelta.coerce(
                info['duration']).format(unit='auto', precision=2)
            info['num_checkpoints'] = len(checkpoint_paths)
            info['dpath'] = dpath
            info.update(find_offset_cost(info['duration']))
            rows.append(info)


print(f'rows = {ub.urepr(rows, nl=2)}')

"""
GeoWatch:
train$^{*}$ & time & 158.95 days
train$^{*}$ & electricity & 1,316.07 kWh
train$^{*}$ & emissions & 276.37 \cotwo kg
Detectron
17.0 hours
1.2426682788143752 CO2
5.917467994354167 kWh
total:
159.66 days
1321.99 kWh
277.612 \cotwo kg
"""