paper is starting to look better
Erotemic committed Oct 22, 2024
1 parent 9f93c5c commit c537cfa
Showing 6 changed files with 179 additions and 58 deletions.
46 changes: 33 additions & 13 deletions dev/poc/estimate_train_resources.py
@@ -3,6 +3,33 @@
import ubelt as ub


def estimate_training_duration(checkpoint_fpaths):
    """
    Given a list of checkpoints, estimate how long training took.
    """
    import kwutil
    ckpt_times = []
    for ckpt_fpath in checkpoint_fpaths:
        ckpt_time = kwutil.datetime.coerce(ckpt_fpath.stat().st_mtime)
        ckpt_times.append(ckpt_time)

    if len(ckpt_times):
        # Use the spread of checkpoint modification times as a proxy for
        # how long training was running.
        min_dtime = min(ckpt_times)
        max_dtime = max(ckpt_times)
        duration = max_dtime - min_dtime
    else:
        min_dtime = None
        max_dtime = None
        duration = 0

    info = {
        'min_dtime': min_dtime,
        'max_dtime': max_dtime,
        'duration': duration,
    }
    return info
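
# A minimal usage sketch (the directory below is hypothetical; the helper only
# reads file modification times, so any folder of checkpoint files works):
#
#     import ubelt as ub
#     ckpts = list(ub.Path('/tmp/example_run/checkpoints').glob('*.ckpt'))
#     info = estimate_training_duration(ckpts)
#     print(info['duration'])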


class EstimateTrainResourcesCLI(scfg.DataConfig):
    run_dpath = '/data/joncrall/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs/'
    # param1 = scfg.Value(None, help='param1')
@@ -43,27 +70,20 @@ def main(cls, cmdline=1, **kwargs):

checkpoint_fpaths = list(ckpt_dpath.glob('*.ckpt'))
print(f'checkpoint_fpaths = {ub.urepr(checkpoint_fpaths, nl=1)}')

hparams_time = kwutil.datetime.coerce(hparams_fpath.stat().st_mtime)

ckpt_times = []
for cpkt_fpath in checkpoint_fpaths:
ckpt_time = kwutil.datetime.coerce(cpkt_fpath.stat().st_mtime)
ckpt_times.append(ckpt_time)

min_dtime = min(hparams_time, *ckpt_times)
max_dtime = max(hparams_time, *ckpt_times)
duration = max_dtime - min_dtime
# hparams_time = kwutil.datetime.coerce(hparams_fpath.stat().st_mtime)
info = estimate_training_duration(checkpoint_fpaths)
# min_dtime = min(hparams_time, *ckpt_times)
# max_dtime = max(hparams_time, *ckpt_times)
# duration = max_dtime - min_dtime
row = {
'expt_name': expt_name,
'dpath': dpath,
'duration': duration,
**info,
}
rows.append(row)

# Infer more information from each training directory
# including lineage
import kwutil
for row in ub.ProgIter(rows, desc='Loading more train info'):
dpath = row['dpath']

33 changes: 17 additions & 16 deletions papers/application-2024/appendix.tex
@@ -133,17 +133,17 @@ \section{Extra Dataset Comparison}
\Cref{fig:combo_polygon_centroid_relative_distribution} shows the distribution of centroid positions (relative to the image size).


\begin{figure*}[ht]
\centering
\includegraphics[width=1.0\textwidth]{plots/appendix/dataset_compare/combo_all_polygons.png.png}
\caption[]{
A comparison of all of the annotations for different datasets including ours.
All polygon annotations drawn in a single plot with 0.8 opacity to
demonstrate the distribution in annotation location, shape, and size with
respect to image coordinates.
}
\label{fig:compare_allannots}
\end{figure*}
%\begin{figure*}[ht]
%\centering
%\includegraphics[width=1.0\textwidth]{plots/appendix/dataset_compare/combo_all_polygons.png.png}
%\caption[]{
% A comparison of all of the annotations for different datasets including ours.
% All polygon annotations drawn in a single plot with 0.8 opacity to
% demonstrate the distribution in annotation location, shape, and size with
% respect to image coordinates.
%}
%\label{fig:compare_allannots}
%\end{figure*}


\begin{figure*}[ht]
@@ -197,13 +197,14 @@ \section{Extra Dataset Comparison}
\end{figure*}


\section{GeoWATCH Models}
\section{VIT-sseg Models}
\label{sec:models}

This section provides more details about the training of GeoWATCH models.
This section provides more details about the training of VIT-sseg models.

We use the training, prediction, and evaluation system presented in \cite{Greenwell_2024_WACV,
crall_geowatch_2024}, which utilizes polygon annotations to train a pixelwise binary segmentation model.
To train VIT-sseg models we use the training, prediction, and evaluation system presented in
\cite{Greenwell_2024_WACV, crall_geowatch_2024}, which utilizes polygon annotations to train a pixelwise
binary segmentation model.
%It is important to note that this baseline is limited in that it only considers a single VIT-based
% \cite{dosovitskiy_image_2021} architecture, and does not attempt to explore all state-of-the-art methods.
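
As a rough sketch of the idea (illustrative only, not the actual GeoWATCH data loader; the image size and polygon coordinates below are made up), a polygon annotation can be rasterized into the binary mask that supervises the pixelwise model:

import numpy as np
import cv2

# Hypothetical polygon in (x, y) pixel coordinates for a 256x256 image.
polygon = np.array([[120, 80], [200, 90], [210, 160], [130, 170]], dtype=np.int32)

# Rasterize the polygon into a binary target mask; a per-pixel loss (e.g.
# binary cross entropy) is then computed between the model heatmap and this mask.
mask = np.zeros((256, 256), dtype=np.uint8)
cv2.fillPoly(mask, [polygon], 1)
print(mask.sum(), 'foreground pixels')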

@@ -237,7 +238,7 @@ \section{GeoWATCH Models}
Our effective batch size is 24, with a real batch size of 2 and 12 gradient accumulation steps.
This setup consumes approximately 20 GB of GPU RAM during training.
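
For readers unfamiliar with gradient accumulation, here is a minimal sketch of the mechanism in plain PyTorch (the model, data, and learning rate are placeholders; the real training loop is handled by the framework described above):

import torch

model = torch.nn.Linear(16, 1)                      # placeholder model
optim = torch.optim.AdamW(model.parameters(), lr=1e-4)
accum_steps = 12                                    # accumulate 12 gradient steps
loader = [(torch.randn(2, 16), torch.randn(2, 1)) for _ in range(24)]  # real batch size of 2

optim.zero_grad()
for step, (x, y) in enumerate(loader, start=1):
    loss = torch.nn.functional.mse_loss(model(x), y)
    (loss / accum_steps).backward()                 # average gradients over the effective batch
    if step % accum_steps == 0:                     # effective batch size = 2 * 12 = 24
        optim.step()
        optim.zero_grad()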

\subsection{GeoWATCH Model Experiments}
\subsection{VIT-sseg Model Experiments}

To establish a strong baseline, we evaluated 35 training runs where we varied input resolutions, window
sizes, model depth, and other parameters.
92 changes: 69 additions & 23 deletions papers/application-2024/main.tex
@@ -14,7 +14,7 @@
\nonanonymoustrue % comment out to be anonymous

\newif\ifuseappendix
%\useappendixtrue % comment out to remove appendix
\useappendixtrue % comment out to remove appendix

\newif\ifuseacknowledgement
\useacknowledgementtrue % comment out to remove acknowledgements
@@ -102,6 +102,7 @@
}
\maketitle


%%%%%%%%% ABSTRACT
\begin{abstract}

@@ -186,13 +187,27 @@ \section{Introduction}
Image $W \times H$ is the pixel width and height of the image with the median area. Annot Area$^{0.5}$ is the median sqrt(area) in pixels of the annotation polygon or box. The Size column refers to the amount of information in gigabytes needed to download the entire dataset.
Annot Type refers to whether the dataset is annotated with bounding boxes,
image-level classification labels, or polygon segmentations.
%Of the datasets in this table, ours has the highest image resolution
\Cref{fig:compare_allannots} provides a visual gist of the distribution of annotation shape, size, and position in each dataset.
%Of the datasets in this table, ours has the highest image resolution.
%and the smallest annotation size relative to that resolution.
% Of the waste related datasets, ours is among the largest, and of the poop related datasets, it is the largest.
%Of the waste related datasets and in terms of number of images, ours is among the largest, and of the poop related datasets, it is the largest.
}
\label{tab:related_datasets}
\end{table*}

\begin{figure*}[ht]
\centering
\includegraphics[width=1.0\textwidth]{plots/appendix/dataset_compare/combo_all_polygons.png.png}
\caption[]{
A comparison of all of the annotations for different datasets including ours.
All polygon annotations are drawn in a single plot with $0.8$ opacity to
demonstrate the distribution of annotation location, shape, and size with
respect to image coordinates.
}
\label{fig:compare_allannots}
\end{figure*}


In addition to enabling several applications, poop detection is an interesting benchmark problem.
It is relatively simple, with a narrow focus on a single class, making it suitable for exploring the
capabilities of object detection models that target a single labeled class.
@@ -287,7 +302,7 @@ \section{Related Work}
In \Cref{sec:distribution} we will discuss the logistics and tradeoffs between different mechanisms to
distribute datasets with a focus on comparing centralized and decentralized methods.
IPFS~\cite{benet_ipfs_2014} and BitTorrent~\cite{cohen_incentives_2003} are the decentralized distribution
mechanism we evaluate, but there are others such as Secure Scuttlebut \cite{tarr_secure_2019} and Hypercore
mechanisms we evaluate, but there are others such as Secure Scuttlebutt \cite{tarr_secure_2019} and Hypercore
\cite{frazee_dep-0002_nodate}, which we did not test.

% Is hypercore dat? https://dat-ecosystem.org/ https://datproject.org/
@@ -340,7 +355,8 @@ \subsection{Dataset Collection}

In addition to the primary dataset, we also received 84 images from contributors.
Most of these images do not follow the B/A/N protocol and are marked for use only in testing and are
\emph{not} included in the following analysis unless explicitly noted.
\emph{not} included in the following analysis.
%unless explicitly noted.


\subsection{Dataset Annotation}
@@ -373,11 +389,13 @@ \subsection{Dataset Annotation}
\centering
\includegraphics[width=.4\textwidth]{figures/umap-screenshot-edited.png}%
\caption[]{
Example images from the dataset based on UMAP \cite{mcinnes_umap_2020}
clusters over a 200 image subset of the dataset. Each row corresponds to a
selection from a 2D UMap projection shown on the left. The highlighted
nodes circled in blue in the cluster visualization in each row correspond
to the images shown on the right.
Example images from the dataset based on UMAP \cite{mcinnes_umap_2020} clusters over a 200 image subset
of the dataset.
Each row corresponds to a selection from a 2D UMAP projection shown on the left.
The highlighted nodes circled in blue in the cluster visualization in each row correspond to the images
with annotations (drawn in green) shown on the right.
An interesting observation is that the clear separation into two UMAP blobs represents snowy versus
non-snowy images.
}
\label{fig:umap_dataset_viz}
\end{figure}
@@ -485,9 +503,9 @@ \section{Baseline Models}
\centering
\begin{tabular}{ll|rrrr|rrrr}
\toprule
dataset split: & {} & \multicolumn{4}{c}{Test} & \multicolumn{4}{c}{Validation} \\
evaluation type: & {} & \multicolumn{2}{c}{Box} & \multicolumn{2}{c}{Pixel} & \multicolumn{2}{c}{Box} & \multicolumn{2}{c}{Pixel} \\
{} & \# params & AP & AUC & AP & AUC & AP & AUC & AP & AUC \\
\multicolumn{2}{c}{Dataset split:} & \multicolumn{4}{c}{Test} & \multicolumn{4}{c}{Validation} \\
\multicolumn{2}{c}{Evaluation type:} & \multicolumn{2}{c}{Box} & \multicolumn{2}{c}{Pixel} & \multicolumn{2}{c}{Box} & \multicolumn{2}{c}{Pixel} \\
Model type & \# Params & AP & AUC & AP & AUC & AP & AUC & AP & AUC \\
\midrule
MaskRCNN-pretrained & 43.9e6 & 0.661 & 0.692 & 0.847 & 0.858 & 0.612 & 0.721 & 0.858 & 0.905 \\
MaskRCNN-scratch & 43.9e6 & 0.384 & 0.573 & 0.581 & 0.804 & 0.255 & 0.576 & 0.434 & 0.891 \\
@@ -507,7 +525,8 @@ \section{Baseline Models}
All metrics were computed using scikit-learn \cite{scikit-learn}.
We note an important limitation of our results: much more time was spent
tuning the VIT-sseg model. It is likely that MaskRCNN results could be
improved with further tuning. But these are baseline models; our core contribution is the dataset.
improved with further tuning.
%But these are baseline models; our core contribution is the dataset.
}
\label{tab:model_results}
\end{table*}
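
As a sketch of how the AP and AUC columns can be computed with scikit-learn (the labels and scores below are placeholders rather than actual model outputs):

import numpy as np
from sklearn.metrics import average_precision_score, roc_auc_score

y_true = np.array([0, 0, 1, 1, 0, 1])                 # placeholder binary labels
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.9])   # placeholder predicted scores
ap = average_precision_score(y_true, y_score)         # area under the precision-recall curve
auc = roc_auc_score(y_true, y_score)                  # area under the ROC curve
print(f'AP={ap:.3f} AUC={auc:.3f}')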
Expand Down Expand Up @@ -537,6 +556,9 @@ \section{Baseline Models}
The second row in each subfigure is the predicted heatmap, illustrating the model's output before
binarization.
The threshold for binarization was set to $0.5$ in all cases.
All three methods show clear responses to objects of interest, but close-up and partially deteriorated
objects appear to be a common failure mode.

}
\label{fig:test_results_all_models}
\end{figure*}
@@ -559,6 +581,11 @@ \section{Baseline Models}
\caption[]{
Qualitative results using the top-performing model on the validation set, applied to a selection of
images from the validation set. See \Cref{fig:test_results_all_models} for an explanation of the visualizations.
Each model was selected based on its performance on this dataset, which may cause spurious agreement with
the truth labels; however, because this dataset was never used to compute a gradient, these remain valuable
results for assessing generalizability.
Notably, the models were able to pick out the camouflaged cases on the left.
}
\label{fig:vali_results_all_models}
\end{figure*}
Expand Down Expand Up @@ -642,16 +669,34 @@ \section{Baseline Models}
%\label{fig:test_heatmaps_with_best_vali_model}
%\end{figure*}

All models were trained on a single machine with an 11900k CPU and a single 3090 GPU.

Overall, prediction and evaluation on trained models took 15.6 days with prediction consuming 109.6.3 kWh of
electricity and causing an estimated emissions of 23.0 \cotwo kg as measured by CodeCarbon
\cite{lacoste2019codecarbon}.
All models were trained on a single machine with an Intel Core i9-11900K CPU and an NVIDIA GeForce RTX 3090
GPU.
The total time spent on prediction and evaluation across all experiments was 15.6 days, with prediction
consuming 109.6 kWh of electricity and causing estimated emissions of 23.0 \cotwo kg as measured by
CodeCarbon \cite{lacoste2019codecarbon}.
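
A minimal sketch of how CodeCarbon can wrap a prediction run to produce such measurements (the project name and the wrapped workload are placeholders):

from codecarbon import EmissionsTracker

tracker = EmissionsTracker(project_name='shitspotter-predict')
tracker.start()
# ... run the prediction / evaluation workload here ...
emissions_kg = tracker.stop()   # estimated kg of CO2-equivalent for the tracked block
print(f'estimated emissions: {emissions_kg:.3f} kg CO2eq')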

todo: train time resource usage for maskrcnn and vit, reacnknowledge
limitation, break down results over each.
We estimated training resource usage indirectly, assuming a constant power draw of 345 W from the RTX 3090
GPU.
Electricity consumption was approximated accordingly, while emissions were calculated using a conversion
ratio of 0.21 $\frac{\textrm{kg}\cotwo{}}{\textrm{kWh}}$ derived from our prediction-time measurements.
Based on file timestamps, we estimated that the 44 training runs took approximately 159.66 days, resulting
in an estimated electricity usage and emissions of 1321.99 kWh and 277.612 $\cotwo$ kg, respectively.
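
These numbers follow from a simple back-of-envelope calculation (a sketch, assuming the 345 W draw applies for the full wall-clock duration):

train_days = 159.66              # total training wall-clock time estimated from file timestamps
gpu_kw = 0.345                   # assumed constant RTX 3090 power draw in kW
kwh = train_days * 24 * gpu_kw   # ~1322 kWh, consistent with the reported 1321.99 kWh
co2_kg = kwh * 0.21              # ~277.6 kg CO2, consistent with the reported 277.612 kg
print(f'{kwh:.2f} kWh, {co2_kg:.2f} kg CO2')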

A key limitation of these results is the imbalance between model types: 42 of the 44 trained models were
VIT-ssegs, and only two were MaskRCNN models, each of which took approximately 8 hours to train.
More details on the VIT-sseg experiments can be found in the supplemental materials.


Report training time, energy usage, and carbon footprint with details in supplemental materials.
%train$^{*}$ & time & 158.95 days & 3.78 days & 42 \\
%train$^{*}$ & electricity & 1,316.07 kWh & 31.34 kWh & 42 \\
%train$^{*}$ & emissions & 276.37 \cotwo kg & 6.58 \cotwo kg & 42 \\

%todo: train time resource usage for maskrcnn and vit, reacnknowledge
%limitation, break down results over each.

%Report training time, energy usage, and carbon footprint with details in supplemental materials.



@@ -744,8 +789,9 @@ \section{Open Data Distribution}
% BitTorrent can be vulnerable to MITM:
% https://www.reddit.com/r/technology/comments/1dpinuw/south_korean_telecom_company_attacks_torrent/

The reproducibility "crisis" in science has raised concerns across various disciplines
\cite{baker_reproducibility_2016}.

Empirical evidence suggests that a substantial proportion of scientific studies cannot be reproduced, which
has raised concerns across various disciplines \cite{baker_reproducibility_2016}.
Ideally, all scientific research should be independently reproducible.
Despite higher success rates in computer science (up to 60\%) compared to other fields, there is still room for improvement
\cite{NEURIPS2019_c429429b, collberg2016repeatability, desai_what_2024}.
3 changes: 3 additions & 0 deletions papers/application-2024/scripts/build_v2_result_table.py
@@ -715,6 +715,9 @@ def main():
grouped['co2_kg'].sum()

detectron = deduped[deduped['dname'].str.contains('detectron')]
print('time', kwutil.timedelta.coerce(detectron['duration'].sum()).to('pint').to('days'))
print('kwh', detectron['kwh'].sum())
print('co2', detectron['co2_kg'].sum())

for key, group in list(deduped.groupby(['node_type', 'dname', 'dataset_name'])):
print(key)
49 changes: 49 additions & 0 deletions papers/application-2024/scripts/estimate_training_resources.py
@@ -3,6 +3,9 @@
python ~/code/shitspotter/dev/poc/estimate_train_resources.py
"""
import kwutil.util_units
import ubelt as ub
import sys
sys.path.append(ub.expandpath('~/code/shitspotter/dev/poc'))

reg = kwutil.util_units.unit_registry()
# gpu_power = 350 * reg.watt
@@ -24,3 +27,49 @@

cost_to_offset = dollar_per_kg * co2_kg
print(f'cost_to_offset = ${cost_to_offset:4.2f}')


# Detectron training results
runs_dpath = ub.Path('$HOME/data/dvc-repos/shitspotter_expt_dvc/training/toothbrush/joncrall/ShitSpotter/runs').expand()
detectron_dpaths = [
    runs_dpath / 'train_baseline_maskrcnn_scratch_v4',
    runs_dpath / 'train_baseline_maskrcnn_v3',
]
from estimate_train_resources import estimate_training_duration, find_offset_cost  # NOQA

rows = []
for train_dpath in detectron_dpaths:
    for dpath in train_dpath.ls():
        if dpath.is_dir():
            checkpoint_paths = list(dpath.glob('*.pth'))
            info = estimate_training_duration(checkpoint_paths)
            info['duration_human'] = kwutil.timedelta.coerce(
                info['duration']).format(unit='auto', precision=2)
            info['num_checkpoints'] = len(checkpoint_paths)
            info['dpath'] = dpath
            info.update(find_offset_cost(info['duration']))
            rows.append(info)


print(f'rows = {ub.urepr(rows, nl=2)}')

"""
GeoWatch:
train$^{*}$ & time & 158.95 days
train$^{*}$ & electricity & 1,316.07 kWh
train$^{*}$ & emissions & 276.37 \cotwo kg
Detectron
17.0 hours
1.2426682788143752 CO2
5.917467994354167 kWh
total:
159.66 days
1321.99 kWh
277.612 \cotwo kg
"""