-
Notifications
You must be signed in to change notification settings - Fork 1
/
train.py
285 lines (245 loc) · 9.94 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
#!/usr/bin/env python3
import logging
import os
import time
from dataclasses import dataclass
import hydra
import numpy as np
import omegaconf
import pandas as pd
import torch
import torchvision
import wandb
from hydra.core.config_store import ConfigStore
from hydra.utils import get_original_cwd
from netcal.metrics import ECE
from torch.utils.data import DataLoader, Subset
from torchvision import transforms
from tqdm import tqdm
import helper as h
import likelihoods
import priors
import sfr
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class MetricLogger:
    """Accumulates per-model evaluation metrics in a DataFrame and logs a W&B table.

    One row per call to :meth:`log`; columns are Model/loss/acc/nlpd/ece.
    """

    def __init__(self):
        self.df = pd.DataFrame(columns=["Model", "loss", "acc", "nlpd", "ece"])

    def log(self, metrics: dict, name: str):
        """Log `metrics` (keys: loss/acc/nlpd/ece) under model label `name`.

        Rows are only appended (and the table only uploaded) when a W&B run
        is active; the INFO log line is emitted regardless.
        """
        logger.info(
            f"{name} NLPD {metrics['nlpd']} | ACC: {metrics['acc']} | ECE: {metrics['ece']}"
        )
        # BUG FIX: the original did metrics.update({"Model": name}), mutating the
        # caller's dict as a hidden side effect. Build the row as a fresh dict.
        row = {**metrics, "Model": name}
        if wandb.run is not None:
            self.df.loc[len(self.df.index)] = row
            wandb.log({"Metrics": wandb.Table(data=self.df)})
@dataclass
class TrainConfig:
    """Structured config for NN training + SFR posterior fitting (registered with Hydra)."""

    # Dataset
    dataset: str = "FMNIST"  # "FMNIST"/"CIFAR10"/"MNIST"
    train_val_split: float = 0.8  # fraction of the train set used for training (rest is val)
    debug: bool = False  # If true only use 500 data points
    # SFR config
    prior_precision: float = 0.0013  # precision of the Gaussian prior over NN weights
    num_inducing: int = 2048  # number of inducing points M for the SFR posterior
    # dual_batch_size: int = 5000
    dual_batch_size: int = 2048  # batch size used when computing SFR dual parameters
    jitter: float = 1e-6  # added to kernel diagonal for numerical stability
    likelihood_eps: float = 0.0  # for numerical stability
    # Training config
    batch_size: int = 64  # mini-batch size for NN training
    lr: float = 1e-3  # Adam learning rate
    n_epochs: int = 10000  # upper bound; early stopping usually terminates sooner
    # Early stopping on validation loss
    early_stop_patience: int = 5  # in epochs (scaled by logging_epoch_freq at use site)
    early_stop_min_delta: float = 0.0  # minimum val-loss improvement to reset patience
    optimize_prior_prec: bool = False  # if True use BayesOpt to tune prior precision
    # Experiment config
    logging_epoch_freq: int = 2  # validate/log every this many epochs
    test_batch_size: int = 2048  # batch size for computing metrics
    seed: int = 42  # RNG seed for torch/numpy reproducibility
    device: str = "cuda"  # "cpu" or "cuda" etc
    # W&B config
    use_wandb: bool = False  # enable Weights & Biases logging
    wandb_project_name: str = "sfr"
    # NOTE: evaluated once at import time, so all runs in one process share this name
    wandb_run_name: str = f"FMNIST__{time.time()}"
# Register the structured config with Hydra so ./cfgs/train.yaml is validated
# against (and defaults are filled from) TrainConfig.
cs = ConfigStore.instance()
cs.store(name="train_config", node=TrainConfig)
@hydra.main(version_base="1.3", config_path="./cfgs", config_name="train")
def train(cfg: TrainConfig):
    """Train a NN classifier, fit an SFR posterior on top, and log test metrics.

    Pipeline: seed everything -> load MNIST/FMNIST/CIFAR10 with a train/val/test
    split -> train the network with early stopping on validation loss -> reload
    the best checkpoint -> fit the SFR dual parameters -> evaluate both the NN
    (MAP) and SFR predictions on the test set, optionally re-evaluating after
    post-hoc prior-precision tuning via Bayesian optimization.
    """
    ##### Make experiment reproducible #####
    torch.cuda.manual_seed(cfg.seed)
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)
    # BUG FIX: the original used eval('setattr(torch.backends.cudnn, "determinstic", True)').
    # "determinstic" is a typo (the real attribute is "deterministic"), so cuDNN
    # determinism was never actually enabled -- and eval() was needless anyway.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    ##### Use GPU if requested and available #####
    if "cuda" in cfg.device:
        if torch.cuda.is_available():
            cfg.device = "cuda"
        else:
            logger.info("CUDA requested but not available")
            cfg.device = "cpu"
    logger.info(f"Using device: {cfg.device}")
    ##### Initialize W&B #####
    if cfg.use_wandb:
        wandb.init(
            project=cfg.wandb_project_name,
            name=cfg.wandb_run_name,
            group=cfg.dataset,
            tags=[cfg.dataset, f"M={cfg.num_inducing}"],
            config=omegaconf.OmegaConf.to_container(
                cfg, resolve=True, throw_on_missing=True
            ),
            dir=get_original_cwd(),  # don't nest wandb inside hydra dir
        )
    ckpt_path = os.path.join(get_original_cwd(), "best_ckpt.pt")
    ##### Load data with train/val/test split #####
    save_dir = f"{get_original_cwd()}/data"
    # NOTE: "FMNIST" must be checked before "MNIST" because "MNIST" in "FMNIST" is True.
    if "FMNIST" in cfg.dataset:
        dataset_fn = torchvision.datasets.FashionMNIST
        normalize_transform = transforms.Normalize((0.2860,), (0.3530,))
        # Calculated with ds_train.train_data.float().mean()/255
    elif "MNIST" in cfg.dataset:
        dataset_fn = torchvision.datasets.MNIST
        # Calculated with ds_train.train_data.float().mean()/255
        normalize_transform = transforms.Normalize((0.1307,), (0.3081,))
    elif "CIFAR10" in cfg.dataset:
        dataset_fn = torchvision.datasets.CIFAR10
        # Calculated with ds_train.data.mean(axis=(0,1,2))/255
        normalize_transform = transforms.Normalize(
            (0.49139968, 0.48215841, 0.44653091), (0.24703223, 0.24348513, 0.26158784)
        )
    else:
        raise NotImplementedError("Only MNIST/FMNIST/CIFAR10 supported for cfg.dataset")
    transform = transforms.Compose([transforms.ToTensor(), normalize_transform])
    ds_train = dataset_fn(
        f"{save_dir}/{cfg.dataset}", download=True, train=True, transform=transform
    )
    output_dim = len(ds_train.classes)
    if ds_train.data.ndim == 3:
        in_channels = 1  # grayscale: data is (N, H, W)
    elif ds_train.data.ndim == 4:
        in_channels = ds_train.data.shape[-1]  # colour: data is (N, H, W, C)
    else:
        # ROBUSTNESS FIX: the original left in_channels unbound here, deferring
        # the failure to a confusing NameError when the network is built.
        raise ValueError(f"Unsupported data ndim: {ds_train.data.ndim}")
    num_data = len(ds_train) if not cfg.debug else 500
    idxs = np.random.permutation(num_data)
    split_idx = int(cfg.train_val_split * num_data)
    if cfg.debug:
        # BUG FIX: the original sliced idxs[split_idx + 1:], silently dropping
        # one sample; idxs[split_idx:] uses every point exactly once.
        ds_test = Subset(ds_train, idxs[split_idx:])
    else:
        ds_test = dataset_fn(
            f"{save_dir}/{cfg.dataset}", download=True, train=False, transform=transform
        )
    train_loader = DataLoader(
        Subset(ds_train, idxs[:split_idx]), batch_size=cfg.batch_size, shuffle=True
    )
    val_loader = DataLoader(
        # BUG FIX: was idxs[split_idx + 1:], which dropped one validation sample.
        Subset(ds_train, idxs[split_idx:]),
        batch_size=cfg.test_batch_size,
        shuffle=True,
    )
    test_loader = DataLoader(
        ds_test, batch_size=cfg.test_batch_size, shuffle=True, pin_memory=True
    )
    ##### Instantiate SFR #####
    # TODO This doesn't use tanh...
    network = h.CIFAR10Net(in_channels=in_channels, n_out=output_dim, use_tanh=True)
    prior = priors.Gaussian(
        params=network.parameters, prior_precision=cfg.prior_precision
    )
    likelihood = likelihoods.CategoricalLh(EPS=cfg.likelihood_eps)
    model = sfr.SFR(
        network=network,
        prior=prior,
        likelihood=likelihood,
        output_dim=output_dim,
        num_inducing=cfg.num_inducing,
        dual_batch_size=cfg.dual_batch_size,
        jitter=cfg.jitter,
        device=cfg.device,
    )
    optimizer = torch.optim.Adam([{"params": model.parameters()}], lr=cfg.lr)
    # Patience is configured in epochs but the stopper is polled only every
    # logging_epoch_freq epochs, hence the division.
    early_stopper = h.EarlyStopper(
        patience=int(cfg.early_stop_patience / cfg.logging_epoch_freq),
        min_delta=cfg.early_stop_min_delta,
    )

    @torch.no_grad()
    def evaluate(model: sfr.SFR, data_loader: DataLoader, sfr_pred: bool = False):
        """Return {loss, acc, nlpd, ece} over data_loader.

        sfr_pred=True uses the SFR predictive; otherwise softmax of the raw NN.
        Temporarily puts the model in eval mode and restores train mode at the end.
        """
        model.eval()
        probs, targets, val_losses = [], [], []
        dtype = next(model.parameters()).dtype  # because change NN from float to double
        for data, target in data_loader:
            data = data.to(dtype).to(cfg.device)
            target = target.to(cfg.device)
            if sfr_pred:  # predict with SFR
                probs.append(model(data.to(cfg.device))[0])
            else:  # predict with NN
                probs.append(torch.softmax(model.network(data), dim=-1))
            targets.append(target)
            val_losses.append(model.loss(data, target))
        val_loss = torch.mean(torch.stack(val_losses, 0)).cpu().numpy().item()
        targets = torch.cat(targets, dim=0).cpu().numpy()
        probs = torch.cat(probs).cpu().numpy()
        acc = (probs.argmax(-1) == targets).mean()
        ece = ECE(bins=15).measure(probs, targets)
        dist = torch.distributions.Categorical(torch.Tensor(probs))
        nlpd = -dist.log_prob(torch.Tensor(targets)).mean().numpy()
        metrics = {"loss": val_loss, "acc": acc, "nlpd": nlpd, "ece": ece}
        model.train()
        return metrics

    ##### Train NN weights with empirical regularized risk #####
    best_loss = float("inf")
    for epoch_idx in tqdm(list(range(cfg.n_epochs)), total=cfg.n_epochs):
        with tqdm(train_loader, unit="batch") as tepoch:
            for data, target in tepoch:
                tepoch.set_description(f"Epoch {epoch_idx}/{cfg.n_epochs}")
                loss = model.loss(data.to(cfg.device), target.to(cfg.device))
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                tepoch.set_postfix(loss=loss.item())
                if wandb.run is not None:
                    wandb.log({"train_loss": loss})
            # Periodic validation: checkpoint on improvement, early-stop otherwise.
            if epoch_idx % cfg.logging_epoch_freq == 0:
                val_metrics = evaluate(model, data_loader=val_loader)
                val_loss = val_metrics["loss"]
                if wandb.run is not None:
                    wandb.log({"val_loss": val_loss, "epoch": epoch_idx})
                if val_loss < best_loss:
                    best_loss = val_loss
                    torch.save({"model": model.state_dict()}, ckpt_path)
                if early_stopper(val_loss):
                    logger.info("Early stopping criteria met, stopping training...")
                    break
    logger.info("Finished training")
    ##### Load the best checkpoint (with lowest validation loss) #####
    ckpt = torch.load(ckpt_path)
    model.load_state_dict(ckpt["model"])
    ##### Fit SFR posterior (dual parameters etc) #####
    logger.info("Fitting SFR...")
    model.fit(train_loader=train_loader)
    logger.info("Finished fitting SFR")
    ##### Log metrics on test set #####
    metric_logger = MetricLogger()
    # Calculate metrics for NN MAP
    nn_metrics = evaluate(model, data_loader=test_loader, sfr_pred=False)
    metric_logger.log(nn_metrics, name="NN")
    # Calculate metrics for SFR
    sfr_metrics = evaluate(model, data_loader=test_loader, sfr_pred=True)
    metric_logger.log(sfr_metrics, name="SFR")
    ##### (Optionally) Optimize the prior precision posthoc using BO #####
    if cfg.optimize_prior_prec:
        model.optimize_prior_precision(
            pred_type="gp",
            method="bo",
            val_loader=val_loader,
            n_samples=100,
            prior_prec_min=1e-8,
            prior_prec_max=1.0,
            num_trials=20,
        )
        ##### Calculate metrics for SFR with prior precision tuned #####
        sfr_metrics = evaluate(model, data_loader=test_loader, sfr_pred=True)
        metric_logger.log(sfr_metrics, name="SFR (δ-tuned)")
if __name__ == "__main__":
    # Hydra parses CLI overrides and loads ./cfgs/train.yaml; no explicit cfg arg.
    train()  # pyright: ignore