This repository has been archived by the owner on May 22, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 62
/
trainer.py
124 lines (98 loc) · 5.67 KB
/
trainer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import time
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from dataset import VideoDataset, VideoDataset1M
from network import R2Plus1DClassifier
# Use GPU if available else revert to CPU
# NOTE: this module-level `device` is read by train_model below; moving or
# renaming it would break that function.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device being used:", device)
def train_model(num_classes, directory, layer_sizes=None, num_epochs=45, save=True, path="model_data.pth.tar"):
    """Initializes and trains the model for a fixed number of epochs.

    Uses dataloaders built from the specified directory, with SGD, a StepLR
    scheduler and cross-entropy loss by default. Features checkpoint saving
    and restoration as well.

    Adapted from the PyTorch tutorial found here:
    https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html

    Args:
        num_classes (int): Number of classes in the data.
        directory (str): Directory where the data is to be loaded from.
        layer_sizes (list, optional): Number of blocks in each layer. Defaults to
            [2, 2, 2, 2], equivalent to ResNet18.
        num_epochs (int, optional): Number of epochs to train for. Defaults to 45.
        save (bool, optional): If True, the model is checkpointed to ``path`` after
            every epoch. Defaults to True.
        path (str, optional): File to load a model checkpoint from and, if
            save == True, save to. Defaults to "model_data.pth.tar".
    """
    # Avoid the mutable-default-argument pitfall: resolve the default inside the body.
    if layer_sizes is None:
        layer_sizes = [2, 2, 2, 2]

    # Initialize the ResNet-18 version of this model.
    model = R2Plus1DClassifier(num_classes=num_classes, layer_sizes=layer_sizes).to(device)

    criterion = nn.CrossEntropyLoss()  # standard cross-entropy loss for classification
    optimizer = optim.SGD(model.parameters(), lr=0.01)  # hyperparameters as given in paper sec 4.1
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)  # divides the lr by 10 every 10 epochs

    # Prepare the dataloaders into a dict.
    train_dataloader = DataLoader(VideoDataset(directory), batch_size=10, shuffle=True, num_workers=4)
    # IF training on Kinetics-600 and require exactly a million samples each epoch,
    # import VideoDataset1M and uncomment the following
    # train_dataloader = DataLoader(VideoDataset1M(directory), batch_size=32, num_workers=4)
    val_dataloader = DataLoader(VideoDataset(directory, mode='val'), batch_size=14, num_workers=4)
    dataloaders = {'train': train_dataloader, 'val': val_dataloader}
    dataset_sizes = {x: len(dataloaders[x].dataset) for x in ['train', 'val']}

    # Saves the time the process was started, to compute total time at the end.
    start = time.time()
    epoch_resume = 0

    # Check if there was a previously saved checkpoint.
    if os.path.exists(path):
        # map_location ensures a checkpoint saved on GPU still loads on a CPU-only host.
        checkpoint = torch.load(path, map_location=device)
        print("Reloading from previously saved checkpoint")

        # Restores the model and optimizer state_dicts.
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['opt_dict'])

        # Obtains the epoch the training is to resume from.
        epoch_resume = checkpoint["epoch"]

    for epoch in tqdm(range(epoch_resume, num_epochs), unit="epochs", initial=epoch_resume, total=num_epochs):
        # Each epoch has a training and validation phase, in that order.
        for phase in ['train', 'val']:
            # Reset the running loss and corrects.
            running_loss = 0.0
            running_corrects = 0

            # train() vs eval() primarily affects layers such as BatchNorm or Dropout.
            if phase == 'train':
                model.train()
            else:
                model.eval()

            for inputs, labels in dataloaders[phase]:
                # Move inputs and labels to the device the training is taking place on.
                inputs = inputs.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()

                # Keep intermediate states iff backpropagation will be performed. If false,
                # then all intermediate states will be thrown away during evaluation, to use
                # the least amount of memory possible.
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    # We're interested in the indices of the max values, not the values themselves.
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # Backpropagate and optimize iff in training mode, else there's no
                    # intermediate values to backpropagate with and it would throw an error.
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            if phase == 'train':
                # Since PyTorch 1.1, scheduler.step() must be called AFTER the epoch's
                # optimizer.step() calls; calling it first skips the initial lr value.
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print(f"{phase} Loss: {epoch_loss} Acc: {epoch_acc}")

        # Save the model if save == True.
        if save:
            torch.save({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'acc': epoch_acc,
                'opt_dict': optimizer.state_dict(),
            }, path)

    # Print the total time needed, HH:MM:SS-style; cast to int so it doesn't print "0.0h".
    time_elapsed = time.time() - start
    print(f"Training complete in {int(time_elapsed // 3600)}h {int((time_elapsed % 3600) // 60)}m {int(time_elapsed % 60)}s")