trainer.py
import logging
import math
from functools import partial
from typing import Dict

import torch
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR
from transformers import Trainer, get_scheduler

logger = logging.getLogger(__name__)


# Customized Trainer: cosine LR schedule with an eta_min floor, plus scale logging.
class TrainerLRGroup(Trainer):
    def create_scheduler(self, num_training_steps: int, optimizer: torch.optim.Optimizer = None):  # type: ignore
        """
        Set up the scheduler. The trainer's optimizer must either be set up before this method is
        called or be passed as an argument.

        Args:
            num_training_steps (int): The number of training steps to do.
        """
        if self.lr_scheduler is None:
            if self.args.lr_scheduler_type == "cosine":
                self.lr_scheduler = get_cosine_schedule_with_warmup(
                    self.optimizer if optimizer is None else optimizer,
                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                    num_training_steps=num_training_steps,
                )
            else:
                self.lr_scheduler = get_scheduler(
                    self.args.lr_scheduler_type,
                    optimizer=self.optimizer if optimizer is None else optimizer,
                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                    num_training_steps=num_training_steps,
                )
            self._created_lr_scheduler = True
        return self.lr_scheduler

    def log(self, logs: Dict[str, float]) -> None:
        """
        Log `logs` on the various objects watching training.

        Subclass and override this method to inject custom behavior.

        Args:
            logs (`Dict[str, float]`):
                The values to log.
        """
        if self.state.epoch is not None:
            logs["epoch"] = round(self.state.epoch, 4)  # upstream Trainer rounds to 2 decimals
        if hasattr(self.model_wrapped.model.layers[0].self_attn, "scale_param"):
            # Average scale_param across the first 12 layers (hard-coded), ignoring zeroed layers.
            scales = [self.model_wrapped.model.layers[i].self_attn.scale_param.data.mean().item() for i in range(12)]
            scales = [scale for scale in scales if scale != 0.0]
            if len(scales) != 0:
                logs["scale"] = sum(scales) / len(scales)  # type: ignore
        output = {**logs, **{"step": self.state.global_step}}
        self.state.log_history.append(output)
        self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
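
# A minimal usage sketch (hypothetical: `model`, `train_ds`, and the argument
# values below are illustrative assumptions, not part of the original file):
#
#     from transformers import TrainingArguments
#     args = TrainingArguments(output_dir="out", lr_scheduler_type="cosine",
#                              warmup_ratio=0.03, num_train_epochs=3)
#     trainer = TrainerLRGroup(model=model, args=args, train_dataset=train_ds)
#     trainer.train()  # cosine LR decays to eta_min * lr; mean "scale" is logged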


# Adapted from transformers/optimization.py (v4.35.2)
def _get_cosine_schedule_with_warmup_lr_lambda(
    current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float, eta_min: float
):
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
    return max(0.0, eta_min + 0.5 * (1 - eta_min) * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
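
# Worked example (illustrative values, not from the original source): with
# num_warmup_steps=10, num_training_steps=100, num_cycles=0.5, eta_min=0.1,
# the multiplier is 0.0 at step 0, ramps linearly to 1.0 at step 10, sits at
# eta_min + 0.5 * (1 - eta_min) = 0.55 at the halfway point (step 55), and
# reaches the floor eta_min = 0.1 at step 100.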


def get_cosine_schedule_with_warmup(
    optimizer: Optimizer,
    num_warmup_steps: int,
    num_training_steps: int,
    num_cycles: float = 0.5,
    eta_min: float = 0.1,
    last_epoch: int = -1,
):
    """
    Create a schedule with a learning rate that, after a warmup period during which it increases
    linearly from 0 to the initial lr set in the optimizer, decreases following the values of the
    cosine function from the initial lr down to `eta_min` times the initial lr.

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
        num_cycles (`float`, *optional*, defaults to 0.5):
            The number of waves in the cosine schedule (the default is to just decrease from the
            max value to the minimum following a half-cosine).
        eta_min (`float`, *optional*, defaults to 0.1):
            The minimum learning rate as a fraction of the initial lr.
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """
    lr_lambda = partial(
        _get_cosine_schedule_with_warmup_lr_lambda,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
        num_cycles=num_cycles,
        eta_min=eta_min,
    )
    return LambdaLR(optimizer, lr_lambda, last_epoch)
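

if __name__ == "__main__":
    # Sanity-check sketch (not part of the original file): the dummy parameter
    # and step counts below are illustrative assumptions.
    param = torch.nn.Parameter(torch.zeros(1))
    opt = torch.optim.SGD([param], lr=1.0)
    sched = get_cosine_schedule_with_warmup(opt, num_warmup_steps=10, num_training_steps=100)
    lrs = []
    for _ in range(100):
        opt.step()
        sched.step()
        lrs.append(sched.get_last_lr()[0])
    # Expect a linear ramp to 1.0 by step 10, then a cosine decay to eta_min * 1.0 = 0.1.
    print(f"peak lr = {max(lrs):.3f}, final lr = {lrs[-1]:.3f}")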