-- model_utils.lua (forked from samim23/translator)
--[[
In the code, combine_all_parameters just creates a contiguous 1D tensor
large enough to hold all of the parameters, then points all the modules at
this one 1D tensor. The existing weight and gradient tensors are discarded.

Suppose you had 3 modules, M1, M2, M3, where M1 and M2 shared their
parameters (and gradients), but M3 did not. Calling combine_all_parameters
on these 3 will produce a 1D tensor with space for the distinct params of
M1/M2 (stored once, because they are shared) and of M3: the param tensors
of M1 and M2 both reference the same region of the new memory, while the
param tensor for M3 points to the remaining part of the new 1D tensor. The
1D tensor is returned as the first return value of combine_all_parameters,
and the corresponding gradParams tensor (constructed at the same time) is
returned as the second.

This gives us a flat 1D view of the params in a set of modules, similar to
how nn.Container modules (e.g. nn.Sequential, among others) perform this
reallocation. As for torch.pointer, it is used to check whether two objects
are the same object.

Your LSTM code is fine; splitting it up into multiple lines like that
doesn't change anything. The parameter combining isn't related to the
inputs/outputs of the modules, only to the parameters/gradParams inside the
module objects.
]]--
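--[[
Illustrative sketch of the M1/M2/M3 case above. The nn.Linear modules and
their sizes are assumptions made purely for illustration; they are not part
of this file and the snippet is not executed here:

    local nn = require 'nn'
    local M1 = nn.Linear(4, 4)
    local M2 = nn.Linear(4, 4):share(M1, 'weight', 'bias', 'gradWeight', 'gradBias')
    local M3 = nn.Linear(4, 2)
    local params, gradParams = model_utils.combine_all_parameters(M1, M2, M3)

    -- M1 and M2 still share: their weights view the same region of `params`.
    print(torch.pointer(M1.weight:storage()) == torch.pointer(M2.weight:storage()))  -- true
    -- M3's weights occupy the remaining part of `params`.
    print(torch.pointer(M3.weight:storage()) == torch.pointer(params:storage()))     -- true
]]--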
require 'torch'
local model_utils = {}
function model_utils.combine_all_parameters(...)
    --[[ like module:getParameters, but operates on many modules ]]--

    -- get parameters
    local networks = {...}
    local parameters = {}
    local gradParameters = {}
    for i = 1, #networks do
        local net_params, net_grads = networks[i]:parameters()

        if net_params then
            for _, p in pairs(net_params) do
                parameters[#parameters + 1] = p
            end
            for _, g in pairs(net_grads) do
                gradParameters[#gradParameters + 1] = g
            end
        end
    end
    -- returns the offset recorded for `storage` if it has already been given
    -- a slot in the flat tensor, nil otherwise
    local function storageInSet(set, storage)
        local storageAndOffset = set[torch.pointer(storage)]
        if storageAndOffset == nil then
            return nil
        end
        local _, offset = unpack(storageAndOffset)
        return offset
    end
    -- this function flattens arbitrary lists of parameters,
    -- even complex shared ones
    local function flatten(parameters)
        if not parameters or #parameters == 0 then
            return torch.Tensor()
        end
        local Tensor = parameters[1].new

        -- assign each distinct storage a slot in the flat tensor; shared
        -- parameters point to the same storage and are counted only once
        local storages = {}
        local nParameters = 0
        for k = 1, #parameters do
            local storage = parameters[k]:storage()
            if not storageInSet(storages, storage) then
                storages[torch.pointer(storage)] = {storage, nParameters}
                nParameters = nParameters + storage:size()
            end
        end

        -- fill with ones, then zero out the elements actually referenced by a
        -- parameter tensor; whatever stays at 1 is a "hole" (unused storage)
        local flatParameters = Tensor(nParameters):fill(1)
        local flatStorage = flatParameters:storage()

        for k = 1, #parameters do
            local storageOffset = storageInSet(storages, parameters[k]:storage())
            parameters[k]:set(flatStorage,
                storageOffset + parameters[k]:storageOffset(),
                parameters[k]:size(),
                parameters[k]:stride())
            parameters[k]:zero()
        end

        local maskParameters = flatParameters:float():clone()
        local cumSumOfHoles = flatParameters:float():cumsum(1)
        local nUsedParameters = nParameters - cumSumOfHoles[#cumSumOfHoles]
        local flatUsedParameters = Tensor(nUsedParameters)
        local flatUsedStorage = flatUsedParameters:storage()

        -- re-point every parameter tensor into the compacted storage,
        -- shifting each one left by the number of holes before it
        for k = 1, #parameters do
            local offset = cumSumOfHoles[parameters[k]:storageOffset()]
            parameters[k]:set(flatUsedStorage,
                parameters[k]:storageOffset() - offset,
                parameters[k]:size(),
                parameters[k]:stride())
        end

        -- copy the original parameter values from their old storages into the
        -- flat (uncompacted) tensor
        for _, storageAndOffset in pairs(storages) do
            local k, v = unpack(storageAndOffset)
            flatParameters[{{v+1,v+k:size()}}]:copy(Tensor():set(k))
        end

        -- then copy them into the compacted tensor, skipping the holes
        if cumSumOfHoles:sum() == 0 then
            flatUsedParameters:copy(flatParameters)
        else
            local counter = 0
            for k = 1, flatParameters:nElement() do
                if maskParameters[k] == 0 then
                    counter = counter + 1
                    flatUsedParameters[counter] = flatParameters[counter+cumSumOfHoles[k]]
                end
            end
            assert(counter == nUsedParameters)
        end
        return flatUsedParameters
    end
    -- flatten parameters and gradients
    local flatParameters = flatten(parameters)
    local flatGradParameters = flatten(gradParameters)

    -- return new flat vector that contains all discrete parameters
    return flatParameters, flatGradParameters
end
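--[[
Typical usage sketch. The encoder/decoder modules are hypothetical, shown only
to illustrate the return values; the snippet is not executed here:

    local params, grad_params = model_utils.combine_all_parameters(encoder, decoder)
    params:uniform(-0.08, 0.08)       -- one init call covers every module
    -- inside a training closure:
    --   grad_params:zero()
    --   ... forward/backward through encoder and decoder ...
    --   return loss, grad_params     -- a pair usable with the optim package
]]--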
function model_utils.clone_many_times(net, T)
    local clones = {}

    local params, gradParams
    if net.parameters then
        params, gradParams = net:parameters()
        if params == nil then
            params = {}
        end
    end

    local paramsNoGrad
    if net.parametersNoGrad then
        paramsNoGrad = net:parametersNoGrad()
    end

    local mem = torch.MemoryFile("w"):binary()
    mem:writeObject(net)

    for t = 1, T do
        -- We need to use a new reader for each clone.
        -- We don't want to use the pointers to already read objects.
        local reader = torch.MemoryFile(mem:storage(), "r"):binary()
        local clone = reader:readObject()
        reader:close()

        if net.parameters then
            local cloneParams, cloneGradParams = clone:parameters()
            local cloneParamsNoGrad
            for i = 1, #params do
                cloneParams[i]:set(params[i])
                cloneGradParams[i]:set(gradParams[i])
            end
            if paramsNoGrad then
                cloneParamsNoGrad = clone:parametersNoGrad()
                for i = 1, #paramsNoGrad do
                    cloneParamsNoGrad[i]:set(paramsNoGrad[i])
                end
            end
        end

        clones[t] = clone
        collectgarbage()
    end

    mem:close()
    return clones
end
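--[[
Usage sketch for clone_many_times. The `rnn` module, `seq_length`, `inputs`
and `outputs` below are hypothetical and shown only for illustration: the
clones share parameters and gradParameters with `rnn`, since each clone's
param tensors are :set() onto the originals.

    local clones = model_utils.clone_many_times(rnn, seq_length)
    for t = 1, seq_length do
        outputs[t] = clones[t]:forward(inputs[t])
    end
]]--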
return model_utils