(** Friedman's Stochastic Gradient Boosting Trees *)
let pr = Printf.printf
let sp = Printf.sprintf
let random_seed = [| 9271; 12074; 3; 12921; 92; 763 |]
type loss_type = [ `Logistic | `Square ]
open Sgbt
type t = {
(* how many observations are there in the training set? *)
n : int;
(* what is the map of feature id's to features? *)
feature_map : D_feat_map.t;
(* how do we find best splits? *)
splitter : Loss.splitter;
(* how do we evaluate a tree over all the observations in the
training set? *)
eval : (Dog_t.feature_id -> Feat.afeature) -> Model_t.l_tree -> float array;
(* how do we create random partitions of the training set (and
subsets thereof)? *)
sampler : Sampler.t;
(* what fold does each observation in the training set belong
to? *)
folds : int array;
}
let exceed_max_trees num_iters max_trees_opt =
match max_trees_opt with
| None -> false
| Some max_trees ->
num_iters >= max_trees
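(* re-initialize the splitter so that its state reflects only
[first_tree]'s predictions over the training set; used both when
starting a fold and when restarting one after a learning-rate cut *)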
let reset t first_tree =
let gamma = t.eval (D_feat_map.a_find_by_id t.feature_map) first_tree in
t.splitter#clear;
(match t.splitter#boost gamma with
| `NaN -> assert false
| `Ok -> ()
)
type learning_iteration = {
(* iteration number; also, number of trees *)
i : int ;
(* what is the fold currently being held out for the purpose of
identifying the optimal termination point? This fold is the
validation fold. *)
fold : int;
(* is the observation in the 'working folds' or the 'validation
fold' ? *)
fold_set : bool array;
learning_rate : float;
first_loss : float;
prev_loss : float;
trees : Model_t.l_tree list;
convergence_rate_smoother : Rls1.t;
random_state : Random.State.t;
(* what is the first tree? *)
first_tree : Model_t.l_tree;
(* should learning stop, returning the best model produced so
far? *)
timeout : unit -> bool
}
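(* grow trees one at a time at a fixed learning rate, recursing until
the validation loss converges, diverges (which triggers a
learning-rate cut), times out, or reaches the tree limit *)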
let rec learn_with_fold_rate conf t iteration =
let m = {
Tree.max_depth = conf.max_depth;
feature_map = t.feature_map;
splitter = t.splitter
} in
(* draw a random subset of the working folds *)
Sampler.shuffle t.sampler iteration.random_state;
let sub_set = Sampler.array (
fun ~index ~value ->
(* sample, at random, half of the observations in the working folds *)
iteration.fold_set.(index) && value mod 2 = 0
) t.sampler in
match Tree.make m 0 sub_set with
| None ->
print_endline "converged: no more trees";
`Converged (iteration.learning_rate, iteration.trees)
| Some tree ->
let shrunken_tree = Tree.shrink iteration.learning_rate tree in
let gamma = t.eval (D_feat_map.a_find_by_id t.feature_map) shrunken_tree in
match t.splitter#boost gamma with
| `NaN -> (
pr "diverged: nan\n%!";
cut_learning_rate conf t iteration
)
| `Ok ->
let { Loss.s_wrk; s_val; has_converged; val_loss } =
t.splitter#metrics (Array.get iteration.fold_set) in
(* compute convergence rate and smooth it *)
let convergence_rate =
(iteration.prev_loss -. val_loss) /. val_loss in
let convergence_rate_smoother = Rls1.add
iteration.convergence_rate_smoother convergence_rate in
let convergence_rate_hat = Rls1.theta convergence_rate_smoother in
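(* for illustration: if the validation loss falls from 0.50 to 0.48
in one iteration, the raw rate is (0.50 -. 0.48) /. 0.48 ~= 0.042;
the RLS smoother damps the noise in these per-iteration estimates *)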
pr "iter % 3d % 7d %s %s %+.4e %+.4e\n%!"
iteration.fold
iteration.i
s_wrk
s_val
convergence_rate
convergence_rate_hat;
if has_converged then (
pr "converged: metrics inidicate continuing is pointless\n";
`Converged (iteration.learning_rate, iteration.trees)
)
else if val_loss >= 2.0 *. iteration.prev_loss then (
pr "diverged: loss rose dramatically!\n";
cut_learning_rate conf t iteration
)
else if iteration.timeout () then (
pr "timeout!\n";
`Timeout iteration.trees
)
else if exceed_max_trees iteration.i conf.max_trees_opt then (
(* convergence, kinda *)
pr "tree limit constraint met\n";
`Converged (iteration.learning_rate, iteration.trees)
)
else if convergence_rate_hat < conf.min_convergence_rate then (
(* convergence! *)
pr "converged: rate exceeded\n";
`Converged (iteration.learning_rate, iteration.trees)
)
else
(* continue learning *)
let iteration = {
iteration with
prev_loss = val_loss;
i = iteration.i + 1;
trees = shrunken_tree :: iteration.trees;
convergence_rate_smoother;
} in
learn_with_fold_rate conf t iteration
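(* on divergence, halve the learning rate and restart the fold from a
re-shrunken first tree; halving is a simple damping heuristic *)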
and cut_learning_rate conf t iteration =
(* cut the learning rate in half and try again *)
let learning_rate = 0.5 *. iteration.learning_rate in
pr "reducing learning rate from %f to %f\n"
iteration.learning_rate learning_rate;
let shrunken_first_tree = Tree.shrink learning_rate iteration.first_tree in
reset t shrunken_first_tree;
let new_random_seed = [| Random.int 10_000 |] in
let iteration = {
iteration with
learning_rate;
random_state = Random.State.make new_random_seed;
prev_loss = iteration.first_loss;
i = 1;
trees = [shrunken_first_tree]
} in
learn_with_fold_rate conf t iteration
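(* learn a sequence of trees for a single cross-validation fold:
observations outside [fold] form the working set, and those inside
it form the validation set that decides when to stop *)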
let learn_with_fold conf t fold initial_learning_rate =
let fold_set = Array.init t.n (fun i -> t.folds.(i) <> fold) in
let leaf0 = t.splitter#first_tree fold_set in
let shrunken_leaf0 = Tree.shrink initial_learning_rate leaf0 in
reset t shrunken_leaf0;
let { Loss.s_wrk; s_val; val_loss = first_val_loss } =
t.splitter#metrics (Array.get fold_set) in
pr "fold % 3d %s %s\n%!" fold s_wrk s_val;
let new_random_seed = [| Random.int 10_000 |] in
let timeout =
match conf.deadline with
| None -> fun () -> false (* don't timeout *)
| Some deadline ->
fun () ->
Unix.gettimeofday () >= deadline (* obey deadline *)
in
let convergence_rate_smoother = Rls1.create
conf.convergence_rate_smoother_forgetful_factor in
let iteration = {
i = 1;
fold;
fold_set;
first_loss = first_val_loss;
prev_loss = first_val_loss;
first_tree = leaf0;
trees = [shrunken_leaf0];
learning_rate = initial_learning_rate;
convergence_rate_smoother;
random_state = Random.State.make new_random_seed;
timeout
} in
learn_with_fold_rate conf t iteration
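(* assign each observation to one of [conf.num_folds] folds, either
at random or according to a user-designated fold feature; in the
latter case the fold feature is deactivated, so that it cannot also
serve as a split candidate *)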
let folds_of_feature conf sampler feature_map _n _y_feature_id =
match conf.fold_feature_opt with
| None ->
(* we don't have a fold feature; randomly assign folds *)
let fold = Sampler.array (
fun ~index:_ ~value ->
value mod conf.num_folds
) sampler in
fold, feature_map
| Some feature_descr ->
let fold_feature =
match D_feat_map.a_find_all feature_map feature_descr with
| [ feature ] -> feature
| [] ->
pr "no feature %s\n%!"
(Feat_utils.string_of_feature_descr feature_descr);
exit 1
| _ :: _ :: _ ->
pr "more than one feature satisfying %s\n%!"
(Feat_utils.string_of_feature_descr feature_descr);
exit 1
in
let fold_feature_id = Feat_utils.id_of_feature fold_feature in
let feature_map = D_feat_map.deactivate feature_map fold_feature_id in
let num_observations = D_feat_map.num_observations feature_map in
match Feat_utils.folds_of_feature ~n:num_observations
~num_folds:conf.num_folds fold_feature with
| `Folds fold ->
fold, feature_map
| `TooManyOrdinalFolds cardinality ->
pr "the cardinality of ordinal fold feature (%d) is \
too large relative to the number of folds (%d)\n%!"
cardinality conf.num_folds;
exit 1
| `CategoricalCardinalityMismatch cardinality ->
pr "the cardinality of the categorical fold feature (%d) \
must equal the number of folds (%d)\n%!"
cardinality conf.num_folds;
exit 1
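(* top-level entry point: open the dog file, select the target and
the active features, learn one model per fold, then bag the
per-fold models into a single averaged model and write it out *)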
let learn conf =
let dog_rw = Dog_io.RW.create conf.dog_file_path None in
let feature_map = D_feat_map.create dog_rw in
let num_observations = Dog_io.RW.num_observations dog_rw in
let n = num_observations in
assert ( conf.num_folds > 0 );
if conf.num_folds >= n then (
pr "number of folds %d must be smaller than the number of observations \
%d\n%!" conf.num_folds n;
exit 1
);
let y_feature =
let y_features = D_feat_map.a_find_all feature_map conf.y in
match y_features with
| [] ->
pr "target %s not found\n%!"
(Feat_utils.string_of_feature_descr conf.y);
exit 1
| _ :: _ :: _ ->
pr "more than one target feature %s\n%!"
(Feat_utils.string_of_feature_descr conf.y);
exit 1
| [ y_feature ] -> y_feature
in
(* remove target from the feature set *)
let feature_map = D_feat_map.deactivate feature_map
(Feat_utils.id_of_feature y_feature) in
(* remove excluded features, if any *)
let feature_map =
match conf.excluded_feature_name_regexp_opt with
| None -> feature_map
| Some rex ->
let is_excluded feature =
match Feat_utils.name_of_feature feature with
| None -> false (* anonymous features cannot be excluded *)
| Some name ->
Pcre.pmatch ~rex name
in
let feature_map = D_feat_map.deactivate_if feature_map is_excluded in
feature_map
in
let random_state = Random.State.make random_seed in
let sampler = Sampler.create n in
Sampler.shuffle sampler random_state;
let folds, feature_map =
let y_feature_id = Feat_utils.id_of_feature y_feature in
folds_of_feature conf sampler feature_map n y_feature_id
in
pr "features: active=%d inactive=%d\n%!"
(D_feat_map.num_active feature_map) (D_feat_map.num_inactive feature_map);
let splitter : Loss.splitter =
match conf.loss_type with
| `Logistic ->
new Logistic.splitter y_feature num_observations
| `Square ->
new Square.splitter y_feature num_observations
in
let eval = Tree.mk_eval num_observations in
let t = {
n;
feature_map;
splitter;
eval;
sampler;
folds
} in
let rec loop fold trees_list initial_learning_rate =
if fold < conf.num_folds then
match learn_with_fold conf t fold initial_learning_rate with
| `Converged (effective_learning_rate, trees) ->
let trees_list = List.rev_append trees trees_list in
(* set the initial learning rate of the next fold model to the
effective learning rate of the previous one; this means that
the learning rate can gradually decrease from one fold to the
next, as learning attempts on individual folds fail (diverge) *)
loop (fold + 1) trees_list effective_learning_rate
| `Timeout trees ->
(* time's up! only include [trees] if no other trees were
previously learned. *)
match trees_list with
| [] -> trees
| _ -> trees_list
else
trees_list
in
(* combine the models learned on each fold into a mega-model, in
which the per-fold sequences of trees are simply averaged (bagged!) *)
let trees = loop 0 [] conf.initial_learning_rate in
let trees =
let fold_weight = 1.0 /. (float conf.num_folds) in
List.rev_map (
fun tree ->
Tree.shrink fold_weight tree
) trees
in
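(* for example, with [conf.num_folds = 5] each per-fold tree is scaled
by 0.2, so evaluating the combined tree list averages the five
per-fold models *)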
let trees, features = Model_utils.l_to_c feature_map trees in
(* write model file *)
let () =
(* open channel *)
let ouch = open_out conf.output_file_path in
(* open output buffer *)
let out_buf = Bi_outbuf.create_channel_writer ouch in
(* write model to buffer *)
splitter#write_model trees features out_buf;
(* flush buffer *)
Bi_outbuf.flush_channel_writer out_buf;
(* close channel *)
close_out ouch
in
()
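(* A hypothetical usage sketch, not part of this module: [conf] is the
configuration record opened from [Sgbt] above; only the fields
actually read in this file are shown, and the record syntax and
field layout here are assumptions for illustration.

let () =
let conf = {
Sgbt.loss_type = `Logistic;
dog_file_path = "train.dog";
output_file_path = "model.bin";
y = y_descr; (* a feature descriptor naming the target *)
num_folds = 5;
fold_feature_opt = None;
initial_learning_rate = 0.1;
max_depth = 5;
max_trees_opt = Some 1000;
min_convergence_rate = 1e-5;
convergence_rate_smoother_forgetful_factor = 0.9;
deadline = None;
excluded_feature_name_regexp_opt = None;
} in
learn conf
*)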