forked from barko/dawg
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sgbt.ml
474 lines (397 loc) · 14.7 KB
/
sgbt.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
(** Friedman's Stochastic Gradient Boosting Trees *)

(* shorthand printers: [pr] writes to stdout, [epr] to stderr *)
let pr = Printf.printf
let epr = Printf.eprintf

(* fixed seed so training runs are reproducible across invocations *)
let random_seed = [| 9271 ; 12074; 3; 12921; 92; 763 |]

(* which loss function drives boosting *)
type loss_type = [ `Logistic | `Square ]

(* per-feature monotonicity constraints imposed on the learned trees *)
type feature_monotonicity = (Feat_utils.feature_descr * Dog_t.monotonicity) list
(* user-level configuration of one training run *)
type conf = {
  (* which loss function to optimize *)
  loss_type : loss_type;
  (* path of the input training data ("dog" format) *)
  dog_file_path : string;
  (* number of cross-validation folds *)
  num_folds : int;
  (* stop a fold when the smoothed convergence rate drops below this *)
  min_convergence_rate : float;
  (* learning rate used at the start of the first fold; it may be
     halved repeatedly on divergence *)
  initial_learning_rate : float;
  (* which feature is the prediction target *)
  y : Feat_utils.feature_descr;
  (* maximum depth of each individual tree *)
  max_depth : int;
  (* forgetting factor of the RLS smoother applied to the
     convergence rate *)
  convergence_rate_smoother_forgetful_factor : float;
  (* absolute time after which learning stops, if any; compared
     against [Unix.gettimeofday] *)
  deadline : float option;
  (* path of the output model file *)
  output_file_path : string;
  (* features whose names match this regexp are excluded *)
  excluded_feature_name_regexp_opt : Pcre.regexp option;
  (* feature whose values determine fold assignment, if any *)
  fold_feature_opt : Feat_utils.feature_descr option;
  (* cap on the number of trees learned per fold, if any *)
  max_trees_opt : int option;
  (* optional threshold for binarizing the target (logistic loss) *)
  binarization_threshold_opt : Logistic.binarization_threshold option;
  (* monotonicity constraints, one per constrained feature *)
  feature_monotonicity : feature_monotonicity;
}
(* immutable learning context, shared by all folds of a run *)
type t = {
  (* how many observations are there in the training set? *)
  n : int;

  (* what is the map of feature id's to features? *)
  feature_map : Feat_map.t;

  (* how do we find best splits? *)
  splitter : Loss.splitter;

  (* how do we evaluate a tree over all the observations in the
     training set? *)
  eval : (Dog_t.feature_id -> Feat.afeature) -> Model_t.l_tree -> float array;

  (* how do we create random partitions of the training set (and
     subsets thereof) ? *)
  sampler : Sampler.t;

  (* what fold does each observation in the training set belong
     to? *)
  folds : int array;
}
(* Has the optional per-fold tree limit been reached?  With no limit
   configured the answer is always [false]. *)
let exceed_max_trees num_iters max_trees_opt =
  match max_trees_opt with
  | Some limit -> num_iters >= limit
  | None -> false
(* Re-initialize the splitter so that learning can restart from just
   the initial tree [first_tree] -- used both at the start of a fold
   and after the learning rate is cut on divergence.  The initial
   tree's predictions must boost cleanly; [`NaN] here would indicate
   a programming error, hence the assert. *)
let reset t first_tree =
  let gamma = t.eval (Feat_map.a_find_by_id t.feature_map) first_tree in
  t.splitter#clear;
  (match t.splitter#boost gamma with
   | `NaN -> assert false
   | `Ok -> ()
  )
(* state threaded through successive boosting iterations of one fold *)
type learning_iteration = {
  (* iteration number; also, number of trees *)
  i : int ;

  (* what is the fold currently being held out for the purpose of
     identifying the optimal termination point? This fold is the
     validation fold. *)
  fold : int;

  (* is the observation in the 'working folds' or the 'validation
     fold' ? *)
  fold_set : bool array;

  (* current learning rate; halved after each divergence *)
  learning_rate : float;

  (* validation loss of the initial tree alone *)
  first_loss : float;

  (* validation loss after the previous iteration *)
  prev_loss : float;

  (* trees learned so far, most recent first *)
  trees : Model_t.l_tree list;

  (* RLS smoother of the iteration-to-iteration convergence rate *)
  convergence_rate_smoother : Rls1.t;

  (* RNG state driving the per-iteration subsampling *)
  random_state : Random.State.t;

  (* what is the first tree? *)
  first_tree : Model_t.l_tree;

  (* should learning stop, returning the best model produced so
     far? *)
  timeout : unit -> bool
}
(* Grow trees at a fixed learning rate until convergence, divergence,
   timeout, or the configured tree limit.  On numeric divergence the
   learning rate is halved (see [cut_learning_rate]) and learning
   restarts from the initial tree.  Returns [`Converged (rate, trees)]
   or [`Timeout trees], trees most recent first. *)
let rec learn_with_fold_rate conf t iteration =
  let feature_monotonicity_map =
    Feat_map.assoc t.feature_map conf.feature_monotonicity
  in
  let m = {
    Tree.max_depth = conf.max_depth;
    feature_map = t.feature_map;
    feature_monotonicity_map;
    splitter = t.splitter
  } in

  (* draw a random subset of this fold *)
  Sampler.shuffle t.sampler iteration.random_state;
  let sub_set = Sampler.array (
      fun ~index ~value ->
        (* sample half the data that is also in the current fold *)
        iteration.fold_set.(index) && value mod 2 = 0
    ) t.sampler in

  match Tree.make m 0 sub_set with
  | None ->
    print_endline "converged: no more trees";
    `Converged (iteration.learning_rate, iteration.trees)

  | Some tree ->
    let shrunken_tree = Tree.shrink iteration.learning_rate tree in
    let gamma = t.eval (Feat_map.a_find_by_id t.feature_map) shrunken_tree in

    match t.splitter#boost gamma with
    | `NaN -> (
        pr "diverged: nan\n%!";
        cut_learning_rate conf t iteration
      )

    | `Ok ->
      let { Loss.s_wrk; s_val; has_converged; val_loss } =
        t.splitter#metrics (Array.get iteration.fold_set) in

      (* compute convergence rate and smooth it *)
      let convergence_rate =
        (iteration.prev_loss -. val_loss) /. val_loss in
      let convergence_rate_smoother = Rls1.add
          iteration.convergence_rate_smoother convergence_rate in
      let convergence_rate_hat = Rls1.theta convergence_rate_smoother in

      pr "iter % 3d % 7d %s %s %+.4e %+.4e\n%!"
        iteration.fold
        iteration.i
        s_wrk
        s_val
        convergence_rate
        convergence_rate_hat;

      if has_converged then (
        (* typo fixed ("inidicate"); [%!] flushes like the other
           progress messages *)
        pr "converged: metrics indicate continuing is pointless\n%!";
        `Converged (iteration.learning_rate, iteration.trees)
      )
      else if val_loss >= 2.0 *. iteration.prev_loss then (
        pr "diverged: loss rose dramatically!\n%!";
        cut_learning_rate conf t iteration
      )
      else if iteration.timeout () then (
        pr "timeout!\n%!";
        `Timeout iteration.trees
      )
      else if exceed_max_trees iteration.i conf.max_trees_opt then (
        (* convergence, kinda *)
        pr "tree limit constraint met\n%!";
        `Converged (iteration.learning_rate, iteration.trees)
      )
      else if convergence_rate_hat < conf.min_convergence_rate then (
        (* convergence!  the smoothed improvement rate fell below the
           configured minimum (the old message claimed "exceeded") *)
        pr "converged: convergence rate fell below threshold\n%!";
        `Converged (iteration.learning_rate, iteration.trees)
      )
      else
        (* continue learning *)
        let iteration = {
          iteration with
          prev_loss = val_loss;
          i = iteration.i + 1;
          trees = shrunken_tree :: iteration.trees;
          convergence_rate_smoother;
        } in
        learn_with_fold_rate conf t iteration

(* Halve the learning rate, reset the splitter back to the initial
   tree, and retry the fold from iteration 1 with a fresh RNG state. *)
and cut_learning_rate conf t iteration =
  (* cut the learning rate in half and try again *)
  let learning_rate = 0.5 *. iteration.learning_rate in
  pr "reducing learning rate from %f to %f\n%!"
    iteration.learning_rate learning_rate;
  let first_tree = iteration.first_tree in
  reset t first_tree;
  let new_random_seed = [| Random.int 10_000 |] in
  let iteration = {
    iteration with
    learning_rate;
    random_state = Random.State.make new_random_seed;
    prev_loss = iteration.first_loss;
    i = 1;
    trees = [first_tree]
  } in
  learn_with_fold_rate conf t iteration
(* Train the tree sequence for one cross-validation fold: observations
   whose fold id differs from [fold] form the working set; the rest are
   held out as the validation set used to detect convergence.

   Note: the final positional argument is unused -- the deadline is
   read from [conf.deadline] instead -- so it is bound as [_deadline]
   to document that fact (callers' positional call sites are
   unaffected). *)
let learn_with_fold conf t fold initial_learning_rate _deadline =
  let fold_set = Array.init t.n (fun i -> t.folds.(i) <> fold) in
  let first_tree = t.splitter#first_tree fold_set in
  reset t first_tree;
  let { Loss.s_wrk; s_val; val_loss = first_val_loss } =
    t.splitter#metrics (Array.get fold_set) in

  pr "fold % 3d %s %s\n%!" fold s_wrk s_val;

  let new_random_seed = [| Random.int 10_000 |] in

  let timeout =
    match conf.deadline with
    | None -> fun () -> false (* don't timeout *)
    | Some deadline ->
      fun () ->
        Unix.gettimeofday () >= deadline (* obey deadline *)
  in

  let convergence_rate_smoother = Rls1.create
      conf.convergence_rate_smoother_forgetful_factor in

  let iteration = {
    i = 1;
    fold;
    fold_set;
    first_loss = first_val_loss;
    prev_loss = first_val_loss;
    first_tree;
    trees = [first_tree];
    learning_rate = initial_learning_rate;
    convergence_rate_smoother;
    random_state = Random.State.make new_random_seed;
    timeout
  } in
  learn_with_fold_rate conf t iteration
(* Determine each observation's cross-validation fold.  Folds are
   assigned randomly unless the configuration names a fold feature, in
   which case that feature's values define the folds and the feature
   (plus any duplicates) is removed from the returned feature map so
   it cannot leak into the model as a predictor.
   Fix: fatal errors and warnings now go to stderr ([epr]) like every
   other diagnostic in this function, instead of stdout. *)
let folds_of_feature_name conf sampler feature_map n y_feature_id =
  match conf.fold_feature_opt with
  | None ->
    (* randomly choose fold assignments *)
    let folds = Sampler.array (
        fun ~index ~value ->
          value mod conf.num_folds
      ) sampler in
    folds, feature_map

  | Some fold_feature ->
    match Feat_map.find feature_map fold_feature with
    | [] ->
      epr "feature %S to be used for fold assignment not found\n%!"
        (Feat_utils.string_of_feature_descr fold_feature);
      exit 1

    | fold_features ->
      let num_fold_features = List.length fold_features in
      if num_fold_features > 1 then
        epr "[WARNING] There are %d fold features satisfying %s\n%!"
          num_fold_features
          (Feat_utils.string_of_feature_descr fold_feature);

      (* arbitrarily pick the first fold feature (should there be
         more than one) *)
      let i_fold_feature = List.hd fold_features in
      let fold_feature_id = Feat_utils.id_of_feature i_fold_feature in
      if fold_feature_id = y_feature_id then (
        epr "[ERROR] Fold feature and target feature must be different\n%!";
        exit 1
      );

      (* fold feature found; use it to construct folds *)
      let a_fold_feature = Feat_map.i_to_a feature_map i_fold_feature in
      match Feat_utils.folds_of_feature ~n ~num_folds:conf.num_folds
              a_fold_feature with
      | `TooManyCategoricalFolds cardinality ->
        epr "[ERROR] The cardinality of categorical feature %s is %d, which is \
             too small relative to the number of folds %d\n%!"
          (Feat_utils.string_of_feature_descr fold_feature)
          cardinality conf.num_folds;
        exit 1

      | `TooManyOrdinalFolds cardinality ->
        epr "[ERROR] The cardinality of ordinal feature %s is %d, which is \
             too small relative to the number of folds %d\n%!"
          (Feat_utils.string_of_feature_descr fold_feature)
          cardinality conf.num_folds;
        exit 1

      | `Folds folds ->
        (* remove all the fold_features from the [feature_map] *)
        let feature_map =
          List.fold_left (
            fun feature_map_0 a_fold_feature ->
              let fold_feature_id = Feat_utils.id_of_feature
                  a_fold_feature in
              let fold_feature_name = Feat_utils.name_of_feature a_fold_feature in
              let () = match fold_feature_name with
                | Some name ->
                  epr "[INFO] excluding fold feature %s (id: %d)\n%!"
                    name fold_feature_id;
                | None ->
                  epr "[INFO] excluding nameless fold feature (id: %d)\n%!"
                    fold_feature_id;
              in
              Feat_map.remove feature_map_0 fold_feature_id
          ) feature_map fold_features in
        folds, feature_map
(* Top-level training entry point: read the dog file, resolve the
   target feature, drop excluded features, assign folds, learn one
   tree sequence per fold, average ("bag") the per-fold models into a
   single model, and serialize it to [conf.output_file_path]. *)
let learn conf =
  let dog_reader = Dog_io.RO.create conf.dog_file_path in
  let feature_map = Feat_map.create dog_reader in

  let num_observations =
    let dog = Dog_io.RO.dog dog_reader in
    dog.Dog_t.num_observations
  in
  let n = num_observations in

  (* each fold must hold out at least one observation *)
  assert ( conf.num_folds > 0 );
  if conf.num_folds >= n then (
    pr "number of folds %d must be smaller than the number of observations \
        %d\n%!" conf.num_folds n;
    exit 1
  );

  (* resolve the target: exactly one feature must match [conf.y] *)
  let y_feature =
    match Feat_map.find feature_map conf.y with
    | [] ->
      pr "target %s not found\n%!"
        (Feat_utils.string_of_feature_descr conf.y);
      exit 1
    | (_ :: _ :: _) as y_features ->
      pr "%d target features satisfying %s found; only one expected\n%!"
        (List.length y_features)
        (Feat_utils.string_of_feature_descr conf.y);
      exit 1
    | [y_feature] ->
      Feat_map.i_to_a feature_map y_feature
  in

  (* remove target from the feature set *)
  let feature_map = Feat_map.remove feature_map
      (Feat_utils.id_of_feature y_feature) in

  (* remove excluded features, if any, counting how many matched *)
  let feature_map, num_excluded_features =
    match conf.excluded_feature_name_regexp_opt with
    | None -> feature_map, 0
    | Some rex ->
      let num_excluded = ref 0 in
      let is_excluded feature =
        match Feat_utils.name_of_feature feature with
        | None -> false (* anonymous features cannot be excluded *)
        | Some name ->
          let is_ex = Pcre.pmatch ~rex name in
          if is_ex then
            incr num_excluded;
          is_ex
      in
      let is_included _ feature =
        not (is_excluded feature)
      in
      let feature_map = Feat_map.filter feature_map is_included in
      feature_map, !num_excluded
  in

  (* seed the sampler deterministically (see [random_seed]) *)
  let random_state = Random.State.make random_seed in
  let sampler = Sampler.create n in
  Sampler.shuffle sampler random_state;

  let folds, feature_map =
    let y_feature_id = Feat_utils.id_of_feature y_feature in
    folds_of_feature_name conf sampler feature_map n y_feature_id
  in

  assert (
    (* make sure fold features (if any) are gone from the
       [feature_map] *)
    match conf.fold_feature_opt with
    | Some fold_feature -> (
        let ff = Feat_map.find feature_map fold_feature in
        match ff with
        | [] -> true
        | _ -> false
      )
    | None -> true
  );

  pr "features: included=%d excluded=%d\n%!"
    (Feat_map.length feature_map) num_excluded_features;

  (* instantiate the splitter matching the configured loss *)
  let splitter : Loss.splitter =
    match conf.loss_type with
    | `Logistic ->
      new Logistic.splitter
        conf.binarization_threshold_opt y_feature num_observations
    | `Square ->
      new Square.splitter y_feature num_observations
  in

  let eval = Tree.mk_eval num_observations in

  let t = {
    n;
    feature_map;
    splitter;
    eval;
    sampler;
    folds
  } in

  (* learn each fold in turn, carrying the effective learning rate
     forward as the next fold's starting rate *)
  let rec loop fold trees_list initial_learning_rate =
    if fold < conf.num_folds then
      (* NOTE(review): the trailing 0.0 is the deadline argument,
         which [learn_with_fold] ignores in favor of [conf.deadline] *)
      match learn_with_fold conf t fold initial_learning_rate 0.0 with

      | `Converged (effective_learning_rate, trees) ->
        let trees_list = List.rev_append trees trees_list in
        (* set the initial learning rate of the next fold model to the
           effective learning rate of the previous one; this means that
           the learning rate can gradually decrease from one fold to the
           next, as a learning attempt on folds fail (diverge) *)
        loop (fold + 1) trees_list effective_learning_rate

      | `Timeout trees ->
        (* time's up! only include [trees] if no other trees were
           previously learned. *)
        match trees_list with
        | [] -> trees
        | _ -> trees_list

    else
      trees_list
  in

  (* combine the model learned for each fold into a mega-model,
     where these sequence of trees are simply averaged (bagged!) *)
  let trees = loop 0 [] conf.initial_learning_rate in
  let trees =
    let fold_weight = 1.0 /. (float conf.num_folds) in
    List.rev_map (
      fun tree ->
        Tree.shrink fold_weight tree
    ) trees
  in

  let trees, features = Model_utils.l_to_c feature_map trees in

  (* write model file *)
  let () =
    (* open channel *)
    let ouch = open_out conf.output_file_path in
    (* open output buffer *)
    let out_buf = Bi_outbuf.create_channel_writer ouch in
    (* write model to buffer *)
    splitter#write_model trees features out_buf;
    (* flush buffer *)
    Bi_outbuf.flush_channel_writer out_buf;
    (* close channel *)
    close_out ouch
  in
  ()