041_RANDOM_FOREST.py
# Library imports used below (they may instead be provided by a shared header in the
# original pipeline). The project helpers (create_dir, load_data_for_modeling,
# create_parameters_rf, parallel_rf, update_validation, ROC_analysis, update_metrics,
# create_variable_score, update_var_score) and the run-level settings (method, SEED,
# nvar, eff_nvar, predictors, njob, probs_to_check) are assumed to be defined by the
# surrounding pipeline.
import numpy as np
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestClassifier

print(' >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ')
print(' RANDOM FOREST ---', 'VAR SEL:', method, '- SEED:', str(SEED), '- N° VAR:', str(eff_nvar))
print(' >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ')
model = 'RANDOM_FOREST'
dir_dest = 'results/MODELING/CLASSIFICATION/' + model + '/'
create_dir( dir_dest )
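# Load the train / validation / test split for this seed, restricted to the selected predictors.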
(training_set, validation_set, test_set,
 X_tr, X_val, X_ts,
 Y_tr, Y_val, Y_ts) = load_data_for_modeling( SEED, predictors)
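# Build the hyper-parameter grid; the max_features candidates are up to six distinct
# values drawn at random between 2 and eff_nvar.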
parameters = create_parameters_rf( method, nvar,
eff_nvar, SEED,
max_features_all = list(set(np.random.randint(2, eff_nvar, 6))))
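# Evaluate each row of the grid in parallel; parallel_rf is expected to return a
# (training_accuracy, validation_accuracy) pair for the i-th parameter combination.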
inputs = range( len(parameters))
tr_val_error = Parallel(n_jobs = njob)(delayed(parallel_rf)(i) for i in inputs)
train_accuracy = [accuracy[0] for accuracy in tr_val_error]
valid_accuracy = [accuracy[1] for accuracy in tr_val_error]
parameters['validation_accuracy'] = valid_accuracy
parameters['training_accuracy'] = train_accuracy
# parameters.to_csv(tree_dir_dest + 'validation.csv', index = False)
update_validation( MODEL = model, PARAMETERS = parameters, path = dir_dest )
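# Select the hyper-parameters of the row with the highest validation accuracy.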
ix_max = parameters.validation_accuracy.nlargest(1).index
n_estimators = parameters.loc[ix_max, 'n_estimators'].values[0]
max_depth = parameters.loc[ix_max, 'max_depth'].values[0]
min_samples_split = parameters.loc[ix_max, 'min_samples_split'].values[0]
max_features = parameters.loc[ix_max, 'max_features'].values[0]
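# Refit a random forest on the full training set with the selected hyper-parameters.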
random_forest = RandomForestClassifier(n_estimators = n_estimators,
max_depth = max_depth,
min_samples_split = min_samples_split,
max_features = max_features,
n_jobs = 4)
final_rf = random_forest.fit( X_tr, Y_tr )
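# Score the test set and run the ROC analysis over the configured probability thresholds.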
probs = final_rf.predict_proba(X_ts)
prediction = [p[1] for p in probs]  # probability of the second (positive) class
ROC = ROC_analysis( Y_ts, prediction, label = model,
probability_tresholds = probs_to_check)
ROC.to_csv(dir_dest + 'ROC.csv', index = False)
update_metrics(ROC, SEED, method, eff_nvar )
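# Save the feature importances of the final model as variable-importance scores.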
importance = create_variable_score ( model = model, SEED = SEED, VARIABLES = X_tr.columns,
SCORE = final_rf.feature_importances_,
method_var_sel = method, n_var = eff_nvar )
update_var_score( importance )
# ''' POST PROCESSING '''
# test_set = pd.concat( [ test_set, pd.Series(prediction)], axis = 1 )
# test_set_prediction = pd.concat([pd.Series( test_set.index.tolist()),
# test_set[test_set.columns[-3:]]],
# axis = 1)
# test_set_prediction.columns = ['ID', 'Y', 'ENERGY', 'Probability']
# update_prediction(prediction = test_set_prediction, SEED = SEED, MODEL = model, METHOD = method, NVAR = eff_nvar,)
# # test_set_prediction.to_csv( dir_dest + 'prediction_' + str(SEED) + '.csv')
#
# for energy in test_set.ENERGY.unique():
# if energy > 0:
# #energy = test_set.ENERGY.unique()[4]
# df = test_set[test_set.ENERGY == energy]
# probabilities = df.ix[:, -1].tolist()
# ROC_subset = ROC_analysis(y_true = df.Y.tolist(), y_prob = probabilities , label = model,
# probability_tresholds = probs_to_check)
# cols_roc = ROC_subset.columns.tolist() +[ 'Energy']
# ROC_subset = pd.concat( [ROC_subset,
# pd.Series( np.repeat(energy, len(probs_to_check)))],
# axis = 1 )
# ROC_subset.columns = cols_roc
# update_subset_metrics(ROC_subset, SEED, method, eff_nvar)
#