eval_feat_select.py
import warnings
from collections import defaultdict

import numpy as np
from sklearn import metrics as skm
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
def evaluate_features(
    X, y, seed=42, class_weight="balanced", return_pr_curve=False, n_splits=3
):
    # weight the positive class by the inverse positive rate
    if class_weight == "balanced":
        class_weight = y.size / y.sum()

    # collect per-fold cross-validation scores
    met_scores = defaultdict(list)

    # stratified k-fold split so each fold preserves the class balance
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # initialize model
        # m = DecisionTreeClassifier(
        #     random_state=seed, class_weight={0: 1, 1: class_weight}
        # )
        m = LogisticRegression(random_state=seed, class_weight={0: 1, 1: class_weight})
        # m = LogisticRegressionCV(
        #     random_state=seed, class_weight={0: 1, 1: class_weight}
        # )

        # fit model
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message=".*lbfgs failed to converge*")
            m.fit(X_train, y_train)

        # predict
        y_pred_proba = m.predict_proba(X_test)[:, 1]
        y_pred = m.predict(X_test)
        # calculate metrics on the held-out fold
        met_scores["accuracy"].append(skm.accuracy_score(y_test, y_pred))
        met_scores["f1"].append(skm.f1_score(y_test, y_pred, zero_division=0))
        met_scores["precision"].append(
            skm.precision_score(y_test, y_pred, zero_division=0)
        )
        met_scores["recall"].append(skm.recall_score(y_test, y_pred, zero_division=0))
        met_scores["roc_auc"].append(skm.roc_auc_score(y_test, y_pred_proba))

    # average each metric across folds
    met_scores = {
        k: np.mean(v) for k, v in met_scores.items() if not k.endswith("_curve")
    }
    if return_pr_curve:
        # precision-recall curve, computed on the last CV fold only
        met_scores["roc_auc_curve"] = skm.precision_recall_curve(y_test, y_pred_proba)
    return met_scores
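

# A minimal usage sketch (not part of the original file): it assumes X is a
# (n_samples, n_features) NumPy array and y a binary 0/1 NumPy vector, since
# evaluate_features indexes both with the StratifiedKFold indices and computes
# y.size / y.sum() for the positive-class weight. The synthetic data below is
# purely illustrative.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X_demo = rng.normal(size=(200, 5))
    # imbalanced labels loosely tied to the first feature
    y_demo = (X_demo[:, 0] + rng.normal(scale=0.5, size=200) > 1.0).astype(int)

    scores = evaluate_features(X_demo, y_demo, seed=0, n_splits=3)
    for name, value in scores.items():
        print(f"{name}: {value:.3f}")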