-
Notifications
You must be signed in to change notification settings - Fork 29
/
commons.py
54 lines (45 loc) · 1.9 KB
/
commons.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import rcParams
# Function for removing features with a high VIF (variance inflation factor)
def drop_high_vif(X, thresh=100):
    """Iteratively drop the column with the largest VIF until none exceeds ``thresh``.

    On every pass the variance inflation factor of each remaining column is
    computed with ``variance_inflation_factor`` and, if the largest value is
    above ``thresh``, that single column is removed and the VIFs are
    recomputed. The VIFs are computed on the columns of ``X`` exactly as
    given; this function does not add an intercept column.

    Progress is reported with ``print``: one line per dropped column and the
    remaining column labels at the end.

    Args:
        X: DataFrame of candidate features (one column per feature).
        thresh: Largest VIF a column may have and still be kept.

    Returns:
        A DataFrame holding the surviving columns of ``X`` in their original
        order. ``X`` itself is not modified.
    """
    # Positions (not labels) of the columns still in play, so duplicate or
    # non-string column labels cannot confuse the bookkeeping.
    keep = np.arange(X.shape[1])
    dropped = True
    # The size guard stops the loop once no columns are left: there is
    # nothing to score and max() on an empty list would raise ValueError.
    while dropped and keep.size:
        dropped = False
        design = X.iloc[:, keep].values
        vif = [variance_inflation_factor(design, ix)
               for ix in range(design.shape[1])]

        worst_vif = max(vif)
        maxloc = vif.index(worst_vif)
        if worst_vif > thresh:
            # `maxloc` is the position among the *remaining* columns.
            # str.format (instead of `+`) also copes with non-string labels.
            print("dropping '{}' at index: {}".format(
                X.columns[keep[maxloc]], maxloc))
            keep = np.delete(keep, maxloc)
            dropped = True

    print('Remaining variables:')
    print(X.columns[keep])
    return X.iloc[:, keep]
# Function for listing VIF values
def vif_values(X):
    """Return the variance inflation factor of every column of ``X``.

    An intercept column is added to the design matrix before the VIFs are
    computed (``add_constant`` returns a new object rather than modifying its
    argument, so its result has to be used). The intercept itself is not
    reported: the result has exactly one entry per column of ``X``.

    Args:
        X: DataFrame of features (one column per feature).

    Returns:
        A pandas Series of VIF values indexed by ``X.columns``.
    """
    design = add_constant(X)
    # By default add_constant prepends the intercept, and skips it when X
    # already contains a constant column; the offset maps column i of X to
    # its position in the augmented design in both cases.
    offset = design.shape[1] - X.shape[1]
    exog = design.values
    vifs = [variance_inflation_factor(exog, offset + i)
            for i in range(X.shape[1])]
    return pd.Series(vifs, index=X.columns)
# function for creating a feature importance dataframe
def feature_importance(column_names, importances):
    """Build a feature-importance table sorted from most to least important.

    Args:
        column_names: Feature names, one per importance value.
        importances: Importance scores aligned with ``column_names``.

    Returns:
        A DataFrame with columns ``feature`` and ``feature_importance``,
        ordered by descending importance and carrying a fresh 0..n-1 index.
    """
    table = pd.DataFrame({'feature': column_names,
                          'feature_importance': importances})
    ranked = table.sort_values('feature_importance', ascending=False)
    return ranked.reset_index(drop=True)
# plotting a feature importance dataframe (horizontal barchart)
def plot_feature_importance(imp_df, title):
    """Draw a horizontal bar chart of feature importances with seaborn.

    Args:
        imp_df: Two-column DataFrame; the first column holds the feature
            names and the second their importance scores. The columns are
            relabelled on a copy, so the caller's frame is left untouched.
        title: Text used as the chart title.

    Side effects:
        Sets ``rcParams['figure.figsize']`` globally, so figures created
        afterwards use this size as well.
    """
    # figure size in inches (presumably A4 landscape)
    rcParams['figure.figsize'] = 11.7, 8.27
    # Rename on a copy: assigning to imp_df.columns would silently rename
    # the columns of the DataFrame owned by the caller.
    plot_df = imp_df.copy()
    plot_df.columns = ['feature', 'feature_importance']
    sns.barplot(x='feature_importance', y='feature', data=plot_df, color='royalblue') \
        .set_title(title, fontsize=20)
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Args:
        y_true: Observed values (array-like).
        y_pred: Predicted values (array-like), or a scalar that is compared
            against every observation.

    Returns:
        The MAPE as a float. If ``y_true`` contains zeros the division yields
        ``inf``/``nan`` (NumPy emits a RuntimeWarning) and the result is not
        meaningful.
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    # A flat vector paired with a column vector, e.g. (n,) vs (n, 1), would
    # broadcast to an (n, n) matrix and silently give a wrong mean. When both
    # inputs hold the same number of elements and agree once length-1 axes
    # are removed, compare them element by element instead.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        flat_true, flat_pred = np.squeeze(y_true), np.squeeze(y_pred)
        if flat_true.shape == flat_pred.shape:
            y_true, y_pred = flat_true, flat_pred
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100