-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
113 lines (103 loc) · 4.86 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import itertools
import time

import numpy as np
import pandas as pd

from pipeline import G2JN_Pipeline
def load_boston():
    """Load the Boston housing data set from ``data/housing.csv``.

    The file is read as whitespace-separated values with no header row; the
    fourteen column names are supplied here.

    Returns:
        A ``(features, target, name)`` tuple: a DataFrame holding every
        column except the last one, a Series holding the last column
        (``MEDV``), and the label ``"Boston-Housing"``.
    """
    feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                     'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
    target_name = 'MEDV'

    frame = pd.read_csv(
        "data/housing.csv",
        header=None,
        delimiter=r"\s+",
        names=feature_names + [target_name],
    )

    # The target is the final column, so selecting by label is equivalent to
    # splitting "all but last" / "last" by position.
    features = frame[feature_names]
    target = frame[target_name]
    return features, target, "Boston-Housing"
def load_motors(one_hot=False):
    """Load the French motor claims data set from ``data/freMTPL2freq.csv``.

    The target is the claim frequency, computed as ``ClaimNb / Exposure``.
    The ``IDpol``, ``ClaimNb``, ``Exposure`` and ``Frequency`` columns are
    removed from the feature matrix.

    Args:
        one_hot: When true, every non-numeric feature column is expanded
            into float indicator columns named ``<column>_<value>`` and
            appended after the remaining columns. When false (default), the
            ``Area``, ``VehBrand``, ``VehGas`` and ``Region`` columns are
            replaced by integer codes numbered from 1 in order of first
            appearance.

    Returns:
        A ``(features, target, name)`` tuple: the feature DataFrame, the
        ``Frequency`` Series, and the label ``"French-Motor-claims"``.
    """
    # Reading the file and deriving the target is identical for both
    # encodings, so it is done once up front.
    motors_dataset = pd.read_csv('data/freMTPL2freq.csv')
    motors_dataset['Frequency'] = motors_dataset['ClaimNb'] / motors_dataset['Exposure']
    y_mot = motors_dataset['Frequency']
    X_mot = motors_dataset.drop(['IDpol', 'ClaimNb', 'Exposure', 'Frequency'], axis=1)

    if one_hot:
        # get_dummies picks the object/string/category columns itself and
        # keeps the untouched columns first, so no manual dtype comparison
        # (which misses non-object string dtypes) is needed.
        X_mot = pd.get_dummies(X_mot, dtype=float).reset_index(drop=True)
    else:
        for col in ['Area', 'VehBrand', 'VehGas', 'Region']:
            # factorize numbers values from 0 in order of appearance;
            # shifting by one keeps the 1-based codes. Missing values
            # (factorize code -1) therefore become 0.
            X_mot[col] = pd.factorize(X_mot[col])[0] + 1

    return X_mot, y_mot, "French-Motor-claims"
def parameters_tuning(X, y, name, output_path='motors_tuning_ALL_CTGAN.csv'):
    """Sweep ``G2JN_Pipeline.transform`` settings and log every run to CSV.

    A single pipeline is built from ``X``, ``y`` and ``name`` and fitted once
    with ``conf_int=95``. ``transform`` is then called for every combination
    of the grid below; after each successful call the pipeline's
    ``parameters`` attribute is appended as one row and the whole table is
    rewritten to ``output_path``, so partial results survive an interrupted
    sweep.

    Grid:
        threshold in (0.5, 0.7), percentile_threshold in (25, 50, 75),
        f_thr in (10, 30, 95), min_amount_samples_in_bin in (5, 10, 15),
        mutate in (True, False); with ``mutate=True`` additionally
        frac in (0.05, 0.2, 0.5).

    Args:
        X: Feature data passed to ``G2JN_Pipeline``.
        y: Target data passed to ``G2JN_Pipeline``.
        name: Data set label passed to ``G2JN_Pipeline``.
        output_path: CSV file the accumulated results are written to.
            Defaults to the previously hard-coded file name.

    Returns:
        None. Results are only written to ``output_path``.
    """
    res = G2JN_Pipeline(X, y, name)
    res.fit(conf_int=95)

    frac_choices = [0.05, 0.2, 0.5]
    grid = itertools.product(
        [0.5, 0.7],          # threshold
        [25, 50, 75],        # percentile_threshold
        [10, 30, 95],        # f_thr
        [5, 10, 15],         # min_amount_samples_in_bin
        [True, False],       # mutate
    )

    collected = []
    for thres, percenti_t, f_thr, min_in_bin, mutate in grid:
        # The non-mutating run used to pick up whatever ``frac`` the previous
        # mutate=True pass left behind, i.e. the last choice (0.5). That
        # value is now passed explicitly so the sweep no longer depends on a
        # stale loop variable while producing the same calls.
        fracs = frac_choices if mutate else frac_choices[-1:]
        for frac in fracs:
            settings = dict(
                samples_per_bin=30,
                max_bins=750,
                method='mean',
                threshold=thres,
                percentile_threshold=percenti_t,
                min_amount_samples_in_bin=min_in_bin,
                mutate=mutate,
                frac=frac,
                f_thr=f_thr,
            )
            try:
                res.transform(**settings)
                row = pd.DataFrame(res.parameters, index=[0])
            except Exception as exc:
                # Best effort: a failing combination is skipped, but it is
                # reported instead of vanishing silently, and
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                print("Skipping failed transform run:", settings, "-", repr(exc))
                continue

            collected.append(row)
            # Rewrite the checkpoint after every successful run. A failure
            # here (e.g. unwritable path) is raised rather than hidden.
            pd.concat(collected, ignore_index=True).to_csv(output_path, index=False)
    return
if __name__ == "__main__":
    # Script entry point: run the pipeline on the Boston housing data with
    # default settings, then on the French motor claims data with explicit
    # transform settings, and report the total wall-clock time.
    # The hyper-parameter sweep stays disabled here, as in the original:
    #parameters_tuning(load_motors())
    separator = "--------------------------------" * 3

    start = time.time()

    X_bos, y_bos, name = load_boston()
    for _ in range(3):
        print(separator)
    boston_pipeline = G2JN_Pipeline(X_bos, y_bos, name)
    boston_pipeline.fit()
    boston_pipeline.transform()

    for _ in range(3):
        print(separator)
    X_mot, y_mot, name = load_motors()
    motors_pipeline = G2JN_Pipeline(X_mot, y_mot, name)
    motors_pipeline.fit()
    motors_settings = {
        'max_bins': 1000,  # 750 or 1000
        'threshold': 0.3,
        'min_amount_samples_in_bin': 10,
        'percentile_threshold': 50,
        'mutate': False,
        'f_thr': False,
    }
    motors_pipeline.transform(**motors_settings)

    elapsed_time = time.time() - start
    minutes = int(elapsed_time / 60)
    seconds = int(elapsed_time % 60)
    print()
    print("Elapsed time:", minutes, "minutes", seconds, "seconds")