-
Notifications
You must be signed in to change notification settings - Fork 0
/
Framingham_utils.py
201 lines (167 loc) · 7.39 KB
/
Framingham_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
# Copyright (C) 2023 Antonio Rodriguez
#
# This file is part of CVD_risk_and_TL.
#
# CVD_risk_and_TL is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# CVD_risk_and_TL is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with CVD_risk_and_TL.
# If not, see <http://www.gnu.org/licenses/>.
# Dependencies
import os
import pandas as pd
import numpy as np
from typing import Tuple
from sklearn.preprocessing import OneHotEncoder
def prepare_Framingham(dataset_path : str, filename : str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Index, str]:
    """Read the Framingham dataset from a .csv file and split it into
    features and target. The last column of the file is taken as the target
    variable; every other column is a feature.

    Note that this function changes the process working directory to
    ``dataset_path`` and does not restore it.

    Args:
    -----
            dataset_path: path where dataset is stored.
            filename : file name of the .csv containing the dataset.

    Returns:
    --------
            data: dataframe containing the whole dataset
            X : dataframe containing the dataset features
            Y : series containing only the target variable
            cols_names: pandas Index holding the feature names.
            y_tag: string containing target variable name.
    """
    # Go to dataset path; ``filename`` is resolved relative to it.
    # Kept as-is because callers may rely on the changed working directory.
    os.chdir(dataset_path)

    # Open the .csv file and convert it into DataFrame
    data = pd.read_csv(filename)

    # Replace literal 'nan' strings by real missing values
    data.replace('nan', np.nan, inplace=True)

    # The target is the last column, the features are all the preceding ones
    all_columns = data.columns
    y_tag = all_columns[-1]
    cols_names = all_columns[:-1]

    # Split the dataset into features and target
    X = data[cols_names]
    Y = data[y_tag]

    return data, X, Y, cols_names, y_tag
def numerical_conversion_Framingham(data : np.ndarray, features : 'list[str] | pd.Index', y_col : str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Fix all Framingham database features data types to its original type after KNNImputer is used,
    since that imputer returns only a floating point ndarray. For more, check sklearn
    documentation at
    https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html.
    The ndarray is first converted to a pd.DataFrame whose columns are the feature
    names followed by the target name, and then every column is cast to its
    expected type.

    Args:
    -----
            data: data returned by KNN Imputer (float data type). Its columns must
                  be ordered as ``features`` followed by ``y_col``.
            features: feature names of the dataset (a list of strings or a
                  pandas Index). It is not modified.
            y_col: target variable (i.e., Y) name

    Returns:
    --------
            data: dataframe containing the whole dataset after imputation
            X : dataframe containing the dataset features after imputation
            y : single-column dataframe containing only the target variable after imputation

    Raises:
    -------
            KeyError: if any of the expected Framingham columns is missing.
    """
    # Build the column names without touching the caller's object.
    # ``list.insert`` returns None (and mutates in place), so relying on the
    # return value of ``features.insert`` only worked for pandas Index inputs.
    feature_names = list(features)
    names = feature_names + [y_col]

    # From ndarray to pd.DataFrame
    data = pd.DataFrame(data, columns=names)

    # Fixing necessary datatypes (including categorical variables).
    # Casting float to int drops the fractional part of the values.
    column_types = {
        'male': int,
        'age': int,
        'education': int,
        'currentSmoker': int,
        'cigsPerDay': int,
        'BPMeds': int,
        'prevalentStroke': int,
        'prevalentHyp': int,
        'diabetes': int,
        'totChol': int,
        'sysBP': float,
        'diaBP': float,
        'BMI': float,
        'heartRate': int,
        'glucose': int,
        'TenYearCHD': int,
    }
    for column, column_type in column_types.items():
        data[column] = data[column].astype(column_type)

    # Separate X and Y
    X = data[feature_names]
    y = data[[y_col]]

    return data, X, y
def general_conversion_Framingham (data : pd.DataFrame) -> pd.DataFrame :
    """Cast every Framingham column back to its expected data type.

    The ``education`` column is stored as "object"; the blood pressure and
    BMI columns are stored as float; every remaining column (binary flags
    included) is stored as int. The casts are applied directly on the given
    DataFrame, which is also returned.

    Args:
    -----
            data: dataset with datatypes not corresponding to the original ones.

    Returns:
    --------
            data: the same dataframe, with the original datatypes
    """
    # Expected type of each column, in the order the casts are applied
    expected_types = {
        'male': int,
        'age': int,
        'education': 'object',
        'currentSmoker': int,
        'cigsPerDay': int,
        'BPMeds': int,
        'prevalentStroke': int,
        'prevalentHyp': int,
        'diabetes': int,
        'totChol': int,
        'sysBP': float,
        'diaBP': float,
        'BMI': float,
        'heartRate': int,
        'glucose': int,
        'TenYearCHD': int,
    }

    for column, expected in expected_types.items():
        data[column] = data[column].astype(expected)

    return data
def num2cat_Framingham(data : pd.DataFrame):
    """Replace the numerical codes of the ``education`` column of the
    Framingham database by their corresponding category labels. The column
    is overwritten on the given DataFrame, which is also returned.

    Args:
    -----
            data: dataset with categories represented by numbers.

    Returns:
    --------
            data: dataframe with the categories represented by their corresponding string
    """
    # Numerical codes and the label each one is replaced by (same position)
    education_codes = [1, 2, 3, 4]
    education_labels = ['edu1', 'edu2', 'edu3', 'edu4']

    relabelled = data['education'].replace(education_codes, education_labels)
    data['education'] = relabelled

    return data
def one_hot_enc_Framingham(data):
    """One-hot encode the ``education`` column of the Framingham database.

    The ``education`` column is replaced by its one-hot encoded columns and
    the ``TenYearCHD`` target column is moved to the end. The ``education``
    column of the given DataFrame is cast to "category" in place before
    encoding; the returned DataFrame is a new object with a fresh index.

    Args:
    -----
            data: dataset containing the ``education`` and ``TenYearCHD`` columns.

    Returns:
    --------
            data: dataframe after One-Hot Encoding
    """
    # Cast the categorical column on the caller's frame before encoding it
    data[['education']] = data[['education']].astype('category')

    # Fit the encoder on the categorical column and densify its output
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoded = encoder.fit_transform(data[['education']])
    edu = pd.DataFrame(encoded.toarray())
    # NOTE(review): categories_ is a list with one array per encoded feature,
    # so the resulting column labels may end up tuple-like rather than plain
    # strings - confirm this is what downstream code expects.
    edu.columns = encoder.categories_
    edu = edu.reset_index(drop=True)

    # Keep the target variable apart so it can be appended as the last column
    target = data[['TenYearCHD']].reset_index(drop=True)

    # Remaining columns: everything but the target and the original categorical column
    remaining = data.drop(['TenYearCHD', 'education'], axis=1).reset_index(drop=True)

    # Join the one-hot encoded columns and the target
    return remaining.join([edu, target])