-
Notifications
You must be signed in to change notification settings - Fork 0
/
code.R
282 lines (186 loc) · 9.21 KB
/
code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
# https://archive.ics.uci.edu/ml/datasets/Bank+Marketing#
# [Moro et al., 2014] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to
# Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014
# The data is related with direct marketing campaigns of a Portuguese banking institution. The marketing campaigns
# were based on phone calls. Often, more than one contact to the same client was required, in order to access
# if the product (bank term deposit) would be ('yes') or not ('no') subscribed.
library(skimr)
library(ggplot2)
library(tidyverse)
library(dplyr)
library(caret)
library(corrplot)
library(pROC)
bank_full <- read.csv('bank-additional-full.csv', sep = ';')
head(bank_full)
summary(bank_full)
skim(bank_full)
# Removing duplicates
sum(duplicated(bank_full))
idx <- which(duplicated(bank_full)==TRUE)
bank_full <- bank_full[-idx,]
rm(idx)
# Target variable : y ('yes','no') -> subscribe a term deposit
bank_full %>% ggplot(aes(y)) + geom_bar()
# drop: duration -> this attribute highly affects the output target (e.g., if duration=0 then y='no').
# Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known.
# Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to
# have a realistic predictive model.
bank_full <- bank_full %>% select(-duration)
#numeric
library("Hmisc")
num_bank_full <- bank_full %>% select(age, campaign, pdays, previous, emp.var.rate, cons.price.idx, cons.conf.idx,
euribor3m,nr.employed)
res <- rcorr(as.matrix(num_bank_full))
res
corrplot(res$r, type = "lower",tl.col = "black", tl.srt = 45)
heatmap(res$r,symm = TRUE)
detach("package:Hmisc", unload = TRUE)
rm(res,num_bank_full)
# age
bank_full %>% ggplot(aes(age)) + facet_grid("y", scales='free') + geom_histogram(color='black', bins=35) +
theme(axis.text.x = element_text(angle=45))
# job
bank_full %>% ggplot(aes(job)) + facet_grid("y", scales='free') + geom_bar() +
theme(axis.text.x = element_text(angle=45))
# day of the week
bank_full$day_of_week <- factor(bank_full$day_of_week, levels=c("mon", "tue", "wed", "thu", "fri"))
bank_full %>% ggplot(aes(day_of_week)) + facet_grid("y", scales='free') + geom_bar() +
theme(axis.text.x = element_text(angle=45))
# marital status
bank_full %>% ggplot(aes(marital)) + facet_grid("y", scales='free') + geom_bar() +
theme(axis.text.x = element_text(angle=45))
bank_full %>% filter(marital=="unknown") %>% group_by(y) %>% summarize(n())
# bank_full <- bank_full %>% filter(marital!="unknown")
# education: Put basic categories together and illiterate inside unknown?
bank_full %>% ggplot(aes(education)) + facet_grid("y", scales='free') + geom_bar() +
theme(axis.text.x = element_text(angle=45))
bank_full %>% filter(education=="illiterate") %>% group_by(y) %>% summarize(n())
#default
bank_full %>% ggplot(aes(default)) + geom_bar()
bank_full[which(bank_full$default=="yes"),] # should we drop it? lots of unknown and little yeses
bank_full %>% ggplot(aes(default)) + facet_grid("y", scales='free') + geom_bar() +
theme(axis.text.x = element_text(angle=45))
# housing
bank_full %>% ggplot(aes(housing)) + geom_bar()
# loan
bank_full %>% ggplot(aes(loan)) + geom_bar()
# contact
bank_full %>% ggplot(aes(contact)) + geom_bar()
# month
unique(bank_full$month)
bank_full$month <- factor(bank_full$month, levels=c("mar", "apr", "may", "jun", "jul",
"aug", "sep", "oct", "nov", "dec"))
bank_full %>% ggplot(aes(month)) + geom_bar()
bank_full %>% ggplot(aes(month)) + facet_grid("y", scales='free') + geom_bar() +
theme(axis.text.x = element_text(angle=45))
# campaign
summary(bank_full$campaign)
unique(bank_full$campaign)
bank_full %>% ggplot(aes(campaign)) + facet_grid("y", scales='free') + geom_histogram(color='black', bins=20) +
theme(axis.text.x = element_text(angle=45)) + scale_x_log10()
bank_full %>% ggplot(aes(campaign, fill=y)) + geom_boxplot() + scale_x_log10()
# pdays: number of days that passed by after the client was last contacted from a previous campaign
# (numeric; 999 means client was not previously contacted)
bank_full %>% ggplot(aes(pdays)) + geom_histogram(bins=25)
contacted_before <- bank_full %>% filter(pdays<999) %>% select(pdays)
contacted_before %>% ggplot(aes(pdays)) + geom_histogram(bins=15, color='black')
rm(contacted_before)
# pdays and previous
summary(bank_full$pdays) # 999 values
bank_full %>% filter(pdays==999) %>% dim() %>% .[1]
bank_full %>% filter(pdays==999) %>% dim() %>% .[1]/dim(bank_full)[1] #percentage
bank_full %>% filter(previous==0) %>% dim() %>% .[1]
bank_full %>% filter(previous==0) %>% dim() %>% .[1]/dim(bank_full)[1]
# drop 'pdays'
# poutcome
bank_full %>% ggplot(aes(poutcome)) + geom_bar()
bank_full %>% ggplot(aes(poutcome)) + facet_grid("y", scales='free') + geom_bar() +
theme(axis.text.x = element_text(angle=45))
# emp.var.rate
summary(bank_full$emp.var.rate)
bank_full %>% ggplot(aes(emp.var.rate)) + facet_grid("y", scales='free') + geom_histogram(color='black', bins=10) +
theme(axis.text.x = element_text(angle=45))
# cons.price.idx
summary(bank_full$cons.price.idx)
bank_full %>% ggplot(aes(cons.price.idx)) + facet_grid("y", scales='free') + geom_histogram(color='black', bins=10) +
theme(axis.text.x = element_text(angle=45))
# cons.conf.idx
summary(bank_full$cons.conf.idx)
bank_full %>% ggplot(aes(cons.conf.idx)) + facet_grid("y", scales='free') + geom_histogram(color='black', bins=10) +
theme(axis.text.x = element_text(angle=45))
# euribor3m
summary(bank_full$euribor3m)
bank_full %>% ggplot(aes(euribor3m)) + facet_grid("y", scales='free') + geom_histogram(color='black', bins=10) +
theme(axis.text.x = element_text(angle=45))
# nr.employed
summary(bank_full$nr.employed)
bank_full %>% ggplot(aes(nr.employed)) + facet_grid("y", scales='free') + geom_histogram(color='black', bins=10) +
theme(axis.text.x = element_text(angle=45))
# chi-squared test
chisq.test(bank_full$y, bank_full$job, correct=FALSE)
chisq.test(bank_full$y, bank_full$marital, correct=FALSE)
chisq.test(bank_full$y, bank_full$education, correct=FALSE) #check
table(bank_full$y, bank_full$education) # illiterate -> unknown
bank_full$education[bank_full$education == "illiterate"] <- "unknown"
chisq.test(bank_full$y, bank_full$education, correct=FALSE) #check
chisq.test(bank_full$y, bank_full$default, correct=FALSE) #check
table(bank_full$y, bank_full$default) # drop? only 3 yeses
chisq.test(bank_full$y, bank_full$housing, correct=FALSE) #####
chisq.test(bank_full$y, bank_full$loan, correct=FALSE) #####
chisq.test(bank_full$y, bank_full$contact, correct=FALSE)
chisq.test(bank_full$y, bank_full$month, correct=FALSE)
chisq.test(bank_full$y, bank_full$day_of_week, correct=FALSE)
chisq.test(bank_full$y, bank_full$poutcome, correct=FALSE)
#integer
chisq.test(bank_full$y, bank_full$previous, correct=FALSE) # too small
table(bank_full$y, bank_full$previous) # put 3-7 together?
# Modeling
bank_full_2 <- bank_full %>% mutate(job = as.factor(job), marital = as.factor(marital),
education = as.factor(education), contact = as.factor(contact),
day_of_week = as.factor(day_of_week), poutcome = as.factor(poutcome)) %>%
select(age, job, marital, education, contact, month, day_of_week, campaign, previous, poutcome,
emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, y)
rm(bank_full)
# Partition: train and test
set.seed(123, sample.kind = "Rounding")
index <- createDataPartition(bank_full_2$y, p=0.7, list=FALSE)
train <- bank_full_2[-index,]
test <- bank_full_2[index,]
rm(index,bank_full_2)
train <- train %>% mutate(y = as.factor(y))
# Logistic Regression
set.seed(123, sample.kind = "Rounding")
model <- glm(y ~ ., data = train, family = binomial)
summary(model)$coef
probabilities <- model %>% predict(test, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "yes", "no")
cm <- confusionMatrix(as.factor(predicted.classes), as.factor(test$y), positive = 'yes')
cm
roc_glm <- roc(response=as.ordered(predicted.classes), predictor=as.ordered(test$y), auc=TRUE)
roc_glm$auc #0.79
rm(model,probabilities,predicted.classes,roc_glm)
# Random Forest
set.seed(123, sample.kind = "Rounding")
fit_rf <- train(y ~ ., data = train, method = 'rf', ntree = 100,
tuneGrid = data.frame(mtry = seq(1:7)), trControl= ctrl, metric='ROC') #6
fit_rf$results$mtry[which.max(fit_rf$results$ROC)]
rf_pred <- predict(fit_rf, test)
cm <- confusionMatrix(as.factor(rf_pred), as.factor(test$y), positive = 'yes')
cm
roc_rf <- roc(response=as.ordered(rf_pred), predictor=as.ordered(test$y), auc=TRUE)
roc_rf$auc #0.7414
rm(rf_pred,roc_rf,fit_rf)
#Generalized Boosted Regression Model
set.seed(123, sample.kind = "Rounding")
fit_gbm <- train(y ~ ., data = train,
method = "gbm",
trControl = ctrl,
metric = "ROC",
verbose = FALSE)
gbm_pred <- predict(fit_gbm, test)
cm <- confusionMatrix(as.factor(gbm_pred), as.factor(test$y), positive = 'yes')
cm
roc_gbm <- roc(response=as.ordered(gbm_pred), predictor=as.ordered(test$y), auc=TRUE)
roc_gbm$auc #0.7846
rm(fit_gbm,gbm_pred,roc_gbm)