-
Notifications
You must be signed in to change notification settings - Fork 0
/
hd_log_reg_fullcode.R
154 lines (131 loc) · 5.29 KB
/
hd_log_reg_fullcode.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
##------------------------------------------------------------------------------------##
## FULL CODE ##
##------------------------------------------------------------------------------------##
#import dataframe: csv
df<-read.csv("heart.csv", header =TRUE)
summary(df)
head(df,5)
#change erronous attribute name: ï..age
colnames(df)[colnames(df)=='ï..age'] <-'age'
head(df,5)
#check the type of each attribute and change to factor or int
str(df)
df$sex <- as.factor(df$sex)
df$cp <- as.factor(df$cp)
df$fbs <- as.factor(df$fbs)
df$restecg <- as.factor(df$restecg)
df$exang <- as.factor(df$exang)
df$slope <- as.factor(df$slope)
df$thal <- as.factor(df$thal)
#check for any missing data
library(Amelia)
missmap(df, main = "Missing Data vs Observed")
#check the ratio of reponse variable and see if it requires to rebalance
library(plyr)
count(df,vars="target")
#Data Visualization
library(GGally)
library(ggsci)
#1. Count of Target variable
ggplot(df, aes(factor(target), fill=factor(target)))+
geom_bar(stat="count", width=0.5, color="black")+
ggtitle("Count of Target")+xlab("Target")+ylab("Count")+labs(fill="Target")+
theme_bw()+
scale_fill_npg()
#2. Barplot of Gender broken down by Target
ggplot(df, aes(sex, fill=factor(target)))+
geom_bar(stat="count", width=0.5, color="black", position=position_dodge())+
ggtitle("Gender of Patient, Broken by Target")+xlab("Gender")+ylab("Count")+labs(fill="Target")+
theme_bw()+
scale_fill_npg()
#3. Barplot for categorical variables broken down by Target
cat_var <- c("sex", "cp", "fbs", "restecg", "exang", "slope", "thal", "target")
cat_df <- df[cat_var]
for(i in 1:7){
print(ggplot(cat_df, aes(x=cat_df[,i], fill=factor(target)))+
geom_bar(stat="count", width=0.5, color="black", position=position_dodge())+
ggtitle(paste(colnames(cat_df)[i], ": Broken by Target"))+
xlab(colnames(cat_df)[i])+ylab("Count")+labs(fill="Target")+
theme_bw()+
scale_fill_npg())
}
#4. Age distribution by Target
mp <- ggplot(df, aes(sex, age, fill=factor(target)))+
geom_boxplot(width=0.5)+
ggtitle("Distribution of Age, by Gender", subtitle="")+xlab("Gender")+ylab("Age")+labs(fill="Target")+
theme_bw()+
scale_fill_npg()
#4-1. subplots
xplot <- ggplot(df, aes(sex, fill=factor(target)))+
geom_bar(stat="count", width=0.5, alpha=0.4,color="black", position=position_dodge())+
labs(fill="Target")+
theme_bw()+
scale_fill_npg()
yplot <- ggplot(df, aes(age, fill=factor(target)))+
geom_density(alpha=0.4)+
labs(fill="Target")+
theme_bw()+
scale_fill_npg()+
rotate()
#4-2. combination
library(ggpubr)
ggarrange(xplot, NULL, mp, yplot,
ncol =2, nrow=2, align = "hv",
widths= c(3,1), heights = c(1,2),
common.legend= TRUE)
#5. Boxplot for continuous variables broken down by Target
cont_var <- c("age", "trestbps", "chol", "thalach", "oldpeak", "target")
cont_df <- df[cont_var]
for(i in 1:5){
print(ggplot(cont_df, aes(x=cont_df[,i], y=factor(target), fill=factor(target)))+
geom_boxplot(width=0.5)+
geom_point(position=position_dodge(width=0.5), alpha=0.2)+
ggtitle(paste(colnames(cont_df)[i], ": Broken by Target"))+
xlab(colnames(cont_df)[i])+ylab("Target")+labs(fill="Target")+
theme_minimal()+
scale_fill_npg())
}
#Train,Test Split
library(caTools)
set.seed(123)
sample <- sample.split(df, SplitRatio = 0.75) #Randomly set identifier
train_df <- subset(df, sample==TRUE) #Train dataset
test_df <- subset(df, sample==FALSE) #Test dataset
#Logistic Regression: full fitting with train dataset
df_model <- glm(target~., data=train_df, family=binomial(link="logit"))
exp(coef(df_model))
summary(df_model)
#create a model with the statistically siginifcant variables only
df_model.part <- glm(target~sex+cp+trestbps+thalach+oldpeak+ca, data=train_df, family=binomial(link="logit"))
exp(coef(df_model.part))
summary(df_model.part)
print(df_model.part$aic - df_model$aic) #difference of AIC score
#validate that the reduced model is statistically siginifcant over the full model
anova(df_model.part, df_model, test="Chisq")
#ROC and AUC for the partial model
library(pROC)
par(pty="s")
roc(train_df$target, df_model.part$fitted.values, plot=TRUE, legacy.axes=TRUE,
col="Red", print.auc = TRUE, print.auc.x=0.35,
xlab="False Positive Rate",
ylab="True Positive Rate")
#Fiiting the model to test dataset and Confusion Matrix of the fitted model
df_model_fit <- predict(df_model.part, newdata=test_df, type="response")
df_model_confmat <- ifelse(df_model_fit >0.5, 1, 0)
library(caret)
confusionMatrix(factor(df_model_confmat), factor(test_df$target), positive=as.character(1))
#ROC and AUC for the fitted model
library(ROCR)
df_prediction <- prediction(df_model_fit, test_df$target)
df_performance <- performance(df_prediction, measure = "tpr", x.measure="fpr")
plot(df_performance, col = "Red",
main = "ROC Curve",
xlab="False Postiive Rate", ylab="True Positive Rate")+
abline(a=0, b=1, col= "Grey", lty=2)+
abline(v=0, h=1, col= "Blue", lty=3)+
plot(df_performance, col = "Red",
main = "ROC Curve",
xlab="False Postiive Rate", ylab="True Positive Rate",add=TRUE)
df_auc <- performance(df_prediction, measure = "auc")
df_auc <- df_auc@y.values
print(paste("AUC Score: ", lapply(df_auc,round,3)))