-
Notifications
You must be signed in to change notification settings - Fork 1
/
resampling_methods_ch5.Rmd
94 lines (66 loc) · 2.78 KB
/
resampling_methods_ch5.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
Resampling Methods - Chapter 5
========================================================
### Loading Prostate Cancer dataset into R
```{r}
library(ggplot2)
# Read the raw data file. No header row is requested, so the columns are
# named explicitly right after loading.
prostate_cancer <- read.table(
  "~/Documents/github/stat_learning/data/prostate_caner.dat",
  quote = "\""
)

# Name the first nine columns, in file order, with a single assignment.
colnames(prostate_cancer)[1:9] <- c(
  "idCode",
  "tumor",
  "age",
  "race",
  "rectalExamResult",
  "capsularInvolvement",
  "antigenValue",
  "tumorVolume",
  "gleasonScore"
)

# Store the two coded categorical columns as factors so model fitting treats
# them as categories rather than as numbers.
prostate_cancer[["race"]] <- as.factor(prostate_cancer[["race"]])
prostate_cancer[["rectalExamResult"]] <-
  as.factor(prostate_cancer[["rectalExamResult"]])

# Put the columns on the search path so they can be referenced by bare name.
attach(prostate_cancer)
```
### Plots of various predictors, colored by whether the tumor is cancerous or not
```{r}
# Scatter plots of predictor pairs, one colour per outcome class.
# `tumor` is used arithmetically elsewhere in this document, so it is
# presumably numeric; mapping it directly to colour would give a continuous
# gradient. factor() forces a discrete colour per class (and is harmless if
# the column is already a factor). labs() keeps the legend title as "tumor".
ggplot(data = prostate_cancer,
       aes(x = antigenValue, y = tumorVolume, colour = factor(tumor))) +
  geom_point() +
  labs(colour = "tumor")
ggplot(data = prostate_cancer,
       aes(x = age, y = tumorVolume, colour = factor(tumor))) +
  geom_point() +
  labs(colour = "tumor")
ggplot(data = prostate_cancer,
       aes(x = antigenValue, y = gleasonScore, colour = factor(tumor))) +
  geom_point() +
  labs(colour = "tumor")
ggplot(data = prostate_cancer,
       aes(x = age, y = gleasonScore, colour = factor(tumor))) +
  geom_point() +
  labs(colour = "tumor")
ggplot(data = prostate_cancer,
       aes(x = age, y = antigenValue, colour = factor(tumor))) +
  geom_point() +
  labs(colour = "tumor")
```
### Using the Validation Set Method to Calculate MSE
```{r}
# Validation-set approach: fit the logistic model on a random training
# subset of 200 rows and report the mean squared difference between the
# observed `tumor` value and the predicted probability on the remaining rows.

# Seed the RNG so the split (and therefore the reported error) is repeatable.
set.seed(1)

# Sample from the actual number of rows instead of a hard-coded count.
train <- sample(nrow(prostate_cancer), 200)

glm.fit <- glm(
  tumor ~ age + rectalExamResult + capsularInvolvement + antigenValue +
    gleasonScore,
  data = prostate_cancer,
  family = binomial,
  subset = train
)

# Score only the held-out rows; columns are referenced through the data
# frame so the result does not depend on attach().
held_out <- prostate_cancer[-train, ]
held_out_prob <- predict(glm.fit, newdata = held_out, type = "response")
mean((held_out$tumor - held_out_prob)^2)
```
### Using Leave-One-Out Cross-Validation
```{r}
library(boot)
# Fit the logistic model on all observations and show its coefficients.
glm.fit <- glm(
  tumor ~ age + rectalExamResult + capsularInvolvement + antigenValue +
    gleasonScore,
  family = binomial,
  data = prostate_cancer
)
coefficients(glm.fit)

# cv.glm() is called without K, i.e. with its default number of folds;
# `delta` holds the resulting cross-validation error estimates.
kfCV <- cv.glm(glmfit = glm.fit, data = prostate_cancer)
kfCV[["delta"]]
```
### Polynomial Logistic Regression using Leave One Out Cross-Validation
```{r}
# Cross-validation error for polynomial degrees 1 through 5 in antigenValue
# and gleasonScore (the same degree is applied to both terms).
cv.error <- numeric(5)
for (i in seq_along(cv.error)) {
  glm.fit <- glm(
    tumor ~ age + rectalExamResult + capsularInvolvement +
      poly(antigenValue, i) + poly(gleasonScore, i),
    family = binomial,
    data = prostate_cancer
  )
  # Keep only the first component of delta for each degree.
  cv.error[i] <- cv.glm(glmfit = glm.fit, data = prostate_cancer)[["delta"]][1]
}
cv.error
```
### K=5 Cross-Validation using Polynomial Logistic Regression
```{r}
# 5-fold cross-validation error for polynomial degrees 1 through 4 in age.

# The fold assignment is random, so seed the RNG to make the errors repeatable.
set.seed(1)

cv.error <- rep(0, 4)
for (i in seq_along(cv.error)) {
  glm.fit <- glm(
    tumor ~ poly(age, i) + rectalExamResult + capsularInvolvement +
      antigenValue + gleasonScore,
    data = prostate_cancer,
    family = binomial
  )
  # Keep only the first component of delta for each degree.
  cv.error[i] <- cv.glm(data = prostate_cancer, glmfit = glm.fit, K = 5)$delta[1]
}

# Display the per-degree errors; without this line the chunk computes the
# values but never shows them.
cv.error
```
### Estimating Model Parameters via Bootstrapping
```{r}
library(boot)
# Bootstrap statistic: refit the logistic model on the rows of `data`
# selected by `index` and return the fitted coefficients.
#
# Arguments:
#   data  - data frame containing the columns tumor, age,
#           capsularInvolvement, antigenValue and gleasonScore.
#   index - row indices defining the resample (repeats allowed).
# Returns:
#   Named numeric vector of model coefficients.
#
# The model is fitted on the `data` argument rather than on the global
# `prostate_cancer`, so the function honours whatever data set it is given.
boot.fn <- function(data, index) {
  coefficients(
    glm(
      tumor ~ age + capsularInvolvement + antigenValue + gleasonScore,
      data = data,
      family = binomial,
      subset = index
    )
  )
}
# Seed the RNG so the resamples are repeatable, then compute boot.fn on
# 1000 bootstrap resamples of the data and print the summary.
set.seed(seed = 1)
boot(data = prostate_cancer, statistic = boot.fn, R = 1000)
```