-
Notifications
You must be signed in to change notification settings - Fork 0
/
US_schools_scorecards.R
155 lines (123 loc) · 7.62 KB
/
US_schools_scorecards.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#US college scorecards
#reading Data
#installing packages
#install.packages("ggplot2")
library(ggplot2)
# Point the session at the homework folder and load the 2013 College
# Scorecard data set.
# NOTE(review): the absolute path is machine-specific; the script only runs
# as-is on a machine where this directory exists.
setwd("C:/Users/ARIN/Google Drive/DaViS/Quarters/Fall17/STA 141A/HMW/hmw1")
scores <- readRDS(file = "college_scorecard_2013.rds")

# Size of the data set: number of observations (rows) and features (columns).
dim(scores)

# Number of rows flagged as a main campus, used here as the count of
# distinct colleges. Rows with a missing flag are not counted.
sum(scores$main_campus == "TRUE", na.rm = TRUE)
# Convert the columns below from their stored type to factors.
factor_columns <- c("unit_id", "ope_id", "city", "zip", "branches")
scores[factor_columns] <- lapply(scores[factor_columns], factor)

# Count how many columns are factors. Every column of the data frame is
# inspected (instead of a hard-coded 1:51 range) so the count stays correct
# if the number of columns changes; vapply() guarantees a logical result.
sum(vapply(scores, is.factor, logical(1)))

# Inspect the type and a preview of every feature.
str(scores)
# Total number of missing cells across the whole data frame.
missingObservations <- sum(is.na(scores))

# Name of the feature with the most missing values (the first such feature
# if several are tied).
colnames(scores)[which.max(colSums(is.na(scores)))]
# Ratio of private and public schools ----

# Derive a two-category ownership variable (Public / Private) by renaming the
# second and third levels of `ownership` to "Private", which merges them.
# NOTE(review): assumes levels 2 and 3 of `ownership` are the private
# categories -- confirm with levels(scores$ownership).
scores$ownership2 <- scores$ownership
levels(scores$ownership2)[c(2, 3)] <- c("Private", "Private")

# Number of schools in each ownership category.
table(scores$ownership2)

# Cross-tabulate ownership against highest degree issued and express every
# cell as a share of all schools.
degrees <- table(scores$ownership2, scores$highest_degree)
degreeProportions <- prop.table(degrees)

# One colour per ownership category, shared by the bars and the legend so
# the legend swatches always match the plotted bars.
ownership_colors <- c("skyblue2", "thistle4")

# xpd = NA lets the legend be drawn outside the plot region without clipping.
par(xpd = NA)
barplot(
  degreeProportions,
  main = "Proportions of Highest Degree Issued",
  ylab = "Frequency",
  xlab = "Type of Degree",
  col = ownership_colors
)
legend("topleft", legend = levels(scores$ownership2), fill = ownership_colors)
# Undergraduate population ----

# Distribution of the undergraduate population; quantile() is shown next to
# summary() for comparison.
summary(scores$undergrad_pop)
quantile(scores$undergrad_pop, na.rm = TRUE)

# Deciles of the undergraduate population. The name must match the one used
# by axis() and abline() below (the original assignment was misspelled, so
# the plotting code failed with "object not found").
undergrad_pop_deciles <- quantile(
  scores$undergrad_pop,
  probs = seq(0, 1, 0.1),
  na.rm = TRUE
)

# School with the highest number of undergraduate students.
scores[which.max(scores$undergrad_pop), "name"]

# Horizontal boxplot of the undergraduate population, restricted to
# 0-10,000 students on the value axis.
par(oma = c(2, 2, 2, 2), mar = c(2, 2, 6, 2))
b <- boxplot(
  scores$undergrad_pop,
  horizontal = TRUE,
  lwd = 2,
  col = "skyblue",
  ylim = c(0, 10000),
  main = "Undergraduate Population",
  xlab = "Number of Students"
)

# Label the top axis with the deciles and the boxplot's five summary
# statistics, and mark each decile with a dashed vertical line.
axis(
  side = 3,
  at = c(undergrad_pop_deciles, b$stats),
  labels = c(undergrad_pop_deciles, b$stats)
)
abline(v = undergrad_pop_deciles, col = "thistle4", lwd = 3, lty = 2)
# Tuition in the five most populous states ----
# Per Wikipedia
# (https://simple.wikipedia.org/wiki/List_of_U.S._states_by_population)
# these are California, Texas, Florida, New York and Illinois.

# Keep only the schools located in those five states, then drop the factor
# levels that no longer occur so the other states do not appear in the plot.
MostPopulatedStates <- droplevels(
  scores[scores$state %in% c("CA", "TX", "FL", "NY", "IL"), ]
)

# Tuition by state, with the tuition axis limited to 0-50,000.
boxplot(
  tuition ~ state,
  data = MostPopulatedStates,
  col = "grey",
  ylim = c(0, 50000),
  main = "Tuition for Five Most Populous States",
  xlab = "State",
  ylab = "Tuition",
  col.main = "thistle3",
  col.lab = "orange4",
  col.axis = "skyblue4"
)
# Spending per student vs. 10-year earnings ----

# Base plot: spending per student against average salary ten years after
# starting school, coloured by ownership. Missing values are dropped quietly
# by the layers below (na.rm = TRUE); ggplot() itself ignores na.rm, so it is
# not passed there.
p2 <- ggplot(
  scores,
  aes(x = spend_per_student, y = avg_10yr_salary, col = ownership)
)

# One panel per ownership type, each with its points and a dashed linear fit.
# The facet refers to the `ownership` column of the plot data rather than to
# scores$ownership, so it always stays aligned with the plotted rows.
p2 +
  geom_point(pch = 5, size = 1, alpha = .6, na.rm = TRUE) +
  geom_smooth(method = "lm", se = FALSE, linetype = 2, lwd = 2, na.rm = TRUE) +
  facet_grid(ownership ~ .) +
  # Show axis tick labels in comma format instead of scientific notation.
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(labels = scales::comma) +
  labs(
    title = "Spending Per Student vs Average Salary in 10 Years from Starting School",
    x = "Spending per Student",
    y = "Avg. Salary in 10 Years"
  )
# Earnings vs. cost of attendance ----

# Earnings per cost: average 10-year salary divided by the cost of
# attendance, rounded to two decimals.
scores$earnings_per_cost <- with(
  scores,
  round(avg_10yr_salary / cost, digits = 2)
)
summary(scores$earnings_per_cost)

# Row order from the highest to the lowest earnings per cost (missing values
# are placed last), then the first ten rows of that ordering.
earnings <- sort.list(scores$earnings_per_cost, decreasing = TRUE)
top_ten_earnings <- scores[earnings[seq_len(10)], ]

# The ten schools with the highest earnings per cost.
top_ten_earnings[, c("name", "state", "earnings_per_cost")]
# Diversity among students ----

# Race-percentage variables, selected by column position.
# NOTE(review): positions 44-49 and 51 are hard-coded -- confirm they are
# still the race-percentage columns if the data set changes.
races <- scores[, c(44:49, 51)]

# Per-school variance across the race-percentage variables, rounded to four
# decimals. A lower variance is read as a more even racial mix.
scores$race_variance <- round(apply(races, MARGIN = 1, FUN = var), digits = 4)

# Row order from the lowest to the highest variance.
diversity <- sort.list(scores$race_variance)

# Take positions 6-15 of that ordering. Per the original author's note, the
# first five schools are skipped because their variance is 0 (their race
# percentages are all zero or missing).
# NOTE(review): the offset of five is hard-coded -- re-check it against the
# data before reusing.
top_ten_diversity <- scores[diversity[6:15], ]

# The ten schools reported as most racially diverse.
top_ten_diversity[, c("name", "state", "race_variance")]
# UC Davis vs. other schools ----
# Compares UC Davis with all schools on three variables: average 10-year
# salary, cost of attendance (without financial aid) and spending per student.

# Draw one boxplot of `values` and mark `davis_value` on it with a dashed red
# horizontal line, a tick on the right-hand axis and a legend entry.
#   values      numeric vector with the variable for all schools
#   davis_value the same variable for UC Davis
#   ...         further arguments passed on to boxplot()
# Returns the boxplot statistics invisibly, as boxplot() does.
plot_vs_uc_davis <- function(values, davis_value, ...) {
  box_stats <- boxplot(values, ...)
  abline(h = davis_value, lwd = 2, col = "red", lty = 3)
  axis(side = 4, at = davis_value, labels = davis_value)
  legend("top", legend = "UC Davis", col = "red", lty = 3, lwd = 2)
  invisible(box_stats)
}

# Look up the UC Davis row once instead of repeating the search per value.
uc_davis <- scores[which(scores$name == "University of California-Davis"), ]

# Three panels side by side, with room at the top for a shared title.
par(mfrow = c(1, 3), oma = c(0, 0, 2, 0))

# Average 10-year salary.
d1 <- plot_vs_uc_davis(
  scores$avg_10yr_salary,
  uc_davis[["avg_10yr_salary"]],
  col = "skyblue",
  xlab = "Average 10 Year Salary",
  ylim = c(0, 80000)
)

# Cost of attendance.
d2 <- plot_vs_uc_davis(
  scores$cost,
  uc_davis[["cost"]],
  col = "orange3",
  xlab = "Cost of Attendance"
)

# Spending per student.
d3 <- plot_vs_uc_davis(
  scores$spend_per_student,
  uc_davis[["spend_per_student"]],
  col = "thistle2",
  xlab = "Spending per Student",
  ylim = c(0, 25000)
)

title(main = "UC Davis vs Other Schools", outer = TRUE, cex.main = 3)

# Optional check of where UC Davis sits among the spending-per-student
# deciles (the original author notes it falls within the highest 10%):
#quantile(scores$spend_per_student, probs = seq(0,1, 0.1), na.rm = 1)