-
Notifications
You must be signed in to change notification settings - Fork 2
/
dplyr_part1.R
199 lines (132 loc) · 5.35 KB
/
dplyr_part1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
## if font is small, please let me know
## to change font size please go to Tools > Global Options > Appearance >font size
## if this is your first time here please install tidyverse package
install.packages("tidyverse")
## load tidyverse
library(tidyverse)
## today will work with the "diamonds" dataset already part of R
## Recap from last workshop
## functions to look at dataset (gives you an idea what the dataset looks like)
View(diamonds)
head(diamonds)
tail(diamonds)
str(diamonds)
## Recap from ggplot workshop
ggplot (data = diamonds, mapping = aes(x=cut, fill= cut)) +
geom_bar()
## lets say we only want to look or work with only Ideal cut
## we need to subset the data to only look at cut == Ideal
## we use the function filter
diamonds %>%
filter(cut=='Ideal') ##remember to use == for equality and put string in ''
## if you want to create a new dataset with this subset you simply assign it a name
IdealDiamonds <- diamonds %>%
filter (cut=='Ideal')
## you can also filter by the values of one or more columns:
## give me only diamonds that have color =='E' and price > 350
## 2 ways to do this:
diamonds %>%
filter(color=='E' , price>350)
diamonds %>%
filter(color=='E' & price>350) ## I like this one better
## give me only diamonds that have color =='E' or price > 350
diamonds %>%
filter(color=='E' | price>350) ## remember the "or" syntax |
## give everything but dont include when cut == 'Fair'
diamonds %>%
filter(cut!='Fair') ## remember not equal !=
######################################################################3
## summarize:
## You can use all kind of functions to summarize the data:
## There are many other summary statistics you could consider such:
## sd(), min(), max(), median(), sum(),
## n() (returns the length of vector), first() (returns first value in vector),
## last() (returns last value in vector) and n_distinct() (number of distinct values in vector).
#calculating the mean of the column price
diamonds %>%
summarize(mean(price))
## but what if you want the mean price for each cut group
#Group by one column and then summarizing with those groups
diamonds %>%
group_by(cut) %>%
summarize(mean(price))
## notice the result in the console
## it gave you a tibble with the summary and it created a column called `mean(price)`
## you can name that column if you want to, call it mean_price
diamonds %>%
group_by(cut) %>%
summarize(mean_price=mean(price))
## so summarize is usually paired with group_by to summarize individual groups
## lets count how many were in each group (call it total)
diamonds %>%
group_by(cut) %>% %>%
summarize(mean_price=mean(price),total=n())
## we used the n() to count the total number in each group of cut (how many diamonds in each cut)
######################################################################
## mutate:
## lets say we want to calculate the price per caret and you want to store
## this result in a new column/variable
diamonds2 <- diamonds %>%
mutate(price_per_carat = price / carat)
## lets see what is looks like
str(diamonds2)
##notice we now have 11 variables
########################################################################
## arrange: reorder the rows by one or more columns
#reorder by the column "cut"
diamonds %>%
arrange(cut)
## reorder by the column cut and price.
## To order in descending way, put the column in desc()
diamonds %>%
arrange(cut,desc(price)) ## so first it arranges by cut and then by price in descending order
## Lets chain/pipe functions
## Select all diamonds with a fair cut and then calucate the mean price (name it mean_price)
## depending on the clarity
diamonds %>%
filter(cut=='Fair') %>%
group_by(clarity) %>%
summarise(mean_price=mean(price))
##############################################################################33
### Challenge
## Take 5 minutes to do this on your own:
## Select all Diamnons with the color 'H' and
## calculate the the number of diamonds, depending on the clarity???
## name the number of diamonds : total
## Answer
diamonds %>%
filter(color=='H') %>%
group_by(clarity) %>%
summarize(total = n())
## take it a step further:
## - create a new variable for color_H
## - take out the `summarize(total = n())` so that we can create a histogram
color_H <-diamonds %>%
filter(color=='H') %>%
group_by(clarity)
## - create a plot showing this information --- but I don't have all the info to
## determine the value of the y-axis, I think it has something to do with geom_bar()
ggplot (data=color_H, mapping= aes(x=clarity, fill=clarity)) +
geom_bar()
########################################################################33
## Challenge 2
## Take 5 minutes to do this on your own:
## Select all diamonds with a number of carats bigger than 3.
## Calculate the mean, median and sum of price and the number of diamonds depending on the cut.
## give your summary names:
## mean_price
## median_price
## sum_price
## total
## then order the data depending on the number of diamonds.
## Answer
diamonds %>%
filter(carat>3) %>%
group_by(cut) %>%
summarize(mean_price = mean(price),
median_price = median(price),
sum_price = sum(price),
total=n()) %>%
arrange(total)
## next workshop will revisit these functions again and add some more
##THANK YOU VANESSA