-
Notifications
You must be signed in to change notification settings - Fork 0
/
Tidytuesday_Wine_2019-05-28.R
124 lines (94 loc) · 5.42 KB
/
Tidytuesday_Wine_2019-05-28.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Packages used throughout this script. They are attached here, before the
# first pipe, so the file runs top to bottom in a fresh R session.
library(tidyverse) # readr, dplyr, ggplot2, forcats, tibble and the %>% pipe
library(ggthemes)  # theme_economist(), theme_excel()

# Download the TidyTuesday 2019-05-28 wine review data.
# readr auto-names an unnamed leading CSV column `X1` (readr < 2.0) or `...1`
# (readr >= 2.0); any_of() drops whichever of the two is present.
wine_ratings <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-28/winemag-data-130k-v2.csv"
) %>%
  select(-any_of(c("X1", "...1")))

# Tibble copy of the data; the rest of the script works on `wine_ratings`.
wine_ratings_tbl <- as_tibble(wine_ratings)

# Quick look at column names, types and first values.
glimpse(wine_ratings)
# Countries with more than 50 reviews. count() also tallies rows whose
# country is missing, so NA can be part of this set.
country1 <- wine_ratings %>%
  count(country) %>%
  filter(n > 50) %>%
  select(country)

# Extract the country column as a plain vector. Use pull() rather than
# `[[1, ]]`: tibbles reject `[[` with a row index and an empty column index.
countryvect <- pull(country1, country)

# Mean score for each retained country.
wine_ratings1 <- wine_ratings %>%
  group_by(country) %>%
  filter(country %in% countryvect) %>%
  summarise(avg_point = mean(points))

# Drop rows containing NA (e.g. the missing-country group, if present).
wine_ratings2 <- na.omit(wine_ratings1)
## Countries by their average score
# Dot plot of the mean score per country, countries ordered by that mean,
# with reference lines at 86, 88 and 90 points.
ggplot(
  data = wine_ratings2,
  mapping = aes(x = reorder(country, avg_point), y = avg_point)
) +
  geom_point() +
  geom_hline(yintercept = c(86, 88, 90)) +
  scale_x_discrete(name = "Country") +
  scale_y_continuous(name = "Mean of points", expand = c(0, 0.3)) +
  labs(title = "Average score per country") +
  coord_flip() +
  theme_economist()
## Countries by points: box plot
# Box plots over a random sample of 30 reviews per country, with the sampled
# reviews drawn on top as jittered blue points.
ggplot(
  data = wine_ratings %>%
    filter(country %in% countryvect) %>%
    group_by(country) %>%
    sample_n(size = 30),
  mapping = aes(x = reorder(country, points), y = points)
) +
  geom_boxplot(alpha = 0.2) +
  geom_point(colour = "blue", alpha = 0.3, position = position_jitter()) +
  theme(axis.text.x = element_text(angle = 90))
## Countries by points: error bars
# Mean score per country with error bars, computed on a random sample of
# 30 reviews per country.
wine_ratings %>%
  group_by(country) %>%
  filter(country %in% countryvect) %>%
  sample_n(size = 30) %>%
  ggplot(aes(reorder(country, points), points)) +
  # mean_sdl() with mult = 1: bars span one standard deviation either side
  # of the mean.
  stat_summary(
    fun.data = mean_sdl,
    fun.args = list(mult = 1),
    geom = "errorbar"
  ) +
  # `fun` replaces the `fun.y` argument, deprecated since ggplot2 3.3.0.
  stat_summary(fun = mean, geom = "point") +
  theme_economist() +
  theme(axis.text.x = element_text(angle = 90))
## Price by points, plus a price band built with case_when()

# Highest and lowest listed price, ignoring missing prices.
max(wine_ratings[["price"]], na.rm = TRUE)
min(wine_ratings[["price"]], na.rm = TRUE)

# Keep priced wines only and label each one with a price band. The
# conditions are tested in order, so each wine gets the first band it fits.
wine_price <- mutate(
  filter(wine_ratings, !is.na(price)),
  economic_rate = case_when(
    price <= 25 ~ "normal",
    price <= 100 ~ "high",
    price <= 1000 ~ "very high",
    price <= 3300 ~ "no limit"
  )
)

# Same table restricted to rows with no missing value in any column.
wine_price_omit <- na.omit(wine_price)

# Price against score with a straight-line fit and no confidence band.
ggplot(wine_price, aes(x = points, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
# The 20 provinces with the most priced reviews (count column dropped).
top_producer <- wine_price %>%
  count(province) %>%
  arrange(desc(n)) %>%
  head(20) %>%
  select(province)

# Extract the province names as a plain vector. Use pull() rather than
# `[[1, ]]`: tibbles reject `[[` with a row index and an empty column index.
province_top_p <- pull(top_producer, province)

# Price distribution per top province, on a log10 price axis.
wine_ratings %>%
  filter(province %in% province_top_p) %>%
  ggplot(aes(province, price)) +
  geom_boxplot() +
  scale_y_log10() +
  theme(axis.text.x = element_text(angle = 90))
library(ggridges)

# Ridgeline density of price for each top province, with one "|" tick per
# review nudged below its ridge.
wine_ratings %>%
  filter(province %in% province_top_p) %>%
  ggplot(aes(price, province)) +
  geom_density_ridges(alpha = 0.5, fill = "red") +
  geom_point(alpha = 0.2, shape = "|", position = position_nudge(y = -0.5)) +
  # xlim() has to be chained with `+`; written as a separate statement it is
  # never applied to the plot. Restricts the price axis to 0-100.
  xlim(0, 100)
# Score comparison for France, Italy, the US and Germany.

# Overall mean score across the four countries (a single number).
avg_country <- wine_price %>%
  filter(country %in% c("France", "Italy", "US", "Germany")) %>%
  pull(points) %>%
  mean()

# Mean score for each of the four countries.
mean_country <- wine_price %>%
  filter(country %in% c("France", "Italy", "US", "Germany")) %>%
  group_by(country) %>%
  summarise(mean = mean(points))

# Jittered scores for a random sample of 10000 reviews, a grey line at the
# overall mean, and one large point per country at that country's mean.
ggplot(
  data = wine_price %>%
    filter(country %in% c("France", "Italy", "US", "Germany")) %>%
    sample_n(10000),
  mapping = aes(x = reorder(country, points), y = points, colour = country)
) +
  geom_point(alpha = 0.05, position = position_jitter()) +
  geom_hline(yintercept = avg_country, colour = "grey70", size = 1) +
  geom_point(
    data = mean_country,
    mapping = aes(x = country, y = mean, colour = country),
    size = 5
  ) +
  coord_flip() +
  annotate(
    "text",
    x = "Italy",
    y = 97.5,
    label = "eazeaeda",
    size = 5,
    colour = "grey70"
  )
## Wine scores per taster, for named tasters with more than 515 reviews
taster1 <- wine_ratings %>%
  count(taster_name) %>%
  filter(!is.na(taster_name), n > 515)

# First column of the count table: the taster names.
taster_final <- taster1[[1]]

# Mean score over all reviews by those tasters (printed for reference).
wine_ratings %>%
  filter(taster_name %in% taster_final) %>%
  summarise(avg_point = mean(points))

# Score densities per taster, filled by country.
wine_ratings %>%
  filter(
    taster_name %in% taster_final,
    # "US" is the spelling used by the other country filters in this script;
    # the former "Us" matched no rows and silently dropped US wines.
    country %in% c("France", "Germany", "US", "Italy")
  ) %>%
  ggplot(aes(points, taster_name, fill = country)) +
  geom_density_ridges(alpha = 0.7, color = NA) +
  # Each component must be chained with `+`; without it the reference line
  # and theme below are evaluated on their own and never reach the plot.
  scale_y_discrete("Name of the best tasters", expand = c(0, 0)) +
  # NOTE(review): 88.6 is presumably the rounded mean printed above - confirm.
  geom_vline(xintercept = 88.6) +
  theme_excel()
## Text analysis of the wine review descriptions
library(tidytext)
library(tidyverse)

#### Add some stop words
# Extend tidytext's `stop_words` with domain words that carry no signal here.
# This is built before the anti_join() below, which needs `stop_words2`.
custom_words <- tribble(
  ~word,     ~lexicon,
  "wine",    "CUSTOM",
  "flavors", "CUSTOM"
)
stop_words2 <- stop_words %>% bind_rows(custom_words)

# One row per word of each description.
tidy_wine_description <- wine_ratings %>%
  unnest_tokens(word, description)

# Remove stop words; `word` is the join key, stated explicitly.
tidy_wine_description2 <- tidy_wine_description %>%
  anti_join(stop_words2, by = "word")

# Most frequent words in US reviews scoring above 95 points. top_n() keeps
# ties, so more than 15 rows are possible.
top15_words <- tidy_wine_description2 %>%
  filter(points > 95, country == "US") %>%
  count(word) %>%
  arrange(desc(n)) %>%
  top_n(15, n) %>%
  mutate(word2 = fct_reorder(word, n)) # factor ordered by count for plotting

# Horizontal bar chart of those word counts.
top15_words %>%
  ggplot(aes(word2, n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Review Word Count, + 15")
# Most frequent words in US reviews scoring below 85 points, with `word2`
# holding the words as a factor ordered by count.
min15_words <- tidy_wine_description2 %>%
  filter(country == "US", points < 85) %>%
  count(word) %>%
  arrange(desc(n)) %>%
  top_n(15, n) %>%
  mutate(word2 = fct_reorder(word, n))

# Horizontal bar chart of those word counts.
ggplot(min15_words, aes(x = word2, y = n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Review Word Count, -15")
library(wordcloud)

# Word cloud of the low-score vocabulary. `freq` must be the numeric count
# column `n`; the whole tibble is not a valid frequency vector. `word` is the
# character version of the `word2` factor.
wordcloud(words = min15_words$word, freq = min15_words$n, max.words = 15)