-
Notifications
You must be signed in to change notification settings - Fork 1
/
Starter Code Reproduce Nat Geo Figure 4.R
46 lines (36 loc) · 1.83 KB
/
Starter Code Reproduce Nat Geo Figure 4.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#Set up packages
library(pyrfume)
library(tidyverse)
#The environment has a downsampled version of the data. You can load using the following code:
NGS_Small <- read.csv("data/NGS_sample01.csv")
#Look at the data
head(NGS_Small)
summary(NGS_Small)
#This is the standard function for loading directly from Pyrfume. Here we load the data dictionary.
#This describes each variable and what the numerical codes mean.
NGS_DataDictionary <- load_data("nat_geo_1986/DataDictionary.csv")
head(NGS_DataDictionary[,1:10])
#we are removing missing rows for the following data
df <- NGS_Small %>%
filter(SEX != "0" & totcorr !=7 & SMOKE != "0") %>%
filter(ETHNIC !="0" & ETHNIC != "7") %>%
filter(decage !="0") %>%
#Now we are averaging by sex and age
group_by(SEX,decage) %>%
summarize(totcorr = mean(totcorr))
#Our goal is to make a plot with the number of correctly identified odors on the
#y-axis, age (in decades) on the x-axis, and color our points by sex
#You can click the small arrow to hide/show the code below, so you can try on
#your own before seeing our solution.
#We will be using ggplot instead of base graphics to make plots.
#A tutorial is here: http://r-statistics.co/Complete-Ggplot2-Tutorial-Part1-With-R-Code.html
#Recreate the NGS plot #####
#Now we will recreate the plot. You can see a description of all of the variables at https://github.com/pyrfume/pyrfume-data/blob/main/nat_geo_1986/Data%20dictionary.xlsx
ggplot(df, aes(x = decage, y = totcorr, color=as.factor(SEX)))+
geom_point()
#Clean up the labels #####
ggplot(df, aes(x = decage, y = totcorr, color=as.factor(SEX)))+
geom_point()+
labs(y="Number Correct")+
scale_color_manual(name="Sex",labels=c("Male","Female"),values=c("blue","red"))+
scale_x_continuous(name="Age in Decades",breaks=c(1:9),labels=c("Teens","20's","30's","40's","50's","60's","70's","80's","90's"))