forked from STAT547-UBC-2019-20/group_01_dlin_njamshidi
-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_data.R
74 lines (63 loc) · 2.28 KB
/
process_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Author: Nima Jamshidi
# date: 2020-03-05
# Description of the script and the command-line arguments
# Help text handed to docopt() further down: the first line describes the
# script, the "Usage:" line declares the two options --file_path and
# --filename that the script is invoked with.
doc <- "This script wrangles the data and creates a new data file including the dummy variables for the categorical variables.
Usage: process_data.R --file_path=<path_to_raw_data_file> --filename=<output_file_name>"
# Load in the necessary packages
suppressMessages(library(tidyverse))
suppressMessages(library(docopt))
suppressMessages(library(psych))
suppressMessages(library(hablar))
suppressMessages(library(glue))
suppressMessages(library(here))
suppressMessages(library(stringr))
# Parse the command-line arguments against the usage text in `doc`
opt <- docopt::docopt(doc)
# Main function
#' Wrangle the raw data and write it to the data/processed directory.
#'
#' Reads the CSV at `path`, adds integer dummy columns for `sex`, `smoker`
#' and every level of `region`, moves `charges` after the dummy columns, adds
#' a factor column `age_range`, and writes the result as both a CSV and an RDS
#' file under `here("data", "processed")`.
#'
#' @param path Path to the raw CSV file. Expected to contain the columns
#'   `age`, `sex`, `bmi`, `children`, `smoker`, `region` and `charges`.
#' @param name File name of the CSV output (e.g. "processed_data.csv"). The
#'   RDS output uses the same name with a trailing ".csv" replaced by ".rds".
#' @return Called for its side effects: two files are written and a success
#'   message is printed. Stops with an error if `path` does not exist.
main <- function(path, name) {
  # check that the command-line argument given file exists
  if (!file.exists(path)) {
    stop(glue("The file {path} does not exist!"))
  }

  # Read in the file, and read each column into a certain type
  raw <- read_csv(
    path,
    col_types = cols(
      age = col_integer(),
      sex = readr::col_factor(),
      bmi = col_double(),
      children = col_integer(),
      smoker = readr::col_factor(),
      region = readr::col_factor(),
      charges = col_double()
    )
  )

  # One 0/1 indicator column per region level. Every column is converted to
  # integer, so this works for any number of regions rather than exactly four.
  region_dummies <- as_tibble(psych::dummy.code(raw$region))
  region_dummies[] <- lapply(region_dummies, as.integer)

  # Outer bounds used in the first and last age-range labels. na.rm = TRUE
  # keeps a single missing age from turning the labels into "NA-20"/"60-NA".
  youngest <- min(raw$age, na.rm = TRUE)
  oldest <- max(raw$age, na.rm = TRUE)

  # wrangle the data to include dummy variables for factors
  wrangled <- raw %>%
    mutate(
      # as.integer() on a factor gives the 1-based level index.
      # NOTE(review): the levels of `sex` are not fixed in col_factor(), so
      # which value maps to 1 presumably follows the order of appearance in
      # the file -- confirm downstream code expects that coding.
      sex_dummy = as.integer(sex),
      # "no" is moved to the first level, so it is coded as 1.
      smoker_dummy = as.integer(fct_relevel(smoker, "no"))
    ) %>%
    cbind(region_dummies) %>%
    # drop and re-append `charges` so it sits after the dummy columns
    select(-charges) %>%
    cbind(charges = raw$charges) %>%
    mutate(
      # Labels are plain character (paste0) so that all branches of
      # case_when() share one type before the conversion to factor.
      age_range = as.factor(case_when(
        age < 20 ~ paste0(youngest, "-20"),
        age >= 20 & age < 30 ~ "20-30",
        age >= 30 & age < 40 ~ "30-40",
        age >= 40 & age < 50 ~ "40-50",
        age >= 50 & age < 60 ~ "50-60",
        age >= 60 ~ paste0("60-", oldest)
      ))
    )

  # write the csv file out to specified file name to data/processed directory
  csv_path <- here("data", "processed", name)
  # Create the output directory if needed so the writes below cannot fail on
  # a fresh checkout; this is a no-op when it already exists.
  dir.create(dirname(csv_path), recursive = TRUE, showWarnings = FALSE)
  write_csv(wrangled, csv_path)

  # The pattern is a regular expression: escape the dot and anchor at the end
  # so only a trailing ".csv" extension is stripped. An unescaped, unanchored
  # ".csv" would also remove e.g. "ycsv" from the middle of "mycsv_data.csv".
  name_noext <- str_remove(name, "\\.csv$")
  saveRDS(wrangled, file = here("data", "processed", paste0(name_noext, ".rds")))

  # print out success message
  print(
    glue(
      "The data has been successfully wrangled and written to {csv_path}."
    )
  )
}
# Run the script: hand the parsed --file_path and --filename values to main()
main(path = opt[["file_path"]], name = opt[["filename"]])