.Rhistory

library(data.table)
library(dplyr)
library(stringi)
library(mirt)
library(foreign)
library(psych)
# Read the extracted records from the working directory into a data.table.
dt <- fread("extracted_data.csv")
library(data.table)
library(dplyr)
library(stringi)
library(mirt)
library(foreign)
library(psych)
# Load the extracted records (each row is called a "Guan" in this script) and
# derive the record-level features used further down.
dt <- fread("extracted_data.csv")
head(dt, 3)

# A record counts as a success when its result flag is at most 2.
dt[, success := result_flag <= 2]
dt <- rename(dt, same_p_optimal = p_optimal)

# Ratio features; every one is computed from columns read from the file.
dt[, `:=`(
  p_dup = duplicates / optstep,
  f_m_ratio = first_move_time / rest_mean,
  fp_m_ratio = first_push_time / rest_mean,
  f_t_ratio = first_move_time / guan_time,
  fp_t_ratio = first_push_time / guan_time,
  p_higher = num_higher_than_sd / steps
)]

# Largest box_completed within each `num` group, used as the reference for
# the completion rate and for the shortfall in boxes.
dt[, maxbox := max(box_completed, na.rm = TRUE), by = num]
dt[, `:=`(
  comp_rate = box_completed / maxbox,
  dbox = maxbox - box_completed
)]
# To drop several columns at once, concatenate their names with c().
dt[, c("maxbox", "optstep") := NULL]

# Flag successes whose guan_time is at or below the 40% quantile of guan_time
# within the same `num` group.
dt[, time_window := quantile(guan_time, 0.4), by = num]
dt[, suc_within := success & guan_time <= time_window]
dt[, time_window := NULL] # assigning NULL removes the helper column
# Columns that are averaged per student.
mean_var <- c(
  "box_completed", "dsteps",
  "first_move_time", "first_push_time",
  "guan_time", "same_p_optimal", "rest_mean", "rest_sd",
  "p_dup", "f_m_ratio", "fp_m_ratio", "f_t_ratio",
  "fp_t_ratio", "p_higher", "comp_rate", "dbox"
)
# Columns whose per-student means are additionally log-transformed.
lntrans <- c(
  "first_move_time", "first_push_time", "guan_time",
  "f_m_ratio", "fp_m_ratio", "f_t_ratio", "fp_t_ratio"
)

# Average `mean_var` per student (`id`) over the rows of `dt` whose `success`
# equals `succeeded`, then add the log of every `lntrans` column under the
# same name prefixed with "ln". .SD is data.table's "subset of data" and is
# restricted to the listed columns through .SDcols.
summarise_guans <- function(succeeded) {
  res <- dt[success == succeeded, lapply(.SD, mean, na.rm = TRUE),
    by = id, .SDcols = mean_var
  ]
  res[, (paste0("ln", lntrans)) := lapply(.SD, log), .SDcols = lntrans]
  res
}

# Means over the successful Guans of each student.
# NOTE: an earlier comment said comp_rate, dbox and box_completed are dropped
# here because they are constant in successful Guans; no statement removes
# them, so they are still present in sucdt.
sucdt <- summarise_guans(TRUE)
# Means over the failed Guans of each student.
faildt <- summarise_guans(FALSE)
# One row per student: columns coming from sucdt and faildt share names, so
# merge() disambiguates them with its default ".x" / ".y" suffixes.
ndt <- merge(sucdt, faildt, by = "id", all.x = TRUE, all.y = TRUE)

# Fit a one-factor mirt model of type `item_type` on the id-by-num table of
# `value_col` and return the factor scores (named `score_name`) next to the
# student id they belong to. Carrying the id along lets the scores be joined
# by key instead of being attached by row position, which would silently
# misalign if ndt and the cast table ever differed in ids or row order.
irt_scores <- function(value_col, item_type, score_name) {
  wide <- dcast(dt, id ~ num, value.var = value_col)
  fit <- mirt(select(wide, -id), 1, itemtype = item_type, verbose = FALSE)
  scores <- data.table(id = wide$id, theta = fscores(fit)[, "F1"])
  setnames(scores, "theta", score_name)
  scores
}

# Graded-response scores from the number of boxes completed per `num`.
ndt <- merge(ndt, irt_scores("box_completed", "graded", "suc_theta"),
  by = "id", all = TRUE
)

# Rasch scores from the 0/1 "success within the time window" indicator.
# dt$suc_within is converted to numeric in place, as before.
dt[, suc_within := as.numeric(suc_within)]
ndt <- merge(ndt, irt_scores("suc_within", "Rasch", "suc_window"),
  by = "id", all = TRUE
)
# Outcome measures. read.spss() comes from the foreign package; only the ID,
# the math grades (columns whose name contains "TM") and Raven are kept.
yvar <- data.table(
  select(
    read.spss("Grade 1+2-others-0313.sav", to.data.frame = TRUE),
    ID, contains("TM"), Raven
  )
)

# Percentile of the Raven score: rank divided by the number of non-missing
# scores. Rows with a missing Raven score get NA.
yvar[, Rperc := rank(Raven, na.last = "keep") / sum(!is.na(Raven))]

# Exploratory factor analysis (fa() from the psych package) on the "TM"
# columns; its scores are appended to yvar as column MR1.
fares <- fa(select(yvar, contains("TM")))
yvar <- cbind(yvar, fares$scores)

# Percentile of MR1, the latent variable estimated by the factor analysis;
# rows with a missing MR1 get NA.
yvar[, Mperc := rank(MR1, na.last = "keep") / sum(!is.na(MR1))]

# Full outer join of the outcomes with the per-student features.
ndt <- merge(yvar, ndt, by.x = "ID", by.y = "id", all.x = TRUE, all.y = TRUE)
# Polar groups on the math factor: rows at or below the 0.25 percentile are
# recoded to 0 and rows at or above the 0.75 percentile to 1 (mutate() is from
# dplyr). The bottom group is stacked first, then columns ID through MR1 are
# removed so that Mperc is the only outcome column left.
Mpolar <- rbind(
  mutate(ndt[Mperc <= 0.25], Mperc = 0),
  mutate(ndt[Mperc >= 0.75], Mperc = 1)
)
Mpolar <- select(Mpolar, -(ID:MR1))
Mpolar <- Mpolar[complete.cases(Mpolar), ] # keep the complete cases only
write.csv(Mpolar, "Mpolar.csv")

# Same construction on the Raven percentile, keeping Rperc as the outcome.
# NOTE(review): the lower cut-off here is 0.26, not 0.25 as for Mperc --
# confirm whether the difference is intentional.
Rpolar <- rbind(
  mutate(ndt[Rperc <= 0.26], Rperc = 0),
  mutate(ndt[Rperc >= 0.75], Rperc = 1)
)
Rpolar <- select(Rpolar, -(ID:Raven), -(MR1:Mperc))
Rpolar <- Rpolar[complete.cases(Rpolar), ]
write.csv(Rpolar, "Rpolar.csv")
# `path.sep`, `os.sep` and `os.path.sep` are Python names and are not defined
# in R; the platform's file separator is available as `.Platform$file.sep`
# (file.path() uses it when building paths).
.Platform$file.sep
library(data.table)
library(dplyr)
library(stringi)
library(mirt)
library(foreign)
library(psych)
# Load the extracted records (each row is called a "Guan" in this script) from
# the working_data folder and derive the record-level features used below.
dt <- fread(file.path("working_data", "extracted_data.csv"))
head(dt, 3)

# A record counts as a success when its result flag is at most 2.
dt[, success := result_flag <= 2]
dt <- rename(dt, same_p_optimal = p_optimal)

# Ratio features; every one is computed from columns read from the file.
dt[, `:=`(
  p_dup = duplicates / optstep,
  f_m_ratio = first_move_time / rest_mean,
  fp_m_ratio = first_push_time / rest_mean,
  f_t_ratio = first_move_time / guan_time,
  fp_t_ratio = first_push_time / guan_time,
  p_higher = num_higher_than_sd / steps
)]

# Largest box_completed within each `num` group, used as the reference for
# the completion rate and for the shortfall in boxes.
dt[, maxbox := max(box_completed, na.rm = TRUE), by = num]
dt[, `:=`(
  comp_rate = box_completed / maxbox,
  dbox = maxbox - box_completed
)]
# To drop several columns at once, concatenate their names with c().
dt[, c("maxbox", "optstep") := NULL]

# Flag successes whose guan_time is at or below the 40% quantile of guan_time
# within the same `num` group.
dt[, time_window := quantile(guan_time, 0.4), by = num]
dt[, suc_within := success & guan_time <= time_window]
dt[, time_window := NULL] # assigning NULL removes the helper column
# Columns that are averaged per student.
mean_var <- c(
  "box_completed", "dsteps",
  "first_move_time", "first_push_time",
  "guan_time", "same_p_optimal", "rest_mean", "rest_sd",
  "p_dup", "f_m_ratio", "fp_m_ratio", "f_t_ratio",
  "fp_t_ratio", "p_higher", "comp_rate", "dbox"
)
# Columns whose per-student means are additionally log-transformed.
lntrans <- c(
  "first_move_time", "first_push_time", "guan_time",
  "f_m_ratio", "fp_m_ratio", "f_t_ratio", "fp_t_ratio"
)

# Average `mean_var` per student (`id`) over the rows of `dt` whose `success`
# equals `succeeded`, then add the log of every `lntrans` column under the
# same name prefixed with "ln". .SD is data.table's "subset of data" and is
# restricted to the listed columns through .SDcols.
summarise_guans <- function(succeeded) {
  res <- dt[success == succeeded, lapply(.SD, mean, na.rm = TRUE),
    by = id, .SDcols = mean_var
  ]
  res[, (paste0("ln", lntrans)) := lapply(.SD, log), .SDcols = lntrans]
  res
}

# Means over the successful Guans of each student.
# NOTE: an earlier comment said comp_rate, dbox and box_completed are dropped
# here because they are constant in successful Guans; no statement removes
# them, so they are still present in sucdt.
sucdt <- summarise_guans(TRUE)
# Means over the failed Guans of each student.
faildt <- summarise_guans(FALSE)
# One row per student: columns coming from sucdt and faildt share names, so
# merge() disambiguates them with its default ".x" / ".y" suffixes.
ndt <- merge(sucdt, faildt, by = "id", all.x = TRUE, all.y = TRUE)

# Fit a one-factor mirt model of type `item_type` on the id-by-num table of
# `value_col` and return the factor scores (named `score_name`) next to the
# student id they belong to. Carrying the id along lets the scores be joined
# by key instead of being attached by row position, which would silently
# misalign if ndt and the cast table ever differed in ids or row order.
irt_scores <- function(value_col, item_type, score_name) {
  wide <- dcast(dt, id ~ num, value.var = value_col)
  fit <- mirt(select(wide, -id), 1, itemtype = item_type, verbose = FALSE)
  scores <- data.table(id = wide$id, theta = fscores(fit)[, "F1"])
  setnames(scores, "theta", score_name)
  scores
}

# Graded-response scores from the number of boxes completed per `num`.
ndt <- merge(ndt, irt_scores("box_completed", "graded", "suc_theta"),
  by = "id", all = TRUE
)

# Rasch scores from the 0/1 "success within the time window" indicator.
# dt$suc_within is converted to numeric in place, as before.
dt[, suc_within := as.numeric(suc_within)]
ndt <- merge(ndt, irt_scores("suc_within", "Rasch", "suc_window"),
  by = "id", all = TRUE
)
# Outcome measures. read.spss() comes from the foreign package; only the ID,
# the columns whose name contains "TM" and Raven are kept, as a data.table.
yvar <- data.table(
  select(
    read.spss("Grade 1+2-others-0313.sav", to.data.frame = TRUE),
    ID, contains("TM"), Raven
  )
)
library(data.table)
library(dplyr)
library(stringi)
library(mirt)
library(foreign)
library(psych)
# Load the extracted records (each row is called a "Guan" in this script) from
# the working_data folder and derive the record-level features used below.
dt <- fread(file.path("working_data", "extracted_data.csv"))
head(dt, 3)

# A record counts as a success when its result flag is at most 2.
dt[, success := result_flag <= 2]
dt <- rename(dt, same_p_optimal = p_optimal)

# Ratio features; every one is computed from columns read from the file.
dt[, `:=`(
  p_dup = duplicates / optstep,
  f_m_ratio = first_move_time / rest_mean,
  fp_m_ratio = first_push_time / rest_mean,
  f_t_ratio = first_move_time / guan_time,
  fp_t_ratio = first_push_time / guan_time,
  p_higher = num_higher_than_sd / steps
)]

# Largest box_completed within each `num` group, used as the reference for
# the completion rate and for the shortfall in boxes.
dt[, maxbox := max(box_completed, na.rm = TRUE), by = num]
dt[, `:=`(
  comp_rate = box_completed / maxbox,
  dbox = maxbox - box_completed
)]
# To drop several columns at once, concatenate their names with c().
dt[, c("maxbox", "optstep") := NULL]

# Flag successes whose guan_time is at or below the 40% quantile of guan_time
# within the same `num` group.
dt[, time_window := quantile(guan_time, 0.4), by = num]
dt[, suc_within := success & guan_time <= time_window]
dt[, time_window := NULL] # assigning NULL removes the helper column
# Columns that are averaged per student.
mean_var <- c(
  "box_completed", "dsteps",
  "first_move_time", "first_push_time",
  "guan_time", "same_p_optimal", "rest_mean", "rest_sd",
  "p_dup", "f_m_ratio", "fp_m_ratio", "f_t_ratio",
  "fp_t_ratio", "p_higher", "comp_rate", "dbox"
)
# Columns whose per-student means are additionally log-transformed.
lntrans <- c(
  "first_move_time", "first_push_time", "guan_time",
  "f_m_ratio", "fp_m_ratio", "f_t_ratio", "fp_t_ratio"
)

# Average `mean_var` per student (`id`) over the rows of `dt` whose `success`
# equals `succeeded`, then add the log of every `lntrans` column under the
# same name prefixed with "ln". .SD is data.table's "subset of data" and is
# restricted to the listed columns through .SDcols.
summarise_guans <- function(succeeded) {
  res <- dt[success == succeeded, lapply(.SD, mean, na.rm = TRUE),
    by = id, .SDcols = mean_var
  ]
  res[, (paste0("ln", lntrans)) := lapply(.SD, log), .SDcols = lntrans]
  res
}

# Means over the successful Guans of each student.
# NOTE: an earlier comment said comp_rate, dbox and box_completed are dropped
# here because they are constant in successful Guans; no statement removes
# them, so they are still present in sucdt.
sucdt <- summarise_guans(TRUE)
# Means over the failed Guans of each student.
faildt <- summarise_guans(FALSE)
# One row per student: columns coming from sucdt and faildt share names, so
# merge() disambiguates them with its default ".x" / ".y" suffixes.
ndt <- merge(sucdt, faildt, by = "id", all.x = TRUE, all.y = TRUE)

# Fit a one-factor mirt model of type `item_type` on the id-by-num table of
# `value_col` and return the factor scores (named `score_name`) next to the
# student id they belong to. Carrying the id along lets the scores be joined
# by key instead of being attached by row position, which would silently
# misalign if ndt and the cast table ever differed in ids or row order.
irt_scores <- function(value_col, item_type, score_name) {
  wide <- dcast(dt, id ~ num, value.var = value_col)
  fit <- mirt(select(wide, -id), 1, itemtype = item_type, verbose = FALSE)
  scores <- data.table(id = wide$id, theta = fscores(fit)[, "F1"])
  setnames(scores, "theta", score_name)
  scores
}

# Graded-response scores from the number of boxes completed per `num`.
ndt <- merge(ndt, irt_scores("box_completed", "graded", "suc_theta"),
  by = "id", all = TRUE
)

# Rasch scores from the 0/1 "success within the time window" indicator.
# dt$suc_within is converted to numeric in place, as before.
dt[, suc_within := as.numeric(suc_within)]
ndt <- merge(ndt, irt_scores("suc_within", "Rasch", "suc_window"),
  by = "id", all = TRUE
)
# Outcome measures. read.spss() comes from the foreign package; only the ID,
# the math grades (columns whose name contains "TM") and Raven are kept.
yvar <- data.table(
  select(
    read.spss(
      file.path("raw_data", "Grade 1+2-others-0313.sav"),
      to.data.frame = TRUE
    ),
    ID, contains("TM"), Raven
  )
)

# Percentile of the Raven score: rank divided by the number of non-missing
# scores. Rows with a missing Raven score get NA.
yvar[, Rperc := rank(Raven, na.last = "keep") / sum(!is.na(Raven))]

# Exploratory factor analysis (fa() from the psych package) on the "TM"
# columns; its scores are appended to yvar as column MR1.
fares <- fa(select(yvar, contains("TM")))
yvar <- cbind(yvar, fares$scores)

# Percentile of MR1, the latent variable estimated by the factor analysis;
# rows with a missing MR1 get NA.
yvar[, Mperc := rank(MR1, na.last = "keep") / sum(!is.na(MR1))]

# Full outer join of the outcomes with the per-student features.
ndt <- merge(yvar, ndt, by.x = "ID", by.y = "id", all.x = TRUE, all.y = TRUE)
# Polar groups on the math factor: rows at or below the 0.25 percentile are
# recoded to 0 and rows at or above the 0.75 percentile to 1 (mutate() is from
# dplyr). The bottom group is stacked first, then columns ID through MR1 are
# removed so that Mperc is the only outcome column left.
Mpolar <- rbind(
  mutate(ndt[Mperc <= 0.25], Mperc = 0),
  mutate(ndt[Mperc >= 0.75], Mperc = 1)
)
Mpolar <- select(Mpolar, -(ID:MR1))
Mpolar <- Mpolar[complete.cases(Mpolar), ] # keep the complete cases only
# NOTE(review): this file goes to the working directory while Rpolar.csv
# below goes to working_data -- confirm which location is intended.
write.csv(Mpolar, "Mpolar.csv")

# Same construction on the Raven percentile, keeping Rperc as the outcome.
# NOTE(review): the lower cut-off here is 0.26, not 0.25 as for Mperc --
# confirm whether the difference is intentional.
Rpolar <- rbind(
  mutate(ndt[Rperc <= 0.26], Rperc = 0),
  mutate(ndt[Rperc >= 0.75], Rperc = 1)
)
Rpolar <- select(Rpolar, -(ID:Raven), -(MR1:Mperc))
Rpolar <- Rpolar[complete.cases(Rpolar), ]
write.csv(Rpolar, file.path("working_data", "Rpolar.csv"))