-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_csv_eachCountry.R
93 lines (67 loc) · 3.01 KB
/
generate_csv_eachCountry.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
library(tidyverse)
library("poLCA")
nonnegtive = read.csv("nonnegative_dataset.csv")
###################TO generate csv output for each individual country
# There is a function to generate model running results and write to csv
## Generate dataset for a certain country if needed
US = nonnegtive %>%
filter(country == "United States") %>%
dplyr::select(c("tax", "religion", "free_election", "state_aid",
"civil_rights", "women"))
mydata = nonnegtive %>%
dplyr::select(c("tax", "religion", "free_election", "state_aid",
"civil_rights", "women"))
## LCA formula, note that the first arg is your dataset.
f = cbind(tax, religion, free_election, state_aid, civil_rights, women)~1
## Some necessary funcitons
gen_country = function(country_name){
a = nonnegtive %>%
filter(country == country_name) %>%
dplyr::select(c("tax", "religion", "free_election", "state_aid",
"civil_rights", "women"))
return(a)
}
get_country_csv = function(country_name){
country = gen_country(country_name)
results = data.frame(matrix(, nrow=5, ncol=13))
colnames(results) = c("model", "log-likelihood", "resid. df", "BIC",
"ABIC", "cAIC", "likelihood-ratio", "Entropy",
"Class1", "Class2", "Class3", "Class4", "Class5")
entropy<-function (p) sum(-p*log(p))
for (i in 1:5){
#model = replicate(5, NA)
model <- poLCA(f, data= country, nclass= i, na.rm = FALSE, nrep=15, maxiter=3500)
results[i,1] = paste("model", i)
results[i,2]<- model$llik
results[i,3]<- model$resid.df
results[i,4]<- model$bic
results[i,5]<- (-2* model$llik) + ((log((model$N + 2)/24)) * model$npar) #abic
results[i,6]<- (-2* model$llik) + model$npar * (1 + log(model$N)) #caic
results[i,7]<- model$Gsq
results[i,8] <- round(((entropy(model$P) - mean(apply(model$posterior,1, entropy),na.rm = TRUE)) / entropy(model$P)),3)
cat("BIC for ", country_name, "are", model$bic)
if (i == 1) {
results[i, 8] = c("-")
}
results[i, 9:13] = c(round(model$P,3), rep("-", 5-i))
}
cat("BIC for ", country_name, "are", results[, 4])
write.csv(results, paste("csv_each_country/", country_name, ".csv", sep = ""), row.names = FALSE)
}
country_list = unique(nonnegtive$country)[-c(1,19)]
#"Argentina", "Sweden" NEED FURTHER WORK
for (i in country_list){
get_country_csv(i)
}
model <- poLCA(f, data= US, nclass= 3, na.rm = FALSE, nrep=15, maxiter=3500)
model$Chisq
## To calculate classification error
posteriors <- data.frame(model$posterior, predclass=model$predclass)
classification_table <-
ddply(posteriors, .(predclass), function(x) colSums(x[,1:3]))
sum(diag(as.matrix(classification_table[2:4])))/sum(classification_table)
##################################
results = data.frame(matrix(, nrow=5, ncol=13))
colnames(results) = c("model", "log-likelihood", "resid. df", "BIC",
"ABIC", "cAIC", "likelihood-ratio", "Entropy",
"Class1", "Class2", "Class3", "Class4", "Class5")