-
Notifications
You must be signed in to change notification settings - Fork 10
/
twitter_followersGender.R
137 lines (117 loc) · 4.77 KB
/
twitter_followersGender.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
require("dplyr")
require("gender")
library("rCharts")
require("RCurl")
require("scales")
require("twitteR")
set.seed(42)
# Get top 100 twitter users by followers (from twittercounter.com)
top.100 <- getURL("http://twittercounter.com/pages/100")
top.100 <- unlist(strsplit(top.100, "\n"))
top.100 <- top.100[sapply(top.100, grepl, pattern="@")]
top.100 <- gsub(".*>@(.+)<.*", "\\1", top.100)[2:101]
# Your twitter API keys here
cons.key <- scan("../consumerkey.twitter", what="character")
cons.sec <- scan("../consumersecret.twitter", what="character")
## Setup OAuth
# via http://davetang.org/muse/2013/04/06/using-the-r_twitter-package/
cred <- OAuthFactory$new(consumerKey = cons.key,
consumerSecret = cons.sec,
requestURL = 'https://api.twitter.com/oauth/request_token',
accessURL = 'https://api.twitter.com/oauth/access_token',
authURL = 'https://api.twitter.com/oauth/authorize')
cred$handshake()
getTwitterOAuth(cons.key, cons.sec)
registerTwitterOAuth(cred)
user <- lookupUsers(top.100)
# Check your rate limits:
twitteR::getCurRateLimitInfo()
getFollows <- function(userList, numFollowers){
df <- data.frame(user = character(),
followNames = character())
for(u in 1:length(userList)){
print(u)
try({
follows <- userList[[u]]$getFollowers(n=numFollowers)
names <- unlist(lapply(follows, "[[", "name"))
names <- tolower(gsub(" .*", "", names))
rows <- data.frame(user = userList[[u]]$screenName,
followNames = names)
df <- rbind(df, rows)
Sys.sleep(90)
})
}
return(df)
}
### Try get 1k followers per user, with 2k >50% have some technical
### issue. Possible strange characters in usernames / other data
### that break the R JSON import. Could be fixed w/ RCurl probably.
### WARNING: these take a _long_ time to run due to API rate limits.
### wait between requests alone is 2.5h (when using 90s, try 60s).
df <- getFollowers(user, 1000)
# How many have some gender prob? ~60%
length(df$proportion_male[!is.na(df$proportion_male)])
### These had technical issues with 1000 followers, try 500
u2 <- user[which(!top.100 %in% unique(df$user))]
getFollowers(u2, 500)
# Finally 2 still broke, try 200
u3 <- user[which(!top.100 %in% c(unique(as.character(df$user)),
unique(as.character(df2$user))))]
df3 <- getFollowers(u3, 200)
followers <- rbind(df, df2, df3)
followers$followNames <- as.character(followers$followNames)
## 2) Get gender probabilities, methods: 1) simple: threshold,
## 2) probablistic 3) bayesian, combine threshold and probs.
followers <- cbind(followers, gender(followers$followNames))
hist(followers$proportion_male)
followers$mf <- ifelse(runif(nrow(followers)) < followers$proportion_male, "M", "F")
# Plot barcharts with unknown genders included
summarys <- group_by(followers, user, mf) %.%
summarise(sum=n()) %.% mutate(tot=cumsum(sum))
summarys$tot <- rep(summarys$tot[seq(3, nrow(summarys), by =3)], each=3)
pdf("../plots/twitter_GenderWithMissing.pdf", 10, 14)
ggplot(summarys, aes(x=mf, y=sum/tot, fill=mf)) +
facet_wrap(~user, ncol=10) + #, scale="free_y") +
geom_bar(stat="identity") +
theme(legend.position="none") +
scale_y_continuous(labels=percent) +
labs(y="Percent of followers", x="Gender")
dev.off()
## Without unknowns:
plot <- followers
plot <- plot[!is.na(plot$mf),]
plot <- group_by(plot, user, mf) %.% summarise(sum=n()) %.% mutate(tot=cumsum(sum))
## Tidy up:
plot$user <- as.character(plot$user)
plot$tot <- rep(plot$tot[seq(2, nrow(plot), by=2)], each=2)
plot$perc <- with(plot, sum / tot)
plot$perc.m <- rep(plot$perc[seq(2, nrow(plot), by=2)], each=2)
plot$user <- factor(plot$user, levels=unique(plot$user[order(plot$perc.m)]))
plot$mf <- ifelse(plot$mf == "F", "Female", "Male")
pdf("../plots/twitter_genderDist.pdf", 6, 10)
ggplot(plot, aes(x=user, y=sum/tot, fill=mf)) +
geom_bar(stat="identity") +
theme(legend.position="none") +
scale_y_continuous(labels=percent, expand = c(0,0)) +
coord_flip() + scale_fill_brewer(palette="Paired") +
labs(x="Twitter account", y="", fill="Gender of\n followers") +
geom_hline(y=.5, linetype=2) + theme_minimal() +
theme(axis.text.y = element_text(size=8))
dev.off()
# order by top100:
#plot <- plot[order(match(plot$user, top.100)),]
# order by percent male:
plot$user <- as.character(plot$user)
plot <- plot[order(plot$perc.m),]
plot$perc <- plot$perc * 100
plot$sum <- NULL
plot$tot <- NULL
plot$perc.m <- NULL
n1 <- nPlot(perc ~ user, group="mf",
data = plot, color="mf",
type = "multiBarHorizontalChart",
height=1100, width=600)
n1
n1$publish("other")
# Find missing account (@TwitPic)
top.100[which(!top.100 %in% unique(plot$user))]