-
Notifications
You must be signed in to change notification settings - Fork 0
/
topicmodel.Rmd
102 lines (86 loc) · 2.92 KB
/
topicmodel.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
---
title: "Merzmail Topics"
author: "Anton Könneke"
output: github_document
---
```{r}
# Attach the text-analysis packages used throughout this document.
pacman::p_load(
  quanteda, readtext, seededlda, tidyverse,
  quanteda.textstats, quanteda.textplots, LSX
)

# Load the `gendered` package from a local source directory.
gendered_dir <- "~/Documents/gh_forks/gendered"
devtools::load_all(path = gendered_dir)

# Read the mail export; `mm` is the working data frame for all later chunks.
mm_path <- "texts/mm_06-2023.csv"
mm <- readtext(file = mm_path)
```
Next, I split the textual `date` column into day, month and year and parse it into a proper `Date`.
```{r}
# "%B" parses month names according to LC_TIME, so the parsing below is done
# under a German time locale (presumably `date` holds German month names —
# confirm against the CSV). Only LC_TIME is switched, and the previous value
# is restored afterwards instead of forcing "en_US" onto the session.
old_lc_time <- Sys.getlocale("LC_TIME")
Sys.setlocale("LC_TIME", "de_DE")

mm <- mm %>%
  mutate(
    # Leading digits: day of month.
    day = str_extract(date, "^\\d+"),
    # Word that directly follows "<digit>. ": the month name.
    mon = str_extract(date, "(?<=\\d{1}\\.\\s)\\w*"),
    # Trailing four digits: year.
    year = str_extract(date, "\\d{4}$"),
    date_fmt = paste(year, mon, day, sep = "-"),
    date_fmt2 = as.Date(date_fmt, format = "%Y-%B-%e")
  ) %>%
  # Drop rows that have no mail text.
  filter(!is.na(fulltext))

# Put the time locale back to what it was before this chunk ran.
Sys.setlocale("LC_TIME", old_lc_time)
```
# Gender
Friedrich Merz meint ja, dass die AfD durch jede gegenderte Nachrichtensendung
Prozente dazugewinnt. Aber wie häufig gendert eigentlich Friedrich Merz?
```{r}
# Normalise whitespace in the mail text and compute the gender statistics
# with the `gendered` package; the result is stored as the `gender_stats`
# column.
mm <- mm %>%
  mutate(
    fulltext = str_squish(fulltext),
    gender_stats = gendered::any_gender(fulltext)
  )

# Pull the individual shares out of `gender_stats` into flat `percent_*`
# columns so they can be pivoted below.
mm <- mm %>%
  mutate(
    percent_gpair = gender_stats$percent_gendered_pair,
    percent_gneut = gender_stats$percent_gendered_neutral,
    percent_gsym = gender_stats$percent_gendered_symbol,
    percent_g = gender_stats$total_gendered_percent
  )

# Legend labels keyed by series name (the `percent_*` suffix) so that each
# label stays attached to its series regardless of level ordering.
gender_labels <- c(
  g = "Gendern insgesamt",
  gneut = "Neutrale Ausdrücke",
  gpair = "Nennung beider\nGeschlechter",
  gsym = "Gendern mit Symbol"
)

# Monthly mean of every share, one line per series.
gender_plot <- mm %>%
  pivot_longer(starts_with("percent"), names_prefix = "percent_") %>%
  mutate(mon = lubridate::floor_date(date_fmt2, "month")) %>%
  summarise(percent = mean(value, na.rm = TRUE), .by = c(mon, name)) %>%
  ggplot(aes(mon, percent, color = name)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0).
  geom_line(linewidth = 1) +
  theme_bw() +
  labs(title = "Gendern in der #MerzMail") +
  ylab("") +
  scale_y_continuous(labels = scales::label_percent()) +
  xlab("") +
  scale_color_manual("", values = c(2:5), labels = gender_labels) +
  coord_cartesian(xlim = as.Date(c("2022-06-01", "2023-06-05")))

# Show the plot in the knitted document ...
gender_plot

# ... and write exactly this plot to disk rather than relying on last_plot().
ggsave("plots/mmgender.png", plot = gender_plot)
```
Clean tokens
```{r}
# Build a corpus from the mail text.
mm_corp <- corpus(mm, text_field = "fulltext")

# Tokenise without punctuation, numbers and symbols. The documented argument
# name is `remove_symbols`; it is spelled out in full here instead of relying
# on partial argument matching.
mm_tok <- tokens(
  mm_corp,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE
)

# Drop German stopwords.
mm_tok <- tokens_remove(mm_tok, pattern = stopwords("de"))

# Document-feature matrix, trimmed with a 0.8 term-frequency quantile as the
# lower bound and a 10% document-share as the upper bound.
mm_dfmat <- dfm(mm_tok) %>%
  dfm_trim(
    min_termfreq = 0.8, termfreq_type = "quantile",
    max_docfreq = 0.1, docfreq_type = "prop"
  )
```
Calculate LDA
```{r}
# Candidate numbers of topics: 5, 8, 11, 14, 17, 20.
topic_counts <- seq(5, 20, 3)

# Fit one LDA model per candidate k.
# NOTE(review): no seed is set, so the fitted topics presumably differ
# between runs — consider calling set.seed() before fitting; confirm that
# textmodel_lda() honours it.
ldas <- lapply(topic_counts, function(k) {
  textmodel_lda(mm_dfmat, k = k)
})

# Top 10 terms per topic for every fitted model.
lapply(ldas, function(fit) terms(fit, 10))

# Pick the k = 14 model by its k value instead of a hard-coded list position,
# so the `topic_k14` docvar cannot silently drift away from the grid above.
stopifnot(14 %in% topic_counts)
lda_k14 <- ldas[[match(14, topic_counts)]]

# Store each document's assigned topic as a docvar and inspect the model.
mm_dfmat$topic_k14 <- topics(lda_k14)
terms(lda_k14, n = 3)
seededlda::sizes(lda_k14)
seededlda::terms(lda_k14)
```
Calculate the keyness of the last mail
```{r}
# Keyness of a single mail, selected by its document id, against all other
# documents in the dfm.
target_doc <- "mm_06-2023.csv.153"
is_target <- docid(mm_dfmat) == target_doc

keyness <- textstat_keyness(mm_dfmat, target = is_target)
textplot_keyness(keyness)
```