-
Notifications
You must be signed in to change notification settings - Fork 0
/
pilot-study.R
106 lines (76 loc) · 3.13 KB
/
pilot-study.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
## pilot STP ICS plan analysis
## based on 3 plans
### load libraries
library(pacman)
p_load(tidyverse, readtext, quanteda, myScrapers)
### extract and import files
# Location of the zipped plans; fail early with a clear message if it is absent
# rather than continuing with an empty file list.
zip_path <- "~/Downloads/fwdofficialaisoftwareanalysisoficsplans.zip"
if (!file.exists(zip_path)) {
  stop("Zip archive not found: ", zip_path, call. = FALSE)
}

# Unpack into a dedicated sub-folder of the session temp dir so that unrelated
# temp files can never be mistaken for plans (unzip() creates it if needed).
dir <- file.path(tempdir(), "ics-plans")
unzip(zip_path, exdir = dir)

# `pattern` is a regular expression: anchor it so only names ending in ".pdf"
# (any case) are selected, not every name that merely contains "pdf".
pdfs <- list.files(dir, pattern = "\\.pdf$", ignore.case = TRUE)
if (length(pdfs) == 0) {
  stop("No PDF files were extracted into ", dir, call. = FALSE)
}

# One row per PDF; the later steps read the `doc_id` and `text` columns.
files <- readtext(file.path(dir, pdfs))
### explore files
# Build a quanteda corpus from the imported documents, using the `text` column
# as the document text.
corp1 <- corpus(files, text_field = "text")

# Lexical dispersion ("x-ray") plot of every token matching "blue*".
# kwic() is applied to a tokens object: calling it directly on a corpus is
# deprecated in quanteda 3 and no longer supported in later releases.
# NOTE(review): in quanteda >= 3 textplot_xray() is provided by the
# quanteda.textplots package - confirm it is attached in this session.
textplot_xray(kwic(tokens(corp1), pattern = phrase("blue*")))
# Reshape the imported documents into the column names handed to
# create_abstract_corpus() (pmid / absText / title) and run it.
# NOTE(review): create_abstract_corpus() is defined outside this script; the
# code below only relies on its result having a `corpus` element with `pmid`,
# `word` and `n` columns.
corpus <- files %>%
  rename(title = doc_id, absText = text) %>%
  mutate(pmid = row_number()) %>%
  create_abstract_corpus()

# Discard every token that contains a digit, then cast the per-document word
# counts into a quanteda document-feature matrix keyed by `pmid`.
dfm <- corpus$corpus %>%
  filter(str_detect(word, "[[:digit:]]", negate = TRUE)) %>%
  cast_dfm(document = pmid, term = word, value = n) %>%
  as.dfm()
### word clouds
# Comparison cloud across the documents in `dfm`; only features counted at
# least twice are drawn, coloured with a 5-step viridis palette that skips the
# darkest fifth of the scale.
textplot_wordcloud(
  dfm,
  min_count = 2,
  comparison = TRUE,
  color = viridis::viridis(n = 5, begin = 0.2)
)
## Search for key words
### create dictionary
# Free-text list of the key words of interest, kept for reference as a single
# string. `l` is not used anywhere else in this script.
l <- list(
  "Built Environment, Natural Environment, Nature, Green space,
Blue space, Parks, Open space, Built and Natural Environment,
Environment(al), Wider determinants, Social determinants, Housing, Homelessness,
Rough sleeping, Good home, Affordable housing, Private rented sector,
Social housing, Accommodation, Sustainable housing, Fuel poverty, Education,
Employment, Place, Neighbourhood, Place based, Deprivation, Deprived communities, Air quality, Air pollution, Transport,
Public transport, Traffic, Congestion, Walking, Cycling, Walking and cycling, Active travel"
)

# Lookup dictionary: one named category per theme, each holding the patterns
# counted towards it.
# NOTE(review): entries without a trailing "*" ("hous", "afford", "plac",
# "fuel", "homeless", "rough", "traffic") match only that exact token under
# glob matching - presumably they target stemmed tokens; confirm against the
# tokens actually present in `dfm`.
dictionary <- create_lookup(
  natural_environment = c("natur*", "green*", "blue*", "open*"),
  built_environment   = c("buil*"),
  determinants        = c("determin*"),
  housing             = c("hous", "hom*", "afford"),
  fuel_poverty        = c("fuel"),
  place_based         = c("plac", "neighbour*"),
  deprivation         = c("depriv*"),
  airquality          = c("pollut*", "air_qual*"),
  transport           = c("traffic", "walk*", "cycl*"),
  homelessness        = c("homeless", "rough"),
  employment          = c("employ*")
)
# Count dictionary hits per document, then reshape to long form: one row per
# document/category pair with its `count`. Every column except `doc_id` is a
# dictionary category.
lu <- dfm %>%
  dfm_lookup(dictionary = dictionary) %>%
  convert(to = "data.frame") %>%
  pivot_longer(cols = -doc_id, names_to = "category", values_to = "count")

# Show the long table as a gt table.
gt::gt(lu)
# Heatmap of dictionary counts: categories along x, documents along y (factor
# order reversed), square tiles. The fill diverges around a count of 30
# (white -> orange -> red).
# Alternative palette, currently disabled:
#   viridis::scale_fill_viridis(direction = -1)
ggplot(lu, aes(x = category, y = fct_rev(doc_id), fill = count)) +
  geom_tile() +
  coord_equal() +
  scale_fill_gradient2(
    low = "white",
    mid = "orange",
    high = "red",
    midpoint = 30
  ) +
  labs(y = "doc_id")
### synonyms / word vectors
library(text2vec)

# Tokenise the raw corpus.
tok <- tokens(corp1)

# Vocabulary to keep: features occurring at least twice across the corpus.
feats <- dfm(tok, verbose = TRUE) %>%
  dfm_trim(min_termfreq = 2) %>%
  featnames()

# Restrict the tokens to that vocabulary before counting co-occurrences
# (previously `feats` was computed but never applied). padding = TRUE leaves
# an empty slot where a token was removed, so the distances between the
# remaining tokens inside the context window are unchanged.
tok <- tokens_select(tok, feats, padding = TRUE)

# Feature co-occurrence matrix over a weighted context window (weights
# 1, 1/2, ..., 1/5 by distance); tri = TRUE keeps only the upper triangle.
fcm <- fcm(
  tok,
  context = "window",
  count = "weighted",
  weights = 1 / (1:5),
  tri = TRUE
)

# Fit a rank-50 GloVe model on the co-occurrence matrix.
glove <- GlobalVectors$new(rank = 50, x_max = 10)
wv_main <- glove$fit_transform(fcm, n_iter = 10,
                               convergence_tol = 0.01, n_threads = 8)

# Final embedding = main vectors + (transposed) context vectors.
wv_context <- glove$components
word_vectors <- wv_main + t(wv_context)
# Query: add the embeddings of two terms and rank every word by cosine
# similarity to that combined vector.
query_terms <- c("homeless", "environment")

# Fail with a readable message if a query term has no embedding, instead of
# an opaque "subscript out of bounds" from the matrix indexing below.
missing_terms <- setdiff(query_terms, rownames(word_vectors))
if (length(missing_terms) > 0) {
  stop(
    "Query term(s) not in the embedding vocabulary: ",
    paste(missing_terms, collapse = ", "),
    call. = FALSE
  )
}

# drop = FALSE keeps each selection as a 1-row matrix so the sum stays a matrix.
test <- word_vectors[query_terms[1], , drop = FALSE] +
  word_vectors[query_terms[2], , drop = FALSE]

library("quanteda.textstats")

# Cosine similarity between every word vector (rows) and the query vector.
cos_sim <- textstat_simil(x = as.dfm(word_vectors), y = as.dfm(test),
                          method = "cosine")

# The 20 words most similar to the combined query.
head(sort(cos_sim[, 1], decreasing = TRUE), 20)