Update code as required by updates to dependent libraries
boltomli committed Dec 30, 2019
1 parent e20ef34 commit 03274b6
Showing 10 changed files with 71 additions and 47 deletions.
11 changes: 5 additions & 6 deletions 01-tidy-text.Rmd
@@ -197,13 +197,12 @@ tidy_mingqingxiaoshuo %>%
count(word, sort = TRUE)
```

-Figure \@ref(fig:plotcount-zh) shows the most common words and their frequencies. Here we try [showtext](https://github.com/yixuan/showtext) [@R-showtext] so that the Chinese characters in the figures render correctly without relying on system fonts. [WenQuanYi Micro Hei](https://wenq.org/wqy2/index.cgi?MicroHei) is a free font that follows the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html) and allows everyone to copy and redistribute it.
+Figure \@ref(fig:plotcount-zh) shows the most common words and their frequencies. Here we try [showtext](https://github.com/yixuan/showtext) [@R-showtext] so that the Chinese characters in the figures render correctly without relying on system fonts. The [WenQuanYi Micro Hei](https://wenq.org/wqy2/index.cgi?MicroHei) font bundled with `showtext` is a free font that follows the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html) and allows everyone to copy and redistribute it.

```{r plotcount-zh, dependson = "tidy_mingqingxiaoshuo", fig.width=6, fig.height=5, fig.cap="四大名著中最常见的词"}
library(showtext)
showtext_auto(enable = TRUE)
-font_add("WenQuanYi Micro Hei", "data/wqy-microhei.ttc")
+pdf()
tidy_mingqingxiaoshuo %>%
count(word, sort = TRUE) %>%
filter(n >= 5000) %>%
@@ -212,7 +211,7 @@ tidy_mingqingxiaoshuo %>%
geom_col() +
xlab(NULL) +
coord_flip() +
theme(text = element_text(family = "WenQuanYi Micro Hei"))
theme(text = element_text(family = "wqy-microhei"))
```

## Word frequencies
@@ -291,14 +290,14 @@ library(scales)
ggplot(frequency, aes(x = proportion, y = `Various`, color = abs(`Various` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
-geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "WenQuanYi Micro Hei") +
+geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "wqy-microhei") +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
facet_wrap(~author, ncol = 2) +
theme(legend.position="none") +
labs(y = "Various", x = NULL) +
theme(text = element_text(family = "WenQuanYi Micro Hei"))
theme(text = element_text(family = "wqy-microhei"))
```

Words whose points fall near the diagonal have similar frequencies in the two sets of texts. For example, "道", "是", and "不" appear at high frequency in both the Four Great Classical Novels and Feng Menglong's texts, while "不", "見", and "上" appear at high frequency in both the Four Great Classical Novels and Yuan Mei's texts. Words far from the diagonal are much more common in one set of texts than in the other. For example, in the panel comparing the Four Great Classical Novels with Feng Menglong, words such as "寳玉" and "大聖" (all of them names and forms of address) are common in the Four Great Classical Novels but not in Feng Menglong's texts, while "東坡" is more common in Feng Menglong's texts than in the Four Great Classical Novels.
32 changes: 25 additions & 7 deletions 02-sentiment-analysis.Rmd
@@ -7,7 +7,7 @@ options(width = 100, dplyr.width = 100)
library(jiebaR)
library(showtext)
showtext_auto(enable = TRUE)
-font_add("WenQuanYi Micro Hei", "data/wqy-microhei.ttc")
+pdf()
library(ggplot2)
theme_set(theme_light())
```
@@ -38,7 +38,7 @@ sentiments

All three of these lexicons are based on unigrams, i.e., single words. They contain many English words, each with a score for positive/negative sentiment and possibly for emotion categories such as joy, anger, and sadness. The `nrc` lexicon labels words in a binary (yes/no) fashion across categories such as positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust. The `bing` lexicon sorts words in a binary fashion into positive and negative categories. The `AFINN` lexicon assigns each word a value between -5 and 5, with negative values indicating negative sentiment and positive values indicating positive sentiment. All of this information is tabulated in the `sentiments` dataset, and tidytext provides the `get_sentiments()` function to retrieve a specific sentiment lexicon without the columns it does not use.

-```{r}
+```{r, eval=FALSE}
get_sentiments("afinn")
get_sentiments("bing")
get_sentiments("nrc")
@@ -90,8 +90,17 @@ tidy_books <- hongloumeng_en %>%

Now that the text is in a tidy format with one word per row, we are ready for the sentiment analysis. First, use the NRC lexicon and `filter()` to get the joy words. Next, use `filter()` to get the words from the first half of the book, then use `inner_join()` to perform the sentiment analysis. What are the most common joy words in the first half of the book? Use `count()` from dplyr.

+```{r eval=FALSE}
+nrc <- get_sentiments("nrc")
+```
+
+```{r echo=FALSE}
+load("data/nrc.rda")
+nrc
+```

```{r nrcjoy, dependson = "tidy_books_sentiment"}
-nrc_joy <- get_sentiments("nrc") %>%
+nrc_joy <- nrc %>%
filter(sentiment == "joy")
tidy_books %>%
@@ -151,18 +160,27 @@ Remember from above that the AFINN lexicon measures sentiment with a numeric sco

We again use integer division (`%/%`) to define larger sections of text that span multiple lines, and use `count()`, `spread()`, and `mutate()` in the same way to compute the net sentiment in each of these sections.
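A minimal sketch of that pattern, assuming `book_2` is the tokenized data frame (one word per row, with a `linenumber` column) and using the Bing lexicon:

```r
# Sketch: bin the book into 100-line sections with integer division, count
# positive and negative words per section, spread the counts into columns,
# and take their difference as the net sentiment of each section.
library(dplyr)
library(tidyr)
library(tidytext)

book_2 %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(index = linenumber %/% 100, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
```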

+```{r eval=FALSE}
+get_sentiments("afinn")
+```
+
+```{r echo=FALSE}
+load("data/afinn.rda")
+afinn
+```

```{r comparesentiment, dependson = "book_2"}
afinn <- book_2 %>%
inner_join(get_sentiments("afinn")) %>%
inner_join(afinn) %>%
group_by(index = linenumber %/% 100) %>%
-summarise(sentiment = sum(score)) %>%
+summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
bing_and_nrc <- bind_rows(book_2 %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
book_2 %>%
inner_join(get_sentiments("nrc") %>%
inner_join(nrc %>%
filter(sentiment %in% c("positive",
"negative"))) %>%
mutate(method = "NRC")) %>%
@@ -254,7 +272,7 @@ hongloumeng %>%
unnest_tokens(word, text) %>%
count(word) %>%
top_n(80) %>%
-wordcloud2(fontFamily = "WenQuanYi Micro Hei")
+wordcloud2(fontFamily = "wqy-microhei")
```

The wordcloud package [@R-wordcloud], which uses base R graphics, includes `comparison.cloud()`; we can use reshape2's `acast()` to turn the data frame into a matrix for it. We tag positive and negative words with an inner join on a sentiment lexicon, then find the most common positive and negative words. Right up until the data is handed to `comparison.cloud()`, this can all be done with joins, piping, and dplyr, because the data is in tidy format.
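A minimal sketch of that pipeline, assuming the `tidy_books` data frame built from the English translation earlier in the chapter and the Bing lexicon (the chunk itself falls outside the hunks shown here):

```r
# Sketch: tag words as positive or negative, cast the tidy counts into a
# word-by-sentiment matrix with acast(), and pass it to comparison.cloud().
library(dplyr)
library(tidytext)
library(reshape2)
library(wordcloud)

tidy_books %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"), max.words = 100)
```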
4 changes: 2 additions & 2 deletions 03-tf-idf.Rmd
@@ -8,9 +8,9 @@ library(jiebaR)
library(ggplot2)
library(showtext)
showtext_auto(enable = TRUE)
-font_add("WenQuanYi Micro Hei", "data/wqy-microhei.ttc")
+pdf()
theme_zh <- theme_light() +
theme(text = element_text(family = "WenQuanYi Micro Hei"))
theme(text = element_text(family = "wqy-microhei"))
theme_set(theme_zh)
```

36 changes: 20 additions & 16 deletions 04-word-combinations.Rmd
@@ -148,39 +148,44 @@ bigrams_separated %>%

By performing sentiment analysis on the bigram data, we can examine how often sentiment-associated words are preceded by "not" or other negating words. We could use this to ignore or even reverse their contribution to the sentiment score.

-Let's use the AFINN lexicon for sentiment analysis, which you may recall gives a numeric sentiment score for each word, with positive or negative numbers indicating the direction of the sentiment.
+Let's use the AFINN lexicon for sentiment analysis, which you may recall gives a numeric sentiment value for each word, with positive or negative numbers indicating the direction of the sentiment.

-```{r AFINN_ngrams}
+```{r eval=FALSE}
AFINN <- get_sentiments("afinn")
AFINN
```

+```{r AFINN_ngrams, echo=FALSE}
+load("data/afinn.rda")
+AFINN <- afinn
+AFINN
+```

We can then examine the most frequent words that were preceded by "not" and were associated with a sentiment.

```{r not_words, dependson = c("bigrams", "AFINN_ngrams")}
not_words <- bigrams_separated %>%
filter(word1 == "not") %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
-count(word2, score, sort = TRUE) %>%
-ungroup()
+count(word2, value, sort = TRUE)
not_words
```

For example, the most common sentiment-associated word to follow "not" was "like", which would normally have a (positive) score of 2.

-It's worth asking which words contributed the most in the "wrong" direction. To compute that, we can multiply their score by the number of times they appear (so that a word with a score of +3 occurring 10 times has as much impact as a word with a sentiment score of +1 occurring 30 times). We visualize the result with a bar plot (Figure \@ref(fig:notwordsplot)).
-
-```{r notwordsplot, dependson = "not_words", fig.width=8, fig.height=6, fig.cap = "The 20 words preceded by 'not' that had the greatest contribution to sentiment scores, in either a positive or negative direction"}
+It's worth asking which words contributed the most in the "wrong" direction. To compute that, we can multiply their value by the number of times they appear (so that a word with a value of +3 occurring 10 times has as much impact as a word with a sentiment value of +1 occurring 30 times). We visualize the result with a bar plot (Figure \@ref(fig:notwordsplot)).
+
+```{r notwordsplot, dependson = "not_words", fig.width=8, fig.height=6, fig.cap = "The 20 words preceded by 'not' that had the greatest contribution to sentiment values, in either a positive or negative direction"}
library(ggplot2)
not_words %>%
-mutate(contribution = n * score) %>%
+mutate(contribution = n * value) %>%
arrange(desc(abs(contribution))) %>%
head(20) %>%
mutate(word2 = reorder(word2, contribution)) %>%
-ggplot(aes(word2, n * score, fill = n * score > 0)) +
+ggplot(aes(word2, n * value, fill = n * value > 0)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"not\"") +
ylab("Sentiment score * number of occurrences") +
ylab("Sentiment value * number of occurrences") +
coord_flip()
```

@@ -193,24 +198,23 @@ negation_words <- c("not", "no", "never", "without")
negated_words <- bigrams_separated %>%
filter(word1 %in% negation_words) %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
-count(word1, word2, score, sort = TRUE) %>%
-ungroup()
+count(word1, word2, value, sort = TRUE)
```

-We could then visualize what the most common words to follow each particular negation are (Figure \@ref(fig:negatedwords)). While "not like" and "not help" are still the two most common examples, we can also see pairings such as "no great" and "never loved." We could combine this with the approaches in Chapter \@ref(sentiment) to reverse the AFINN scores of each word that follows a negation. These are just a few examples of how finding consecutive words can give context to text mining methods.
+We could then visualize what the most common words to follow each particular negation are (Figure \@ref(fig:negatedwords)). While "not like" and "not help" are still the two most common examples, we can also see pairings such as "no great" and "never loved." We could combine this with the approaches in Chapter \@ref(sentiment) to reverse the AFINN values of each word that follows a negation. These are just a few examples of how finding consecutive words can give context to text mining methods.

```{r negatedwords, dependson = "negated_words", fig.width=9, fig.height=9, echo = FALSE, fig.cap = "The most common positive or negative words to follow negations such as 'never', 'no', 'not', and 'without'"}
negated_words %>%
-mutate(contribution = n * score,
+mutate(contribution = n * value,
word2 = reorder(paste(word2, word1, sep = "__"), contribution)) %>%
group_by(word1) %>%
top_n(12, abs(contribution)) %>%
-ggplot(aes(word2, contribution, fill = n * score > 0)) +
+ggplot(aes(word2, contribution, fill = n * value > 0)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ word1, scales = "free") +
scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) +
xlab("Words preceded by negation term") +
ylab("Sentiment score * # of occurrences") +
ylab("Sentiment value * # of occurrences") +
coord_flip()
```

29 changes: 16 additions & 13 deletions 05-document-term-matrices.Rmd
@@ -309,7 +309,8 @@ Each of the items in the `corpus` list column is a `WebCorpus` object, which is

```{r stock_tokens, dependson = "stock_articles"}
stock_tokens <- stock_articles %>%
-unnest(map(corpus, tidy)) %>%
+mutate(corpus = map(corpus, tidy)) %>%
+unnest(cols = (corpus)) %>%
unnest_tokens(word, text) %>%
select(company, datetimestamp, word, id, heading)
@@ -318,7 +319,7 @@ stock_tokens

Here we see some of each article's metadata alongside the words used. We could use tf-idf to determine which words were most specific to each stock symbol.

-```{r}
+```{r stocktfidfdata, dependson="stock_tokens"}
library(stringr)
stock_tf_idf <- stock_tokens %>%
@@ -330,7 +331,7 @@ stock_tf_idf <- stock_tokens %>%

The top terms for each are visualized in Figure \@ref(fig:stocktfidf). As we'd expect, the company's name and symbol are typically included, but so are several of their product offerings and executives, as well as companies they are making deals with (such as Disney with Netflix).

```{r stocktfidf, dependson = "stock_tf_idf", echo = FALSE, fig.cap = "The 8 words with the highest tf-idf in recent articles specific to each company", fig.height = 8, fig.width = 8}
```{r stocktfidf, dependson = "stocktfidfdata", echo = FALSE, fig.cap = "The 8 words with the highest tf-idf in recent articles specific to each company", fig.height = 8, fig.width = 8}
stock_tf_idf %>%
group_by(company) %>%
top_n(8, tf_idf) %>%
@@ -346,19 +347,20 @@ stock_tf_idf %>%

If we were interested in using recent news to analyze the market and make investment decisions, we'd likely want to use sentiment analysis to determine whether the news coverage was positive or negative. Before we run such an analysis, we should look at what words would contribute the most to positive and negative sentiments, as was shown in Chapter \@ref(most-positive-negative). For example, we could examine this within the AFINN lexicon (Figure \@ref(fig:stockafinn)).

-```{r stockafinn, dependson = "stock_articles", fig.cap = "The words with the largest contribution to sentiment scores in recent financial articles, according to the AFINN dictionary. The 'contribution' is the product of the word and the sentiment score."}
+```{r stockafinn, dependson = "stock_articles", fig.cap = "The words with the largest contribution to sentiment values in recent financial articles, according to the AFINN dictionary. The 'contribution' is the product of the word and the sentiment value."}
+load("data/afinn.rda")
stock_tokens %>%
anti_join(stop_words, by = "word") %>%
count(word, id, sort = TRUE) %>%
inner_join(get_sentiments("afinn"), by = "word") %>%
inner_join(afinn, by = "word") %>%
group_by(word) %>%
-summarize(contribution = sum(n * score)) %>%
+summarize(contribution = sum(n * value)) %>%
top_n(12, abs(contribution)) %>%
mutate(word = reorder(word, contribution)) %>%
ggplot(aes(word, contribution)) +
geom_col() +
coord_flip() +
labs(y = "Frequency of word * AFINN score")
labs(y = "Frequency of word * AFINN value")
```

In the context of these financial articles, there are a few big red flags here. The words "share" and "shares" are counted as positive verbs by the AFINN lexicon ("Alice will **share** her cake with Bob"), but they're actually neutral nouns ("The stock price is $12 per **share**") that could just as easily be in a positive sentence as a negative one. The word "fool" is even more deceptive: it refers to Motley Fool, a financial services company. In short, we can see that the AFINN sentiment lexicon is entirely unsuited to the context of financial data (as are the NRC and Bing lexicons).
@@ -368,9 +370,10 @@ Instead, we introduce another sentiment lexicon: the Loughran and McDonald dicti
The Loughran data divides words into six sentiments: "positive", "negative", "litigious", "uncertain", "constraining", and "superfluous". We could start by examining the most common words belonging to each sentiment within this text dataset.

```{r stockloughransentiments, fig.cap = "The most common words in the financial news articles associated with each of the six sentiments in the Loughran and McDonald lexicon"}
load("data/loughran.rda")
stock_tokens %>%
count(word) %>%
inner_join(get_sentiments("loughran"), by = "word") %>%
inner_join(loughran, by = "word") %>%
group_by(sentiment) %>%
top_n(5, n) %>%
ungroup() %>%
@@ -388,7 +391,7 @@ Now that we know we can trust the dictionary to approximate the articles' sentim

```{r}
stock_sentiment_count <- stock_tokens %>%
inner_join(get_sentiments("loughran"), by = "word") %>%
inner_join(loughran, by = "word") %>%
count(sentiment, company) %>%
spread(sentiment, n, fill = 0)
@@ -399,13 +402,13 @@ It might be interesting to examine which company has the most news with "litigio

```{r stockpositivity, fig.cap = "\"Positivity\" of the news coverage around each stock in January 2017, calculated as (positive - negative) / (positive + negative), based on uses of positive and negative words in 20 recent news articles about each company"}
stock_sentiment_count %>%
-mutate(score = (positive - negative) / (positive + negative)) %>%
-mutate(company = reorder(company, score)) %>%
-ggplot(aes(company, score, fill = score > 0)) +
+mutate(value = (positive - negative) / (positive + negative)) %>%
+mutate(company = reorder(company, value)) %>%
+ggplot(aes(company, value, fill = value > 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
labs(x = "Company",
y = "Positivity score among 20 recent news articles")
y = "Positivity value among 20 recent news articles")
```

Based on this analysis, we'd say that in January 2017 most of the coverage of Yahoo and Twitter was strongly negative, while coverage of Google and Amazon was the most positive. A glance at current financial headlines suggest that it's on the right track. If you were interested in further analysis, you could use one of R's many quantitative finance packages to compare these articles to recent stock prices and other metrics.
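As a minimal sketch of that kind of comparison, assuming the ticker symbols are still at hand (for example alongside `stock_articles`), quantmod is one such package:

```r
# Sketch: pull January 2017 closing prices with quantmod and compute a simple
# return per ticker, which could then be set against the positivity values above.
library(quantmod)

january_return <- function(symbol) {
  prices <- getSymbols(symbol, src = "yahoo",
                       from = "2017-01-01", to = "2017-02-01",
                       auto.assign = FALSE)
  close <- as.numeric(Cl(prices))  # closing prices as a plain numeric vector
  close[length(close)] / close[1] - 1
}

january_return("GOOG")
```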
6 changes: 3 additions & 3 deletions 06-topic-models.Rmd
@@ -13,9 +13,9 @@ library(methods)
library(scales)
library(showtext)
showtext_auto(enable = TRUE)
-font_add("WenQuanYi Micro Hei", "data/wqy-microhei.ttc")
+pdf()
theme_zh <- theme_light() +
theme(text = element_text(family = "WenQuanYi Micro Hei"))
theme(text = element_text(family = "wqy-microhei"))
theme_set(theme_zh)
```

Expand Down Expand Up @@ -353,7 +353,7 @@ assignments %>%
fill = "正确分配的百分比") +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
panel.grid = element_blank(),
text = element_text(family = "WenQuanYi Micro Hei"))
text = element_text(family = "wqy-microhei"))
```

Notice that almost all of the words were assigned correctly, but words from quite a few books were pulled toward 《水滸後傳》. Which words were misassigned most often?
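A minimal sketch of one way to look at this, assuming `assignments` carries `title`, `consensus`, `term`, and `count` columns (those names are an assumption; the chunk itself falls outside the hunks shown here):

```r
# Sketch: keep the rows where a word's book differs from the consensus book of
# its assigned topic, then count the most frequently misassigned words.
library(dplyr)

wrong_words <- assignments %>%
  filter(title != consensus)

wrong_words %>%
  count(title, consensus, term, wt = count, sort = TRUE)
```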
Binary file added data/afinn.rda
Binary file added data/loughran.rda
Binary file added data/nrc.rda
Binary file removed data/wqy-microhei.ttc
