IPBES_TCA_Corpus_DMP.qmd

---
title: 'Data Management Report Transformative Change Assessment Corpus - SOD'
date: today
author:
  - name: 
        family: Krug
        given: Rainer M.    
    id: rmk
    orcid: 0000-0002-7490-0066
    email: Rainer.Krug@Senckenberg.de, Rainer@Krugs.de
    affiliation: 
      - name: Senckenberg
        city: Frankfurt (Main)
        url: https://www.senckenberg.de/en/institutes/sbik-f/
    roles: [author, editor]
abstract: > 
    The literature search for the assessment corpus was conducted using search terms provided by the experts and refined in co-operation with the Knowldge and Data task force.
    The search was conducted using [OpenAlex](https://openalex.org), scripted from [R](https://cran.r-project.org) to use  the [API](https://docs.openalex.org). Search terms for the following searches were defined:
    **Transformative Change**,
    **Nature / Environment** and 
    **additional search terms for individual chapters and sub-chapters**
    To assess the quality of the corpus, sets of key-papers were selected by the experts to verify if these are in the corpus. 
    These key-papers were selected per chapter / sub-chapter to ensure that the corpus is representative of each chapter.  
keywords:
  - DMR
  - TCA
  - Assessment Corpus
license: "CC BY"

citation: 
  type: report
  container-title: Report Transformative Change Assessment Corpus
  doi: 10.5281/zenodo.10251349
doi: 10.5281/zenodo.10251349
version: 2.0.0

format:
    html:
        toc: true
        toc-depth: 5
        toc_expand: true
        embed-resources: true
        code-fold: true
        code-summary: 'Show the code'
        grid:
            sidebar-width: 0px
            body-width: 4000px
            margin-width: 200px
            gutter-width: 1.5rem      
execute:
  message: NA
params:
    # s_level_1: (transformation OR transition* OR ((shift OR change) AND (fundamental OR deep OR radical))) AND (socio OR social OR politics OR political OR governance OR economical OR cultural OR system* OR technological OR inner OR personal)  
    
    # s_tfc_rev: "('transformative change'  OR  'deliberate transformation*'  OR  'transformative turn*'  OR  'transition*'  OR  'social-ecological change*'  OR  'deep change'  OR  'fundamental alteration'  OR  'profound change'  OR  'profound transformation'  OR  'radical transformation'  OR  'transformational change'  OR  'complete change'  OR  'complete transformation'  OR  'drastic change'  OR  'in-depth transformation'  OR  'progressive change'  OR  'radical alteration'  OR  'radical change'  OR  'revolutionary change'  OR  'significant modification'  OR  'total transformation'  OR  'transition'  OR  'pathway'  OR  'power'  OR  'agency'  OR  'scale'  OR  'leverage'  OR  'context'  OR  'process'  OR  'regime'  OR  'shift'  OR  'views'  OR  'value*'  OR  'structure*'  OR  'institution*' OR  'deliberate'  OR  'structural'  OR  'fundamental'  OR  'system*'  OR  'deep'  OR  'radical'  OR  'profound'  OR  'drastic'  OR  'widespread'  OR  'political'  OR  'economical'  OR  'structur*'  OR  'complete'  OR  'progressive'  OR  'revolutionary'  OR  'substantial'  OR  'significant') AND ('transformation'  OR  'alteration'  OR  'change'  OR  'turn'  OR  'action' OR  'transition'  OR  'shift' )"
 
    s_1_oa: ""
    s_1_transformative_change: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "tfc.txt")), collapse = "\n")
    s_1_nature_environment: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "nature.txt")), collapse = "\n")
    s_1_tca_corpus: !expr paste("(", paste0(readLines(file.path("tca_corpus", "input", "search terms", "nature.txt")), collapse = "\n"), ") \nAND \n(", paste0(readLines(file.path("tca_corpus", "input", "search terms", "tfc.txt")), collapse = "\n"), ")")
    s_1_ch1_01: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "ch1_01.txt")), collapse = "\n")
    s_1_ch1_02: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "ch1_02.txt")), collapse = "\n")
    s_1_ch1_03: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "ch1_03.txt")), collapse = "\n")
    s_1_ch1_04: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "ch1_04.txt")), collapse = "\n")
    s_1_ch1_05: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "ch1_05.txt")), collapse = "\n")
    s_1_ch1_06: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "ch1_06.txt")), collapse = "\n")
    s_1_ch2:    !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "ch2.txt")),    collapse = "\n")
    s_1_ch3_01: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "ch3_01.txt")), collapse = "\n")
    s_1_ch3_02: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "ch3_02.txt")), collapse = "\n")
    s_1_ch3_03: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "ch3_03.txt")), collapse = "\n")
    s_1_ch3_04: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "ch3_04.txt")), collapse = "\n")
    s_1_ch3_05: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "ch3_05.txt")), collapse = "\n")
    s_1_ch3_06: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "ch3_06.txt")), collapse = "\n")
    s_1_ch4_01: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "ch4_01.txt")), collapse = "\n")     
    s_1_ch4_02: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "ch4_02.txt")), collapse = "\n")
    # s_1_case:   !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "case.txt")),   collapse = "\n")
    s_1_case: !expr paste("(", paste0(readLines(file.path("tca_corpus", "input", "search terms", "nature.txt")), collapse = "\n"), ") \nAND \n(", paste0(readLines(file.path("tca_corpus", "input", "search terms", "tfc.txt")), collapse = "\n"), ") \nAND \n(", paste0(readLines(file.path("tca_corpus", "input", "search terms", "case.txt")), collapse = "\n"), ")")


    st_art:   !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "art.txt")),   collapse = "\n")
    st_media: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "media.txt")),   collapse = "\n")
    st_imaginative: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "imaginative.txt")),   collapse = "\n")
    st_rights_of_nature: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "rights_of_nature.txt")),   collapse = "\n")
    st_initiative: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "initiative.txt")),   collapse = "\n")
    st_creativity: !expr paste0(readLines(file.path("tca_corpus", "input", "search terms", "creativity.txt")),   collapse = "\n")
    
    sts_strategies_options: !expr readLines(file.path("tca_corpus", "input", "search terms", "strategies_options.md"))
    sts_actors: !expr readLines(file.path("tca_corpus", "input", "search terms", "actors.md"))


    # s_1_ch2_vision_case: !expr paste("(", paste0(readLines(file.path("tca_corpus", "input", "search terms", "ch2.txt")), collapse = "\n"), ") \nAND \n(", paste0(readLines(file.path("tca_corpus", "input", "search terms", "case.txt")), collapse = "\n"), ")")
   
    concept_cuttoff: 0.6

    key_papers:
    # Ch_1:
        - "./tca_corpus/input/key papers/Ch 1 - Arun.csv"
        - "./tca_corpus/input/key papers/Ch 1 - pdf.csv"
        - "./tca_corpus/input/key papers/Ch 1 - word.csv"
    # Ch_2:
        - "./tca_corpus/input/key papers/Ch 2 - pdf.csv"
        - "./tca_corpus/input/key papers/Ch 2 - Sebastian.csv"
    # Ch_3_Cl_1:
        - "./tca_corpus/input/key papers/Ch 3 - Cl1.csv"
    # Ch_3_Cl_3:
        - "./tca_corpus/input/key papers/Ch 3 - Cl3.csv"
    # Ch_3_Cl_4:
        - "./tca_corpus/input/key papers/Ch 3 - Cl4.csv"
    # Ch_3_Cl_5:
        - "./tca_corpus/input/key papers/Ch 3 - Cl5.csv"
    # Ch_3Cl_6:
        - "./tca_corpus/input/key papers/Ch 3 - Cl6.csv"
    # Ch_3:
        - "./tca_corpus/input/key papers/Ch 3 - pdf.csv"
    # Ch_4_Cl_1:
        - "./tca_corpus/input/key papers/Ch 4 - Challenge 1.csv"
    # Ch_4_Cl_2:
        - "./tca_corpus/input/key papers/Ch 4 - Challenge 2.csv"
    # Ch_4_Cl_3:
        - "./tca_corpus/input/key papers/Ch 4 - Challenge 3.csv"
    # Ch_4_Cl_4:
        - "./tca_corpus/input/key papers/Ch 4 - Challenge 4.csv"
    # Ch_4_Cl_5:
        - "./tca_corpus/input/key papers/Ch 4 - Challenge 5.csv"
    # Ch_5:
        - "./tca_corpus/input/key papers/Ch 5 - Hannah.csv"
        - "./tca_corpus/input/key papers/Ch 5 - Victoria.csv"
        - "./tca_corpus/input/key papers/Ch 5 -.csv"


    gdm_dir: !expr file.path(".", "tca_corpus", "data", "gdm_dir")

    pages_dir: !expr file.path(".", "tca_corpus", "data", "pages")
    pages_cases_dir: !expr file.path(".", "tca_corpus", "data", "pages_cases")
    pages_strategies_dir: !expr file.path(".", "tca_corpus", "data", "pages_strategies")
    pages_actors_dir: !expr file.path(".", "tca_corpus", "data", "pages_actors")

    corpus_dir: !expr file.path(".", "tca_corpus", "data", "corpus")
    corpus_topics_dir: !expr file.path(".", "tca_corpus", "data", "corpus_topics")
    corpus_authors_dir: !expr file.path(".", "tca_corpus", "data", "corpus_authors")
    corpus_cases_dir: !expr file.path(".", "tca_corpus", "data", "corpus_cases")

    duckdb_fn: !expr file.path(".", "tca_corpus", "data", "tca_corpus.duckdb")

    sample_size: 10000
    mc.cores: 8
---


[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10251349.svg)](https://doi.org/10.5281/zenodo.10251349)
[![GitHub release](https://img.shields.io/github/release/IPBES-Data/IPBES_TCA_Corpus.svg)](https://github.com/IPBES-Data/IPBES_TCA_Corpus/releases/latest)
[![GitHub commits since latest release](https://img.shields.io/github/commits-since/IPBES-Data/IPBES_TCA_Corpus/latest)](https://github.com/IPBES-Data/IPBES_TCA_Corpus/commits/main)
[![License: CC BY 4.0](https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by/4.0/)

 
```{r}
#| label: setup
#| include: false

# paste(
#     "(", paste0(readLines(file.path("tca_corpus", "input", "search terms", "nature.txt")), collapse = "\n"), ") \nAND \n(", paste0(readLines(file.path("tca_corpus", "input", "search terms", "tfc.txt")), collapse = "\n"), ")"
# )

knitr::opts_chunk$set(message = NA)

build <- as.integer(readLines(file.path("tca_corpus", "buildNo")))
build <- build + 1
writeLines(as.character(build), file.path("tca_corpus", "buildNo"))

if (!exists("params")) {
    params <- rmarkdown::yaml_front_matter("./IPBES_TCA_Corpus_DMP.qmd")$params
}

knitr::opts_chunk$set(message = NA)

library(parallel)
library(pbmcapply)
library(pbapply)

library(openalexR)

library(dplyr)
library(tibble)

library(ggplot2)
library(patchwork)
library(igraph)
library(ggraph)
library(visNetwork)
library(DT)

library(tictoc)


library(knitr)

library(arrow)
library(parquetize)
library(purrr)
library(base64enc)

library(diffviewer)

library(collapsibleTree)

library(htmlwidgets)


if (!require("plantuml")) {
    install.packages("plantuml", repos = c("https://rkrug.r-universe.dev", "https://cran.r-project.org"))
    library(plantuml)
}

plantuml::server_set("remote")


# Function to make the search strings compacter
# compact <- function(x) {
#     x |>
#         gsub(pattern = "\n", replacement = " ") |>
#         gsub(pattern = "\\*", replacement = "") |>
#         gsub(pattern = "\\s+", replacement = " ") |>
#         gsub(pattern = "\\( ", replacement = "(") |>
#         gsub(pattern = " )", replacement = ")")
# }

if (!require("IPBES.R")) {
    install.packages("IPBES.R", repos = c("https://ipbes-data.r-universe.dev", "https://cloud.r-project.org"))
    if (!require("IPBES.R")) {
        stop("Package `IPBES.R` is not available and could not be installed!")
    }
}

source(file.path("tca_corpus", "R", "functions.R"))

# A single core request is needed to make multicore parallel processing work
invisible(oa_request(oa_query("bidiversity"), count_only = TRUE))

```

# Working Title
IPBES_TCA_Corpus

## Code repo

[Github repository](https://github.com/IPBES-Data/IPBES_TCA_Corpus)

## Build No: ``r build``

# Introduction
The following terminology is used in this document:

- **Individual corpus**: The corpus resulting from one search term, e.g. `transformative` or `nature` or `ChX_Y`
- **Assessment Corpus**: The corpus resulting from the search terms `transformative AND nature`
- **Chapter corpus**: The corpus resulting from `transformative AND Nature AND ChX_Y` 

The following searches are conducted on Title and Abstrat only as the availability of fulltext drops in 2020. OpenAlex did "inherit" fro Microsoft Academic their initial corpus in 2021 which contained a lot of fulltext for searches. After that time, other sources had to be used which did not include fulltext for searches. To eliminate this bias, we linit the search for terms in abstract and title only.

## Schematic Overview

```{r}
#| label: tca_corpus_overview_plantuml
#|

basename <- file.path("tca_corpus", "figures")

nf <- list.files(
    path = dirname(basename),
    pattern = basename(basename)
) |>
    length()

if (nf < 3) {
    puml <- readLines(file.path("tca_corpus", "input", "tca_corpus.plantuml")) |>
        paste(collapse = "\n") |>
        plantuml::plantuml()

    puml |>
        plantuml::get_graph(
            file = file.path("tca_corpus", "figures", "tca_corpus_overview.pdf")
        )

    puml |>
        plantuml::get_graph(
            file = file.path("tca_corpus", "figures", "tca_corpus_overview.svg")
        )

    puml |>
        plantuml::get_graph(
            file = file.path("tca_corpus", "figures", "tca_corpus_overview.png")
        )
}
```

![Schematic overview of the TCA Corpus as well as othher corpora using the TCA Corpus](tca_corpus/figures/tca_corpus_overview.png)

- To download png, [click here](tca_corpus/tca_corpus_overview.png){target="_blank"}.
- To download high resolution pdf, [click here](tca_corpus/figures/tca_corpus_overview.pdf){target="_blank"}.
- To download high resolution svg, [click here](tca_corpus/figures/tca_corpus_overview.svg){target="_blank"}.


## Search Terms {#sec-search_terms}
Here are the search terms used in this document. They were provided by the authors, and some adaptations were done by the tsu to adopt them for OpenAlex.

### Transformative Change {#sec-transform}


```{r}
#| label: s_1_transformative_change

cat(params$s_1_transformative_change)
```


### Nature {#sec-nature}


```{r}
#| label: s_1_nature_environment
#|

cat(params$s_1_nature_environment)
```


### Assessment Corpus {#sec-tca-corpus}

```{r}
#| label: s_1_tca_corpus
#|

cat(params$s_1_tca_corpus)
```


### Chapter 1

#### 01 {#sec-ch1-01}


```{r}
#| label: s_1_ch1_01
#|

cat(params$s_1_ch1_01)
```


#### 02 {#sec-ch1-02}


```{r}
#| label: s_1_ch1_02
#|

cat(params$s_1_ch1_02)
```


#### 03 {#sec-ch1-03}


```{r}
#| label: s_1_ch1_03
#|

cat(params$s_1_ch1_03)
```


#### 04 {#sec-ch1-04}


```{r}
#| label: s_1_ch1_04
#|

cat(params$s_1_ch1_04)
```


#### 05 {#sec-ch1-05}


```{r}
#| label: s_1_ch1_05
#|

cat(params$s_1_ch1_05)
```


#### 06 {#sec-ch1-06}


```{r}
#| label: s_1_ch1_06
#|

cat(params$s_1_ch1_06)
```


### Chapter 2 {#sec-ch2}


```{r} 
#| label: s_1_ch2
#|

cat(params$s_1_ch2)
```


### Chapter 3

#### 01 {#sec-ch3-01}


```{r}
#| label: s_1_ch3_01
#|

cat(params$s_1_ch3_01)
```


#### 02 {#sec-ch3-02}


```{r}
#| label: s_1_ch3_02
#|

cat(params$s_1_ch3_02)
```


#### 03 {#sec-ch3-03}


```{r}
#| label: s_1_ch3_03
#|

cat(params$s_1_ch3_03)
```


#### 04 {#sec-ch3-04}


```{r}
#| label: s_1_ch3_04
#|

cat(params$s_1_ch3_04)
```


#### 05 {#sec-ch3-05}

```{r}
#| label: s_1_ch3_05
#|

cat(params$s_1_ch3_05)
```


#### 06 {#sec-ch3-06}


```{r}
#| label: s_1_ch3_06
#|

cat(params$s_1_ch3_06)
```


### Chapter 4

#### 01 {#sec-ch4-01}


```{r}
#| label: s_1_ch4_01
#|

cat(params$s_1_ch4_01)
```


#### 02 {#sec-ch4-02}


```{r}
#| label: s_1_ch4_02
#|

cat(params$s_1_ch4_02)
```

### Art

```{r}
#| label: st_art
#|

cat(params$st_art)
```

### Media

```{r}
#| label: st_media
#|

cat(params$st_media)
```

### Strategies and Options

In contrast to the other search term, this opne contains multiple sets of search terms

```{r}
#| label: sts_strategies_options
#|

cat(params$sts_strategies_options)
```

### Chapter 5

#### Vision {#sec-ch5_vision}


```{r}
#| label: s_1_ch5_vision
#|

cat(params$s_1_ch5_vision)
```


#### Case {#sec-case}


```{r}
#| label: s_1_case
#|

cat(params$s_1_case)
```

#### Vision & Case {#sec-ch5_vision_case}


## Topics

OpenAlex assigns topics to each work in a hirarchical manner:

![](images/topic_hirarchy.png)

Please see [here](https://help.openalex.org/how-it-works/topics) for more information and [here](https://docs.google.com/spreadsheets/d/1v-MAq64x4YjhO7RWcB-yrKV5D_2vOOsxl4u6GBKEXY8/edit#gid=983250122) for a complete list of all topics and their corresponding subfields, fields and domains.


# Methods

## Get and calculate Data from OpenAlex
These data is gathered from OpenAlex directly, not using the downloaded TCA Corpus. The data is used to assess the quality of the TCA Corpus.

```{r}
#| label: get_search_term_hits
#|

fn <- file.path("tca_corpus", "data", "search_term_hits.rds")
if (!file.exists(fn)) {
    s_t <- grep("s_1_", names(params), value = TRUE)
    search_term_hits <- parallel::mclapply(
        s_t,
        function(stn) {
            message("getting '", stn, "' ...")
            if (grepl("_f_", stn)) {
                search <- params[[stn]]()
            } else {
                search <- params[[stn]]
            }
            search <- compact(search)
            openalexR::oa_query(filter = list(title_and_abstract.search = search)) |>
                openalexR::oa_request(count_only = TRUE, verbose = TRUE) |>
                unlist()
        },
        mc.cores = params$mc.cores,
        mc.preschedule = FALSE
    ) |>
        do.call(what = cbind) |>
        t() |>
        as.data.frame() |>
        dplyr::mutate(page = NULL, per_page = NULL) |>
        dplyr::mutate(count = formatC(count, format = "f", big.mark = ",", digits = 0))

    rownames(search_term_hits) <- s_t |>
        gsub(pattern = "s_1_", replacement = "") |>
        gsub(pattern = "f_", replacement = "") |>
        gsub(pattern = "^ch", replacement = "Chapter ") |>
        gsub(pattern = "_", replacement = " ")

    saveRDS(search_term_hits, file = fn)
} else {
    search_term_hits <- readRDS(fn)
}
```


```{r}
#| label: get_additional_search_hits
#|

fn <- file.path(".", "tca_corpus", "data", "additional_search.rds")
if (!file.exists(fn)) {
    # oa
    st <- params$s_1_tca_corpus |>
        compact()

    count_all <- openalexR::oa_fetch(title_and_abstract.search = st, count_only = TRUE, verbose = TRUE, output = "list")$count

    # art
    st <- paste0("(", params$s_1_tca_corpus, ") AND (", params$st_art, ")") |>
        compact()
    count <- openalexR::oa_fetch(title_and_abstract.search = st, count_only = TRUE, verbose = TRUE, output = "list")$count

    result <- data.frame(
        Category = "Art",
        Count = count,
        Proportion = count / count_all,
        Timestamp = Sys.time()
    )

    # media
    st <- paste0("(", params$s_1_tca_corpus, ") AND (", params$st_media, ")") |>
        compact()

    count <- openalexR::oa_fetch(title_and_abstract.search = st, count_only = TRUE, verbose = TRUE, output = "list")$count

    result <- rbind(
        result,
        data.frame(
            Category = "Media",
            Count = count,
            Proportion = count / count_all,
            Timestamp = Sys.time()
        )
    )

    # imaginative
    st <- paste0("(", params$s_1_tca_corpus, ") AND (", params$st_imaginative, ")") |>
        compact()

    count <- openalexR::oa_fetch(title_and_abstract.search = st, count_only = TRUE, verbose = TRUE, output = "list")$count

    result <- rbind(
        result,
        data.frame(
            Category = "Imaginative",
            Count = count,
            Proportion = count / count_all,
            Timestamp = Sys.time()
        )
    )

    # Rights of Nature
    st <- paste0("(", params$s_1_tca_corpus, ") AND (", params$st_rights_of_nature, ")") |>
        compact()

    count <- openalexR::oa_fetch(title_and_abstract.search = st, count_only = TRUE, verbose = TRUE, output = "list")$count

    result <- rbind(
        result,
        data.frame(
            Category = "Rights of Nature",
            Count = count,
            Proportion = count / count_all,
            Timestamp = Sys.time()
        )
    )

    # Initiative
    st <- paste0("(", params$s_1_tca_corpus, ") AND (", params$st_initiative, ")") |>
        compact()

    count <- openalexR::oa_fetch(title_and_abstract.search = st, count_only = TRUE, verbose = TRUE, output = "list")$count

    result <- rbind(
        result,
        data.frame(
            Category = "Initiative",
            Count = count,
            Proportion = count / count_all,
            Timestamp = Sys.time()
        )
    )

    # Creativity
    st <- paste0("(", params$s_1_tca_corpus, ") AND (", params$st_creativity,  ")") |>
        compact()

    count <- openalexR::oa_fetch(title_and_abstract.search = st, count_only = TRUE, verbose = TRUE, output = "list")$count

    result <- rbind(
        result,
        data.frame(
            Category = "Creativity",
            Count = count,
            Proportion = count / count_all,
            Timestamp = Sys.time()
        )
    )

    # save it
    saveRDS(result, file = fn)
}
```


```{r}
#| label: get_key_papers
#|

fn <- file.path(".", "tca_corpus", "data", "key_papers.rds")
if (!file.exists(fn)) {
    key_papers <- lapply(
        params$key_papers,
        function(fn) {
            message("Processing '", fn, "' ...")
            sapply(
                fn,
                function(x) {
                    read.csv(x) |>
                        select(DOI)
                }
            ) |>
                unlist()
        }
    )
    names(key_papers) <- gsub("\\.csv", "", basename(params$key_papers))


    key_papers <- list(
        Ch_1 = unlist(key_papers[grepl("Ch 1 -", names(key_papers))], recursive = FALSE) |> as.vector(),
        Ch_2 = unlist(key_papers[grepl("Ch 2 -", names(key_papers))], recursive = FALSE) |> as.vector(),
        Ch_3_Cl_1 = unlist(key_papers[grepl("Ch 3 - Cl1", names(key_papers))], recursive = FALSE) |> as.vector(),
        Ch_3_Cl_3 = unlist(key_papers[grepl("Ch 3 - Cl3", names(key_papers))], recursive = FALSE) |> as.vector(),
        Ch_3_Cl_4 = unlist(key_papers[grepl("Ch 3 - Cl4", names(key_papers))], recursive = FALSE) |> as.vector(),
        Ch_3_Cl_5 = unlist(key_papers[grepl("Ch 3 - Cl5", names(key_papers))], recursive = FALSE) |> as.vector(),
        Ch_3_Cl_6 = unlist(key_papers[grepl("Ch 3 - Cl6", names(key_papers))], recursive = FALSE) |> as.vector(),
        Ch_3 = unlist(key_papers[grepl("Ch 3 - p", names(key_papers))], recursive = FALSE) |> as.vector(),
        Ch_4_Cl_1 = unlist(key_papers[grepl("Ch 4 - Challenge 1", names(key_papers))], recursive = FALSE) |> as.vector(),
        Ch_4_Cl_2 = unlist(key_papers[grepl("Ch 4 - Challenge 2", names(key_papers))], recursive = FALSE) |> as.vector(),
        Ch_4_Cl_3 = unlist(key_papers[grepl("Ch 4 - Challenge 3", names(key_papers))], recursive = FALSE) |> as.vector(),
        Ch_4_Cl_4 = unlist(key_papers[grepl("Ch 4 - Challenge 4", names(key_papers))], recursive = FALSE) |> as.vector(),
        Ch_4_Cl_5 = unlist(key_papers[grepl("Ch 4 - Challenge 5", names(key_papers))], recursive = FALSE) |> as.vector(),
        Ch_5 = unlist(key_papers[grepl("Ch 5 -", names(key_papers))], recursive = FALSE) |> as.vector()
    )

    saveRDS(key_papers, file = fn)
} else {
    key_papers <- readRDS(fn)
}
```


```{r}
#| label: get_key_works
#|

fn_kw <- file.path(".", "tca_corpus", "data", "key_works.rds")
fn_kw_df <- file.path(".", "tca_corpus", "data", "key_works_df.rds")
if (!all(file.exists(fn_kw, fn_kw_df))) {
    key_works <- parallel::mclapply(
        key_papers,
        function(kp) {
            dois <- kp[kp != ""] |>
                unlist() |>
                tolower() |>
                unique()

            openalexR::oa_fetch(doi = dois, output = "list")
        },
        mc.cores = params$mc.cores,
        mc.preschedule = FALSE
    )

    found <- sapply(
        key_works,
        function(x) {
            length(x) > 0
        }
    )

    key_works <- key_works[found]

    print("The following key paper sets were excluded as they contained no papers in OpenAlex:\n")
    print(names(found)[!found])

    saveRDS(key_works, file = fn_kw)

    key_works_df <- lapply(
        key_works,
        oa2df,
        entity = "works"
    )

    saveRDS(key_works_df, fn_kw_df)
} else {
    key_works <- readRDS(file = fn_kw)
    key_works_df <- readRDS(fn_kw_df)
}
```


```{r}
#| label: get_key_works_hits
#|

fn <- file.path(".", "tca_corpus", "data", "key_works_hits.rds")
if (!file.exists(fn)) {
    kws <- key_works_df
    kws$all <- key_works_df |>
        bind_rows()

    nms <- names(kws)

    key_works_hits <- pbapply::pblapply(
        nms,
        function(nm) {
            message("Getting key paper set for ", nm, " ...")
            dois <- kws[[nm]] |>
                select(doi) |>
                distinct() |>
                unlist() |>
                unique() |>
                tolower()

            s_t <- grep("s_1_", names(params), value = TRUE)
            kw_h <- parallel::mclapply(
                s_t,
                function(stn) {
                    message("  getting '", stn, "' ...")
                    if (grepl("_f_", stn)) {
                        search <- compact(params[[stn]]())
                    } else {
                        search <- compact(params[[stn]])
                    }
                    get_count(dois = dois, list(title_and_abstract.search = search), verbose = FALSE)
                },
                mc.cores = params$mc.cores,
                mc.preschedule = FALSE
            ) |>
                do.call(what = cbind) |>
                as.data.frame()
            message("Done")

            names(kw_h) <- s_t

            # if (ncol(kw_h) == 1){
            #     kw_h <- t(kw_h)
            #     rownames(kw_h) <- dois
            # }

            kw_h <- rbind(
                kw_h,
                colSums(kw_h)
            )

            rownames(kw_h)[[nrow(kw_h)]] <- "Total"
            return(kw_h)
        }
    )

    names(key_works_hits) <- nms

    for (i in nms) {
        # key_works_hits[[i]] <- cbind(
        #     key_works_hits[[i]],
        #     key_works_hits_tca_filtered[[i]]
        # )

        key_works_hits[[i]] <- cbind(
            key_works_hits[[i]],
            Total = rowSums(key_works_hits[[i]])
        ) |>
            mutate(Total = Total - 1) # |>
        # relocate(tca_corpus_SDG, .after = s_1_tca_corpus)
    }

    ###

    saveRDS(key_works_hits, file = fn)
} else {
    key_works_hits <- readRDS(file = fn)
}
```


```{r}
#| label: get_languages_tca_corpus
#|

fn <- file.path(".", "tca_corpus", "data", "tca_corpus_languages.rds")

if (!file.exists(fn)) {
    list(
        timestamp <- Sys.time(),
        languages = openalexR::oa_fetch(
            entity = "works",
            title_and_abstract.search = compact(params$s_1_tca_corpus),
            group_by = "language",
            output = "list",
            verbose = TRUE
        ) |>
            lapply(
                function(x) {
                    result <- data.frame(
                        language = ifelse(
                            is.null(x["key_display_name"]),
                            as.character(NA),
                            as.character(x["key_display_name"])
                        ),
                        count = ifelse(
                            is.null(x["count"]),
                            as.integer(NA),
                            as.integer(x["count"])
                        )
                    )
                    return(result)
                }
            ) |>
            do.call(what = rbind) |>
            dplyr::arrange(desc(count)) |>
            dplyr::filter(count > 0)
    ) |>
        saveRDS(file = fn)
}

```


### Works over Time

Get works over time for different search terms

```{r}
#| label: get_count_years
#|

fn <- file.path(".", "tca_corpus", "data", "oa_count.rds")
if (!file.exists(fn)) {
    oa_count <- list(
        timestamp = Sys.time()
    )
    #
    message("OpenAlex ...")
    oa_count$oa_years <- openalexR::oa_fetch(
        entity = "works",
        search = "",
        group_by = "publication_year",
        output = "dataframe",
        verbose = TRUE
    ) |>
        dplyr::mutate(
            publication_year = as.integer(as.character(key_display_name)),
            key = NULL,
            key_display_name = NULL,
            p = count / sum(count)
        ) |>
        dplyr::arrange(publication_year) |>
        dplyr::mutate(
            p_cum = cumsum(p)
        ) |>
        dplyr::select(
            publication_year,
            everything()
        )
    #
    message("NATURE ...")
    oa_count$tca_nature <- openalexR::oa_fetch(
        title_and_abstract.search = compact(paste0("(", params$s_1_nature_environment, ")")),
        group_by = "publication_year",
        output = "dataframe",
        verbose = TRUE
    ) |>
        dplyr::mutate(
            publication_year = as.integer(as.character(key_display_name)),
            key = NULL,
            key_display_name = NULL,
            p = count / sum(count)
        ) |>
        dplyr::arrange(publication_year) |>
        dplyr::mutate(
            p_cum = cumsum(p)
        ) |>
        dplyr::select(
            publication_year,
            everything()
        )
    #
    message("TransformatveChange ...")
    oa_count$transformative_change_years <- openalexR::oa_fetch(
        title_and_abstract.search = compact(paste0("(", params$s_1_transformative_change, ")")),
        group_by = "publication_year",
        output = "dataframe",
        verbose = TRUE
    ) |>
        dplyr::mutate(
            publication_year = as.integer(as.character(key_display_name)),
            key = NULL,
            key_display_name = NULL,
            p = count / sum(count)
        ) |>
        dplyr::arrange(publication_year) |>
        dplyr::mutate(
            p_cum = cumsum(p)
        ) |>
        dplyr::select(
            publication_year,
            everything()
        )
    #
    message("TCA ...")
    oa_count$tca_years <- openalexR::oa_fetch(
        title_and_abstract.search = compact(paste0("(", params$s_1_tca_corpus, ")")),
        group_by = "publication_year",
        output = "dataframe",
        verbose = TRUE
    ) |>
        dplyr::mutate(
            publication_year = as.integer(as.character(key_display_name)),
            key = NULL,
            key_display_name = NULL,
            p = count / sum(count)
        ) |>
        dplyr::arrange(publication_year) |>
        dplyr::mutate(
            p_cum = cumsum(p)
        ) |>
        dplyr::select(
            publication_year,
            everything()
        )
    #
    message("CASE ...")
    oa_count$case_years <- openalexR::oa_fetch(
        title_and_abstract.search = compact(paste0("(", params$s_1_case, ")")),
        group_by = "publication_year",
        output = "dataframe",
        verbose = TRUE
    ) |>
        dplyr::mutate(
            publication_year = as.integer(as.character(key_display_name)),
            key = NULL,
            key_display_name = NULL,
            p = count / sum(count)
        ) |>
        dplyr::arrange(publication_year) |>
        dplyr::mutate(
            p_cum = cumsum(p)
        ) |>
        dplyr::select(
            publication_year,
            everything()
        )
    #
    saveRDS(oa_count, file = fn)
}
```


## Download TCA Corpus

The corpus download will be stored in `data/pages` and the arrow database in `data/corpus`.

This is not on github!

The corpus can be read by running `get_corpus()` which o[pens the database so that then it can be fed into a `dplyr` pipeline. After most `dplyr` functions, the actual data needs to be collected via `collect()`.

Only then is the actual data read!

Needs to be enabled by setting `eval: true` in the code block below.


```{r}
#| label: download_tca_corpus
#| eval: false
#|

tic()

IPBES.R::corpus_download(
    pages_dir = params$pages_dir,
    title_and_abstract_search = compact(params$s_1_tca_corpus),
    continue = TRUE,
    delete_pages_dir = FALSE,
    set_size = 2000,
    dry_run = FALSE,
    verbose = TRUE,
    mc_cores = 6
)

toc()
```

```{r}
#| label: convert_tca_corpus_arrow
#| eval: false

tic()

IPBES.R::corpus_pages_to_arrow(
    pages_dir = params$pages_dir,
    arrow_dir = params$corpus_dir,
    continue = TRUE,
    delete_arrow_dir = FALSE,
    dry_run = FALSE,
    verbose = TRUE,
    mc_cores = 2
)

toc()
```

```{r}
#| label: fix_duplicate_ids_TEMPORARY
#| eval: false
#|

years <- IPBES.R::corpus_read(params$corpus_dir) |>
    distinct(publication_year) |>
    collect() |>
    unlist() |>
    as.vector() |>
    sort()

lapply(
    years,
    function(y) {
        message("\nProcessing year: ", y)
        tic()
        dataset <- IPBES.R::corpus_read(params$corpus_dir) |>
            dplyr::filter(publication_year == y) |>
            dplyr::collect() |>
            group_by(id) |>
            slice_max(
                publication_year,
                n = 1,
                with_ties = FALSE,
                na_rm = TRUE
            )
        unlink(
            file.path(params$corpus_dir, paste0("publication_year=", y)),
            recursive = TRUE,
            force = TRUE
        )
        arrow::write_dataset(
            dataset = dataset,
            path = params$corpus_dir,
            partitioning = c("publication_year", "set"),
            format = "parquet",
            existing_data_behavior = "overwrite"
        )
        toc()
    }
)
```

## Download TCA AND CASE Corpus

```{r}
#| label: download_tca_cases_corpus
#| eval: false
#|

tic()

IPBES.R::corpus_download(
    pages_dir = params$pages_cases_dir,
    title_and_abstract_search = compact(params$s_1_case),
    continue = TRUE,
    delete_pages_dir = FALSE,
    set_size = 1999,
    dry_run = FALSE,
    verbose = TRUE,
    mc_cores = 8
)

toc()
```

```{r}
#| label: convert_tca_cases_corpus_arrow
#| eval: false

tic()

IPBES.R::corpus_pages_to_arrow(
    pages_dir = params$pages_cases_dir,
    arrow_dir = params$corpus_cases_dir,
    continue = TRUE,
    delete_arrow_dir = FALSE,
    dry_run = FALSE,
    verbose = TRUE,
    mc_cores = 2
)

toc()
```

```{r}
#| label: fix_cases_duplicate_ids_TEMPORARY
#| eval: false
#|

years <- IPBES.R::corpus_read(params$corpus_cases_dir) |>
    distinct(publication_year) |>
    collect() |>
    unlist() |>
    as.vector() |>
    sort()

lapply(
    years,
    function(y) {
        message("\nProcessing year: ", y)
        tic()
        dataset <- IPBES.R::corpus_read(params$corpus_cases_dir) |>
            dplyr::filter(publication_year == y) |>
            dplyr::collect() |>
            group_by(id) |>
            slice_max(
                publication_year,
                n = 1,
                with_ties = FALSE,
                na_rm = TRUE
            )
        unlink(
            file.path(params$corpus_cases_dir, paste0("publication_year=", y)),
            recursive = TRUE,
            force = TRUE
        )
        arrow::write_dataset(
            dataset = dataset,
            path = params$corpus_cases_dir,
            partitioning = c("publication_year", "set"),
            format = "parquet",
            existing_data_behavior = "overwrite"
        )
        toc()
    }
)

```

## Export 250 Random Works from Cases Corpus

```{r}
#| label: export_random_250_cases
#|

sample_size <- 250

fn <- file.path("tca_corpus", "data", paste0("random_", sample_size, "_tca_cases_corpus.xlsx"))
if (!file.exists(fn)) {
    set.seed(13)
    read_corpus(params$corpus_cases_dir) |>
        dplyr::select(
            id,
            doi,
            author = author_abbr,
            title = display_name,
            abstract = ab
        ) |>
        dplyr::slice_sample(
            n = sample_size
        ) |>
        dplyr::mutate(
            abstract = substr(abstract, 1, 5000)
        ) |>
        dplyr::collect() |>
        writexl::write_xlsx(path = fn)
}
```


## Prepare Full Text search of Title and Abstract

This is done using duckDB and the [fts extension](https://duckdb.org/docs/sql/extensions/fts) which is a full text search extension for duckDB (see also [for details](https://duckdb.org/docs/extensions/full_text_search.html) and for [arrow / parquet support](https://duckdb.org/docs/data/parquet/overview.html)).

The following steps are conducted:

1. Create new duckDB called `tca_corpus.duckdb` 
    - import data needed
    - create `fts` index for full text search

```{r}
#| label: create_duckdb

if (!file.exists(params$duckdb_fn)) {
    sql <- paste0(
        "CREATE TABLE tca_corpus AS SELECT id, author_abbr, publication_year, doi, display_name, ab FROM parquet_scan('",
        file.path(".", "tca_corpus", "data", "corpus", "**", "*.parquet"),
        "')"
    )

    con <- duckdb::dbConnect(duckdb::duckdb(), dbdir = params$duckdb_fn, read_only = FALSE)
    #
    dbExecute(con, "SET autoinstall_known_extensions=1")
    dbExecute(con, "SET autoload_known_extensions=1")
    dbExecute(con, sql)
    #
    duckdb::dbDisconnect(con, shutdown = TRUE)

    con <- duckdb::dbConnect(duckdb::duckdb(), dbdir = params$duckdb_fn, read_only = FALSE)
    #
    dbExecute(con, "INSTALL fts")
    dbExecute(con, "LOAD fts")

    input_table <- "tca_corpus"
    input_id <- "id"
    input_values <- "'display_name', 'ab'"

    sql <- paste0("PRAGMA create_fts_index(", input_table, ", ", input_id, ", ", input_values, ", overwrite=1);")

    dbExecute(con, sql)
    #
    duckdb::dbDisconnect(con, shutdown = TRUE)
}

# con <- dbConnect(duckdb::duckdb(params$duckdb_fn))

# SQL <- "SELECT * FROM tca_corpus WHERE display_name MATCH 'transformative';"
# dbListTables(con)

#     input_table <- "tca_corpus"
#     input_id <- "id"
#     input_values <- "'display_name', 'ab'"

#     query_string <- "'case study'"
#     fields <- "'display_name', 'ab'"

# sql <- paste0("SELECT fts_main_tca_corpus.match_bm25(", input_id, ", ", query_string, ", fields = ", fields, " FROM tca_corpus)"

# dbExecute(con, sql)

# duckdb::dbDisconnect(con, shutdown = TRUE)


```


## Extract Data from Global Corpus


### Export Random Works from TCA Cases Corpus

```{r}
#| label: export_random_250_tca
#|

sample_size <- 250

fn <- file.path("tca_corpus", "data", paste0("random_", sample_size, "_tca_cases_corpus.xlsx"))
if (!file.exists(fn)) {
    set.seed(13)
    read_corpus(params$corpus_cases_dir) |>
        dplyr::select(
            id,
            doi,
            author = author_abbr,
            title = display_name,
            abstract = ab
        ) |>
        dplyr::slice_sample(
            n = sample_size
        ) |>
        dplyr::mutate(
            abstract = substr(abstract, 1, 5000)
        ) |>
        dplyr::collect() |>
        writexl::write_xlsx(path = fn)
}
```


### Sectors

The Sectors definition is based on the subfields assigned to each work by OpenAlex. 
These were grouped by experts into sectors. See [this Google Doc](https://docs.google.com/spreadsheets/d/1ZCB_St2TQu_wL3yl1iN7Wxz5FERGxk-GZA0_2oGcRM0/edit#gid=210658071) for details.

```{r}
#| label: sectors_tca_corpus_data
#|

if (!dir.exists(params$corpus_topics_dir)) {
    con <- duckdb::dbConnect(duckdb::duckdb(), read_only = FALSE)

    corpus_read(params$corpus_dir) |>
        arrow::to_duckdb(table_name = "corpus", con = con) |>
        invisible()
    corpus_read(file.path("tca_corpus", "input", "sectors_def.parquet")) |>
        arrow::to_duckdb(table_name = "sectors", con = con) |>
        invisible()


    paste0(
        "CREATE VIEW corpus_unnest AS ",
        "SELECT  ",
        "corpus.id AS work_id,  ",
        "corpus.publication_year AS publication_year,  ",
        "UNNEST(topics).i AS i,  ",
        "UNNEST(topics).score AS score,  ",
        "UNNEST(topics).name AS name, ",
        "UNNEST(topics).id AS id,  ",
        "UNNEST(topics).display_name AS display_name  ",
        "FROM  ",
        "corpus "
    ) |>
        dbExecute(conn = con)

    select_sql <- paste0(
        "SELECT ",
        "corpus_unnest.*, ",
        "sectors.sector ",
        "FROM ",
        "corpus_unnest ",
        "LEFT JOIN ",
        "sectors ",
        "ON ",
        "corpus_unnest.id  == sectors.id "
    )

    dbGetQuery(con, paste(select_sql, "LIMIT 10"))

    sql <- paste0(
        "COPY ( ",
        select_sql,
        ") TO '", params$corpus_topics_dir, "' ",
        "(FORMAT PARQUET, COMPRESSION 'SNAPPY', PARTITION_BY 'publication_year')"
    )

    dbExecute(con, sql)

    duckdb::dbDisconnect(con, shutdown = TRUE)

    ###########################

    # years <- IPBES.R::corpus_read(params$corpus_dir) |>
    #     distinct(publication_year) |>
    #     collect() |>
    #     unlist() |>
    #     as.vector() |>
    #     sort()

    # sectors <- read.csv(file.path("tca_corpus", "input", "sectors_def.csv")) |>
    #     tibble::as_tibble() |>
    #     dplyr::mutate(
    #         id = paste0("https://openalex.org/subfields/", id),
    #         display_name = NULL
    #     )

    # pbmcapply::pbmclapply(
    #     years,
    #     function(y) {
    #         message("\nProcessing year: ", y)
    #         IPBES.R::corpus_read(params$corpus_dir) |>
    #             dplyr::filter(publication_year == y) |>
    #             dplyr::select(
    #                 id,
    #                 publication_year,
    #                 topics
    #             ) |>
    #             collect() |>
    #             IPBES.R::extract_topics(
    #                 names = "subfield"
    #             ) |>
    #             dplyr::left_join(
    #                 y = sectors,
    #                 by = "id"
    #             ) |>
    #             arrow::write_dataset(
    #                 path = params$corpus_topics_dir,
    #                 partitioning = c("publication_year"),
    #                 format = "parquet",
    #                 existing_data_behavior = "overwrite"
    #             )
    #     },
    #     mc.cores = 3,
    #     mc.preschedule = FALSE
    # )
}
```

### Authors

```{r}
#| label: authors_tca_corpus_data
#|

if (!dir.exists(params$corpus_authors_dir)) {
    con <- duckdb::dbConnect(duckdb::duckdb(), read_only = FALSE)

    corpus_read(params$corpus_dir) |>
        arrow::to_duckdb(table_name = "corpus", con = con) |>
        invisible()


    paste0(
        "CREATE VIEW corpus_unnest AS ",
        "SELECT  ",
        "corpus.id AS work_id,  ",
        "corpus.publication_year AS publication_year,  ",
        "UNNEST(author).au_id AS au_id,  ",
        "UNNEST(author).au_display_name AS au_display_name, ",
        "UNNEST(author).au_orcid AS au_orcid,  ",
        "UNNEST(author).author_position AS author_position,  ",
        "UNNEST(author).is_corresponding AS is_corresponding,  ",
        "UNNEST(author).au_affiliation_raw AS au_affiliation_raw,  ",
        "UNNEST(author).institution_id AS institution_id,  ",
        "UNNEST(author).institution_display_name AS institution_display_name,  ",
        "UNNEST(author).institution_ror AS institution_ror,  ",
        "UNNEST(author).institution_country_code AS institution_country_code,  ",
        "UNNEST(author).institution_type AS institution_type,  ",
        "UNNEST(author).institution_lineage AS institution_lineage  ",
        "FROM  ",
        "corpus "
    ) |> dbExecute(conn = con)

    paste0(
        "COPY ( ",
        "SELECT * FROM corpus_unnest ",
        ") TO '", params$corpus_authors_dir, "' ",
        "(FORMAT PARQUET, COMPRESSION 'SNAPPY', PARTITION_BY 'publication_year')"
    ) |>
        dbExecute(conn = con)

    duckdb::dbDisconnect(con, shutdown = TRUE)
}
```


### Primary Topics

```{r}
#| label: get_prim_topic_tca_corpus
#| eval: true

fn <- file.path(".", "tca_corpus", "data", paste0("prim_topics_tca_corpus.rds"))
if (!file.exists(fn)) {
    prim_topics_tca_corpus <- corpus_read(params$corpus_topics_dir) |>
        dplyr::filter(
            name == "topic",
            i == 1
        ) |>
        mutate(
            id = as.integer(sub("https://openalex.org/T", "", id))
        ) |>
        dplyr::group_by(id) |>
        summarize(
            count = n()
        ) |>
        dplyr::left_join(
            read.csv(file.path("tca_corpus", "input", "OpenAlex_topic_mapping_table - final_topic_field_subfield_table.csv")),
            by = c("id" = "topic_id")
        ) |>
        dplyr::arrange(desc(count)) |>
        collect()

    saveRDS(prim_topics_tca_corpus, file = fn)
} else {
    prim_topics_tca_corpus <- readRDS(fn)
}
```

## Figures

```{r}
#| label: publications_over_time_data
#|

fn <- file.path(".", "tca_corpus", "data", "publications_over_time_tca_corpus.rds")

if (!file.exists(fn)) {
    read_corpus(params$corpus_dir) |>
        dplyr::select(publication_year) |>
        dplyr::arrange(publication_year) |>
        dplyr::collect() |>
        table() |>
        as.data.frame() |>
        mutate(
            publication_year = as.integer(as.character(publication_year)),
            p = Freq / sum(Freq),
            p_cum = cumsum(p)
        ) |>
        rename(
            count = Freq
        ) |>
        dplyr::inner_join(
            y = openalexR::oa_fetch(
                search = "",
                group_by = "publication_year",
                output = "tibble",
                verbose = FALSE
            ) |>
                dplyr::select(
                    key,
                    count
                ) |>
                dplyr::rename(
                    publication_year = key,
                    count_oa = count
                ) |>
                dplyr::arrange(publication_year) |>
                dplyr::mutate(
                    publication_year = as.integer(as.character(publication_year)),
                    p_oa = count_oa / sum(count_oa),
                    p_oa_cum = cumsum(p_oa)
                )
        ) |>
        saveRDS(file = fn)
}
```

```{r}
#| label: publications_over_time_plot
#|

if (length(list.files(file.path("tca_corpus", "figures"), pattern = "publications_over_time")) < 2) {
    figure <- readRDS(file.path(".", "tca_corpus", "data", "publications_over_time_tca_corpus.rds")) |>
        dplyr::filter(publication_year >= 1900) |>
        ggplot() +
        geom_bar(aes(x = publication_year, y = p), stat = "identity") +
        geom_line(aes(x = publication_year, y = p_cum / 10), color = "red") +
        geom_line(aes(x = publication_year, y = p_oa_cum / 10), color = "blue") +
        scale_x_continuous(breaks = seq(1900, 2020, 10)) +
        scale_y_continuous(
            "Proportion of publications",
            sec.axis = sec_axis(~ . * 10, name = "Cumulative proportion") # divide by 100 to scale back the secondary axis
        ) +
        labs(
            title = "Publications over time",
            x = "Year",
            y = "Number of publications"
        ) +
        theme_minimal() +
        theme(axis.text.y.right = element_text(color = "red"))


    ggplot2::ggsave(
        file.path("tca_corpus", "figures", "publications_over_time.pdf"),
        width = 12,
        height = 6,
        figure
    )
    ggplot2::ggsave(
        file.path("tca_corpus", "figures", "publications_over_time.png"),
        width = 12,
        height = 6,
        figure
    )

    rm(figure)
}
```

## Maps


```{r}
#| label: countries_tca_corpus_data
#|

fn <- file.path(".", "tca_corpus", "data", "countries_tca_corpus.rds")
if (!file.exists(fn)) {
    corpus <- corpus_read(params$corpus_authors_dir)

    data_first <- corpus |>
        dplyr::filter(
            author_position == "first"
        ) |>
        dplyr::select(
            work_id,
            institution_country_code,
        ) |>
        dplyr::group_by(
            work_id,
            institution_country_code
        ) |>
        dplyr::summarise(
            count_first = 1 / n(),
            .groups = "drop"
        ) |>
        dplyr::group_by(
            institution_country_code
        ) |>
        dplyr::summarise(
            count = sum(count_first),
            .groups = "drop"
        ) |>
        dplyr::mutate(
            position = "first"
        )

    data_all <- corpus |>
        dplyr::select(
            work_id,
        ) |>
        dplyr::group_by(
            work_id,
        ) |>
        dplyr::summarize(
            count = n()
        ) |>
        dplyr::right_join(
            y = corpus |>
                dplyr::select(
                    work_id,
                    institution_country_code
                ),
            by = "work_id"
        ) |>
        dplyr::group_by(
            institution_country_code
        ) |>
        dplyr::summarise(
            count = sum(count),
            .groups = "drop"
        ) |>
        dplyr::mutate(
            position = "all"
        )


    data_oa <- openalexR::oa_fetch(
        group_by = "authorships.countries",
        output = "tibble",
        verbose = FALSE
    ) |>
        dplyr::mutate(
            iso3c = countrycode::countrycode(
                key_display_name,
                origin = "country.name",
                destination = "iso3c"
            ),
            key_display_name = NULL,
            key = NULL,
            position = "oa"
        )

    data <- dplyr::add_row(
        collect(data_first),
        collect(data_all)
    ) |>
        dplyr::mutate(
            iso3c = countrycode::countrycode(
                institution_country_code,
                origin = "iso2c",
                destination = "iso3c"
            ),
            institution_country_code = NULL
        ) |>
        dplyr::add_row(
            data_oa
        ) |>
        saveRDS(file = fn)
    rm(data_first, data_all, data_oa)
}
```

Some check of the data

```{r}
#| label: countries_tca_corpus_check
#| eval: false
#| echo: false
#|

data <- readRDS(file.path(".", "tca_corpus", "data", "countries_tca_corpus.rds"))
data <- data |>
    filter(!is.na(iso3c)) |>
    tidyr::pivot_wider(names_from = position, values_from = count) |>
    dplyr::mutate(
        all_first = all / first,
        oa_all = oa / all
    )
```


```{r}
#| label: countries_tca_corpus_map
#|

if (length(list.files(path = file.path("tca_corpus", "maps"), pattern = "publications_countries")) < 2) {
    data <- readRDS(file.path(".", "tca_corpus", "data", "countries_tca_corpus.rds")) |>
        dplyr::group_by(iso3c) |>
        dplyr::summarize(
            count_first = sum(count[position == "first"], na.rm = TRUE),
            count_all = sum(count[position == "all"], na.rm = TRUE),
            count_oa = sum(count[position == "oa"], na.rm = TRUE)
        ) |>
        dplyr::mutate(
            count_oa = ifelse(is.na(count_oa), 0, count_oa),
            log_count_oa = log(count_oa + 1),
            p_oa = count_oa / sum(count_oa),
            #
            count_first = ifelse(is.na(count_first), 0, count_first),
            log_count_first = log(count_first + 1),
            p_first = count_first / sum(count_first),
            p_first_output = count_first / count_oa,
            p_first_diff = (p_oa - p_first) * 100,
            #
            count_all = ifelse(is.na(count_all), 0, count_all),
            log_count_all = log(count_all + 1),
            p_all = count_all / sum(count_all),
            p_all_output = count_all / count_oa,
            p_all_diff = (p_oa - p_all) * 100,
        )


    # data |> mutate(
    #     count_first = count_first / max(count_first),
    #     count_all = count_all / max(count_all),
    #     count_oa = count_oa / max(count_oa)
    # ) |>
    # dplyr::arrange(desc(count_oa)) |>
    # ggplot(aes(x = iso3c)) +
    #     geom_line(aes(y = count_first, color = "Count First"), group = 1) +
    #     geom_line(aes(y = count_all, color = "Count All"), group = 1) +
    #     geom_line(aes(y = count_oa, color = "Count OA"), group = 1) +
    #     scale_color_manual(values = c("Count First" = "red", "Count All" = "blue", "Count OA" = "green")) +
    #     labs(x = "ISO3C", y = "Normalized Count") +
    #     theme_minimal()

    map <- patchwork::wrap_plots(
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "count_oa",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("count of overall publications (count_oa)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "count_first",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("count of TCA publications (count_first)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "count_all",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("count of TCA publications (count_all)"),
        ####
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "log_count_oa",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("log(count + 1) of overall publications (log_count_oa)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "log_count_first",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("log(count + 1) of TCA publications (log_count_first)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "log_count_all",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("log(count + 1) of TCA publications (log_count_all)"),
        ####
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "p_oa",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("Overall research output (p_oa)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "p_first",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("TCA research output (p_first)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "p_all",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("TCA research output (p_all)"),
        ####
        ggplot() +
            theme_void(),
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "p_first_diff",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", mid = "white", high = "#56B4E9", midpoint = 0) +
            ggplot2::ggtitle("difference (TCA - overall) output (p_oa - p_first)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "p_all_diff",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", mid = "white", high = "#56B4E9", midpoint = 0) +
            ggplot2::ggtitle("difference (TCA - overall) output (p_oa - p_all)"),
        ncol = 3
    )

    ggplot2::ggsave(
        file.path("tca_corpus", "maps", "publications_countries.pdf"),
        width = 12,
        height = 8,
        map
    )
    ggplot2::ggsave(
        file.path("tca_corpus", "maps", "publications_countries.png"),
        width = 12,
        height = 8,
        map
    )
}
```


```{r}
#| label: xx_countries_tca_corpus_before_2016_map
#| eval: false

if (length(list.files(path = file.path("tca_corpus", "maps"), pattern = "publications_countries_before_2016")) < 2) {
    data <- readRDS(file.path(".", "tca_corpus", "data", "countries_tca_corpus.rds")) |>
        dplyr::filter(
            publication_year < 2016
        ) |>
        dplyr::group_by(iso3c) |>
        dplyr::summarize(
            count_first = sum(as.integer(count_first), na.rm = TRUE),
            count_all = sum(as.integer(count_all), na.rm = TRUE),
            count_oa = sum(as.integer(count_oa), na.rm = TRUE),
        ) |>
        dplyr::mutate(
            count_oa = ifelse(is.na(count_oa), 0, count_oa),
            log_count_oa = log(count_oa + 1),
            p_oa = count_oa / sum(count_oa),
            #
            count_first = ifelse(is.na(count_first), 0, count_first),
            log_count_first = log(count_first + 1),
            p_first = count_first / sum(count_first),
            p_first_output = count_first / count_oa,
            p_first_diff = (p_oa - p_first) * 100,
            #
            count_all = ifelse(is.na(count_all), 0, count_all),
            log_count_all = log(count_all + 1),
            p_all = count_all / sum(count_all),
            p_all_output = count_all / count_oa,
            p_all_diff = (p_oa - p_all) * 100,
        )

    map <- patchwork::wrap_plots(
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "count_oa",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("count of overall publications (count_oa)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "count_first",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("count of TCA publications (count_first)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "count_all",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("count of TCA publications (count_all)"),
        ####
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "log_count_oa",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("log(count + 1) of overall publications (log_count_oa)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "log_count_first",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("log(count + 1) of TCA publications (log_count_first)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "log_count_all",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("log(count + 1) of TCA publications (log_count_all)"),
        ####
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "p_oa",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("Overall research output (p_oa)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "p_first",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("TCA research output (p_first)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "p_all",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("TCA research output (p_all)"),
        ####
        ggplot() +
            theme_void(),
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "p_first_diff",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", mid = "white", high = "#56B4E9", midpoint = 0) +
            ggplot2::ggtitle("difference (TCA - overall) output (p_oa - p_first)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "p_all_diff",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", mid = "white", high = "#56B4E9", midpoint = 0) +
            ggplot2::ggtitle("difference (TCA - overall) output (p_oa - p_all)"),
        ncol = 3
    )

    ggplot2::ggsave(
        file.path("tca_corpus", "maps", "publications_countries_before_2016.pdf"),
        width = 12,
        height = 8,
        map
    )
    ggplot2::ggsave(
        file.path("tca_corpus", "maps", "publications_countries_before_2016.png"),
        width = 12,
        height = 8,
        map
    )
}
```


```{r}
#| label: xx_countries_tca_corpus_after_2019_map
#| eval: false


if (length(list.files(path = file.path("tca_corpus", "maps"), pattern = "publications_countries_after_2019")) < 2) {
    data <- readRDS(file.path(".", "tca_corpus", "data", "countries_tca_corpus.rds")) |>
        dplyr::filter(
            publication_year > 2019
        ) |>
        dplyr::group_by(iso3c) |>
        dplyr::summarize(
            count_first = sum(as.integer(count_first), na.rm = TRUE),
            count_all = sum(as.integer(count_all), na.rm = TRUE),
            count_oa = sum(as.integer(count_oa), na.rm = TRUE),
        ) |>
        dplyr::mutate(
            count_oa = ifelse(is.na(count_oa), 0, count_oa),
            log_count_oa = log(count_oa + 1),
            p_oa = count_oa / sum(count_oa),
            #
            count_first = ifelse(is.na(count_first), 0, count_first),
            log_count_first = log(count_first + 1),
            p_first = count_first / sum(count_first),
            p_first_output = count_first / count_oa,
            p_first_diff = (p_oa - p_first) * 100,
            #
            count_all = ifelse(is.na(count_all), 0, count_all),
            log_count_all = log(count_all + 1),
            p_all = count_all / sum(count_all),
            p_all_output = count_all / count_oa,
            p_all_diff = (p_oa - p_all) * 100,
        )

    map <- patchwork::wrap_plots(
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "count_oa",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("count of overall publications (count_oa)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "count_first",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("count of TCA publications (count_first)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "count_all",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("count of TCA publications (count_all)"),
        ####
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "log_count_oa",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("log(count + 1) of overall publications (log_count_oa)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "log_count_first",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("log(count + 1) of TCA publications (log_count_first)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "log_count_all",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("log(count + 1) of TCA publications (log_count_all)"),
        ####
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "p_oa",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("Overall research output (p_oa)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "p_first",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("TCA research output (p_first)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "p_all",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", high = "#56B4E9") +
            ggplot2::ggtitle("TCA research output (p_all)"),
        ####
        ggplot() +
            theme_void(),
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "p_first_diff",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", mid = "white", high = "#56B4E9", midpoint = 0) +
            ggplot2::ggtitle("difference (TCA - overall) output (p_oa - p_first)"),
        #
        data |>
            IPBES.R::map_country_codes(
                map_type = "countries",
                values = "p_all_diff",
                geodata_path = params$gdm_dir
            ) +
            ggplot2::scale_fill_gradient2(low = "#E69F00", mid = "white", high = "#56B4E9", midpoint = 0) +
            ggplot2::ggtitle("difference (TCA - overall) output (p_oa - p_all)"),
        ncol = 3
    )

    ggplot2::ggsave(
        file.path("tca_corpus", "maps", "publications_countries_after_2019.pdf"),
        width = 12,
        height = 8,
        map
    )
    ggplot2::ggsave(
        file.path("tca_corpus", "maps", "publications_countries_after_2019.png"),
        width = 12,
        height = 8,
        map
    )
}
```

## Topics and Sectors


```{r}
#| label: sectors_over_time_data
#|

fn <- file.path("tca_corpus", "data", "sectors_over_time.rds")
if (!file.exists(fn)) {
    data <- IPBES.R::corpus_read(params$corpus_topics_dir) |>
        dplyr::filter(
            name == "subfield"
        ) |>
        dplyr::group_by(
            publication_year,
            sector,
            i
        ) |>
        dplyr::summarize(
            count = n(),
            .groups = "drop"
        ) |>
        dplyr::rename(
            level = i
        ) |>
        dplyr::collect()

    data |>
        dplyr::filter(
            level == 1
        ) |>
        dplyr::group_by(
            publication_year,
            sector
        ) |>
        dplyr::summarize(
            count_1 = sum(count),
            .groups = "drop"
        ) |>
        dplyr::full_join(
            data |>
                dplyr::group_by(
                    publication_year,
                    sector
                ) |>
                dplyr::summarize(
                    count_all = sum(count)
                )
        ) |>
        dplyr::arrange(
            publication_year,
            sector
        ) |>
        dplyr::mutate(
            count_1 = ifelse(is.na(count_1), 0, count_1),
            count_all = ifelse(is.na(count_all), 0, count_all)
        ) |>
        dplyr::group_by(sector) |>
        dplyr::mutate(
            cumsum_count_1 = cumsum(count_1),
            cumsum_count_all = cumsum(count_all),
            p_cumsum_count_1 = cumsum_count_1 / max(cumsum_count_1),
            p_cumsum_count_all = cumsum_count_all / max(cumsum_count_all)
        ) |>
        saveRDS(fn)
    rm(data)
}
```


```{r}
#| label: sectors_over_time_plot
#|

if (length(list.files(file.path("tca_corpus", "figures"), pattern = "sectors_over_time")) < 2) {
    figure_1 <- readRDS(file.path(file.path("tca_corpus", "data", "sectors_over_time.rds"))) |>
        dplyr::filter(
            publication_year >= 1950
        ) |>
        ggplot() +
        geom_line(
            aes(
                x = publication_year,
                y = cumsum_count_1,
                color = sector,
                lty = sector
            )
        ) +
        scale_x_continuous(breaks = seq(1900, 2020, 10)) +
        scale_y_continuous(
            "Log(No Publications)",
            trans = "log10"
            # sec.axis = sec_axis(~ . * 10, name = "Cumulative proportion") # divide by 100 to scale back the secondary axis
        ) +
        labs(
            title = "Publications classified into Sectors over time (primary sector only)",
            x = "Year"
            # y = "Number of publications"
        ) +
        theme_minimal() +
        theme(
            legend.position = "bottom",
            # axis.text.y.right = element_text(color = "red")
        )

    figure_all <- readRDS(file.path(file.path("tca_corpus", "data", "sectors_over_time.rds"))) |>
        dplyr::filter(
            publication_year >= 1950
        ) |>
        ggplot() +
        geom_line(
            aes(
                x = publication_year,
                y = cumsum_count_all,
                color = sector,
                lty = sector
            )
        ) +
        scale_x_continuous(breaks = seq(1900, 2020, 10)) +
        scale_y_continuous(
            "Log(No Publications)",
            trans = "log10"
            # sec.axis = sec_axis(~ . * 10, name = "Cumulative proportion") # divide by 100 to scale back the secondary axis
        ) +
        labs(
            title = "Publications classified into Sectors over time (up to three sectors)",
            x = "Year"
            # y = "Number of publications"
        ) +
        theme_minimal() +
        theme(
            legend.position = "none",
            # axis.text.y.right = element_text(color = "red")
        )

    figure <- patchwork::wrap_plots(
        figure_1,
        figure_all,
        nrow = 2
    )

    ggplot2::ggsave(
        file.path("tca_corpus", "figures", "sectors_over_time.pdf"),
        width = 12,
        height = 12,
        figure
    )
    ggplot2::ggsave(
        file.path("tca_corpus", "figures", "sectors_over_time.png"),
        width = 12,
        height = 12,
        figure
    )

    rm(figure_1, figure_all, figure)
}
```


```{r}
#| label: sectors_proportions over_time_plot
#|

if (length(list.files(file.path("tca_corpus", "figures"), pattern = "sectors_proportions_over_time")) < 2) {
    figure <- readRDS(file.path(file.path("tca_corpus", "data", "sectors_over_time.rds"))) |>
        dplyr::filter(
            publication_year >= 1950
        ) |>
        group_by(publication_year) |>
        mutate(count_all = count_all / sum(count_all)) |>
        ggplot() +
        geom_col(
            aes(
                x = publication_year,
                y = count_all,
                fill = sector
            ),
            position = "stack"
        ) +
        scale_x_continuous(breaks = seq(1900, 2020, 10)) +
        scale_y_continuous(
            "Proportion of Publications" # ,
            #    limits = c(0, 1.0001)
        ) +
        labs(
            title = "Publications classified into Sectors over time. Each publication has up to three sectors assigned.",
            x = "Year",
            y = "Proportion"
        ) +
        theme_minimal() +
        theme(
            legend.position = "right"
        )

    ggplot2::ggsave(
        file.path("tca_corpus", "figures", "sectors_proportions_over_time.pdf"),
        width = 12,
        height = 12,
        figure
    )
    ggplot2::ggsave(
        file.path("tca_corpus", "figures", "sectors_proportions_over_time.png"),
        width = 12,
        height = 12,
        figure
    )

    rm(figure)
}
```


# Results

## Assessment of Search Terms Using OpenAlex

### Number of Hits per Individual Corpus

Here we show the number of hits for the key papers in the different individual corpi. The columns represent the different search terms as defined in @sec-search_terms.

```{r}
#| label: hits_per_search_term_table

dat <- cbind(
    search_term_hits
)

rownames(dat) <- dplyr::recode(
    rownames(dat),
    "transformative change" = "Transformative Change @sec-transform",
    "nature environment" = "Nature @sec-nature",
    "tca corpus" = "Assessment Corpus @sec-tca-corpus",
    "Chapter 1 01" = "Ch1 01 @sec-ch1-01",
    "Chapter 1 02" = "Ch1 02 @sec-ch1-02",
    "Chapter 1 03" = "Ch1 03 @sec-ch1-03",
    "Chapter 1 04" = "Ch1 04 @sec-ch1-04",
    "Chapter 1 05" = "Ch1 05 @sec-ch1-05",
    "Chapter 1 06" = "Ch1 06 @sec-ch1-06",
    "Chapter 2" = "Ch2  @sec-ch2",
    "Chapter 3 01" = "Ch3 01 @sec-ch3-01",
    "Chapter 3 02" = "Ch3 02 @sec-ch3-02",
    "Chapter 3 03" = "Ch3 03 @sec-ch3-03",
    "Chapter 3 04" = "Ch3 04 @sec-ch3-04",
    "Chapter 3 05" = "Ch3 05 @sec-ch3-05",
    "Chapter 3 06" = "Ch3 06 @sec-ch3-06",
    "Chapter 4 01" = "Ch4 01 @sec-ch4-01",
    "Chapter 4 02" = "Ch4 02 @sec-ch4-02",
    "Chapter 5 vision" = "Ch5 Vision @sec-ch5_vision",
    "Chapter 5 vision case" = "Ch5 Vision Case @sec-ch5_vision_case",
    "case" = "Ch5 Case @sec-case"
)

dat |>
    knitr::kable(
        caption = "Number of hits",
    )

rm(dat)
```

### Key papers in different Individual Corpi {#key_papers_in_corpi}


```{r}
#| label: key_paper_hits_in_corpi
#|

tbl <- lapply(
    names(key_works_hits),
    function(n) {
        kwh <- key_works_hits[[n]]
        if (nrow(kwh) > 0) {
            total <- grepl("Total", rownames(kwh))
            rownames(kwh)[!total] <- paste0(n, " - <a href='https://doi.org/", rownames(kwh)[!total], "' target='_blank'>Click here</a>")
            rownames(kwh)[total] <- paste0("**", n, " - Total**")
            kwh |>
                arrange(Total) |>
                apply(
                    c(1, 2),
                    function(x) {
                        ifelse(x == 0, "<font color='red'>0</font>", paste0("<font color='green'>", x, "</font>"))
                    }
                ) |>
                as.data.frame()
        } else {
            return(NULL)
        }
    }
)
tbl <- tbl[sapply(tbl, class) != "NULL"]
tbl <- do.call(what = rbind, tbl)


detail <- rbind(
    "**overall**" = c(
        paste0(
            "**",
            search_term_hits |>
                select(count) |>
                unlist() |>
                as.vector(),
            "**"
        ),
        ""
    ),
    tbl
)

detail <- detail |>
    dplyr::rename(
        "Transformative Change @sec-transform" = s_1_transformative_change,
        "Nature @sec-nature" = s_1_nature_environment,
        "Assessment Corpus @sec-tca-corpus" = s_1_tca_corpus,
        "Ch1 01 @sec-ch1-01" = s_1_ch1_01,
        "Ch1 02 @sec-ch1-02" = s_1_ch1_02,
        "Ch1 03 @sec-ch1-03" = s_1_ch1_03,
        "Ch1 04 @sec-ch1-04" = s_1_ch1_04,
        "Ch1 05 @sec-ch1-05" = s_1_ch1_05,
        "Ch1 06 @sec-ch1-06" = s_1_ch1_06,
        "Ch2  @sec-ch2" = s_1_ch2,
        "Ch3 01 @sec-ch3-01" = s_1_ch3_01,
        "Ch3 02 @sec-ch3-02" = s_1_ch3_02,
        "Ch3 03 @sec-ch3-03" = s_1_ch3_03,
        "Ch3 04 @sec-ch3-04" = s_1_ch3_04,
        "Ch3 05 @sec-ch3-05" = s_1_ch3_05,
        "Ch3 06 @sec-ch3-06" = s_1_ch3_06,
        "Ch4 01 @sec-ch4-01" = s_1_ch4_01,
        "Ch4 02 @sec-ch4-02" = s_1_ch4_02,
        # "Ch5 Vision @sec-ch5_vision" = s_1_ch5_vision,
        "Ch5 Case @sec-case" = s_1_case,
        # "Ch5 Vision Case @sec-ch5_vision_case" = s_1_ch5_vision_case
    )
```

### Key Papers in Individual Corpi

#### Summary
Each column is a different search term, and each row consists of the key papers of a specific chapter and the author who provided the key papers. The number is the number of key papers occurring in the Individual Corpus.
```{r}
in_summary <- grepl("Total|overall", rownames(detail))
knitr::kable(
    detail[in_summary, ]
)
```


#### Detail
```{r}
knitr::kable(
    detail
)
```

## TCA Corpus properties

### Languages in TCA Corpus (from OpenAlex call)

These are the languages of the abstract and title. Articles with multiple languages will be counted not accurate. Also, as this is
the language of the abstract / title, it might not be the language of the full text.

```{r}
readRDS(file.path(".", "tca_corpus", "data", "tca_corpus_languages.rds"))$languages |>
    IPBES.R::table_dt(
        fn = "languages_tca_corpus"
    )
```


### Publications over time

The red line is the cumulative proportion of publications, the blue line the cumulative proportion of all of the Op[enAlex corpus. Both use the secondeary (red) axis.


![](`r file.path("tca_corpus", "figures", "publications_over_time.png")`)

To download high resolution, [click here](`r file.path("tca_corpus", "figures", "publications_over_time.pdf")`){target="_blank"}

```{r}
readRDS(file.path(".", "tca_corpus", "data", "publications_over_time_tca_corpus.rds")) |>
    IPBES.R::table_dt(fn = "publications_over_time")
```

### Countries in TCA Corpus

The countries are based on the countries of the institutes of all authors, weighted by `1/no_ authors_per_paper`. 

The following calculations were done:

- `**count** = `ifelse(is.na(count), 0, count)`
- `**log_count** = `log(count + 1)`
- `**p** = `count / sum(count)`
- `**count_oa** = `ifelse(is.na(count_oa), 0, count)`
- `**log_count_oa** = `log(count_oa + 1)`
- `**p_oa** = `count_oa / sum(count_oa)`
- `**p_diff** = `(p_oa - p) * 100`
- `**p_ratio** = `count / count_oa`


```{r}
readRDS(file.path(".", "tca_corpus", "data", "countries_tca_corpus.rds")) |>
    IPBES.R::table_dt(fn = "publications_per_country")
```

#### All Years

![](`r file.path("tca_corpus", "maps", "publications_countries.png")`)

To download high resolution, [click here](`r file.path("tca_corpus", "maps", "publications_countries.pdf")`){target="_blank"}


### Sectors over time

For clarity, the log of the cumulative sum of the sectors over time are shown here. 


![The top graph shows only the primary sector assigned, the bottom graph all sectors (first, secondary and tertiary)](tca_corpus/figures/sectors_over_time.png)

To download high resolution, [click here](`r file.path("tca_corpus", "maps", "sectors_over_time.pdf")`){target="_blank"}


![The graph shows the proportion of the different sectors over time](tca_corpus/figures/sectors_proportions_over_time.png)

To download high resolution, [click here](`r file.path("tca_corpus", "maps", "sectors_proportions_over_time.pdf")`){target="_blank"}


```{r}
readRDS(file.path(".", "tca_corpus", "data", "sectors_over_time.rds")) |>
    IPBES.R::table_dt(
        fn = "sectors_over_time",
        fixedColumns = list(leftColumns = 3)
    )
```

### Topics in corpus

```{r}
#| label: prim_topics_tca_corpus_plot
#|

cs <- cumsum(prim_topics_tca_corpus$count)
cs |>
    plot(
        type = "l",
        xlab = "Topic",
        ylab = "Cumulative Count",
        main = "Cumulative Topics in TCA Corpus"
    )

abline(
    h = 0.95 * cs[length(cs)],
    v = min(which(cs > 0.95 * cs[length(cs)])),
    col = "red"
)

text(
    x = 0.5 * length(cs),
    y = 0.95 * cs[length(cs)],
    pos = 3,
    labels = "95% of the corpus",
    col = "red"
)
```

```{r}
#| label: prim_topics_tca_corpus_table
#|

prim_topics_tca_corpus |>
    relocate(count, .after = "id") |>
    IPBES.R::table_dt(
        fn = "topics_tca_corpus",
    )
```

### SubFields in Corpus

```{r}
#| label: prim_subfield_tca_corpus_plot
#|
cs <- prim_topics_tca_corpus |>
    mutate(
        id = NULL,
        topic_name = NULL,
        keywords = NULL,
        summary = NULL,
        wikipedia_url = NULL
    ) |>
    group_by(
        subfield_id,
    ) |>
    summarise(
        count = sum(count)
    ) |>
    arrange(desc(count)) |>
    dplyr::select(count) |>
    unlist() |>
    cumsum()
cs |>
    plot(
        type = "l",
        xlab = "Subfield",
        ylab = "Cumulative Count",
        main = "Cumulative Subfields in TCA Corpus"
    )

abline(
    h = 0.95 * cs[length(cs)],
    v = min(which(cs > 0.95 * cs[length(cs)])),
    col = "red"
)

text(
    x = 0.5 * length(cs),
    y = 0.95 * cs[length(cs)],
    pos = 3,
    labels = "95% of the corpus",
    col = "red"
)
```

```{r}
#| label: prim_subfield_tca_corpus_table
#|

prim_topics_tca_corpus |>
    mutate(
        topic_id = NULL,
        topic_name = NULL,
        keywords = NULL,
        summary = NULL,
        wikipedia_url = NULL
    ) |>
    group_by(
        subfield_id,
        subfield_name,
        field_id,
        field_name,
        domain_id,
        domain_name
    ) |>
    summarise(
        count = sum(count),
        .groups = "drop"
    ) |>
    arrange(desc(count)) |>
    relocate(count, .after = "subfield_id") |>
    DT::datatable(
        extensions = c(
            "Buttons",
            "FixedColumns",
            "Scroller"
        ),
        options = list(
            dom = "Bfrtip",
            buttons = list(
                list(
                    extend = "csv",
                    filename = fn
                ),
                list(
                    extend = "excel",
                    filename = fn
                ),
                list(
                    extend = "pdf",
                    filename = fn,
                    orientation = "landscape",
                    customize = DT::JS(
                        "function(doc) {",
                        "  doc.defaultStyle.fontSize = 5;", # Change the font size
                        "}"
                    )
                ),
                "print"
            ),
            scroller = TRUE,
            scrollY = JS("window.innerHeight * 0.7 + 'px'"),
            scrollX = TRUE,
            fixedColumns = list(leftColumns = 4)
        ),
        escape = FALSE
    )
```


## Additional Searches subsets

```{r}
file.path(".", "tca_corpus", "data", "additional_search.rds") |>
    readRDS() |>
    knitr::kable()
```

Differences between the used [media search term](tca_corpus/input/search terms/media.txt) and the original media search term:
```{r}
diffviewer::visual_diff(
    "tca_corpus/input/search terms/media.org.txt",
    "tca_corpus/input/search terms/media.txt"
)
```


# ==== APPENDIX ====

## Strategies/Options/Instruments for transformative change 

The file [strategies_options.md](tca_corpus/input/search terms/strategies_options.md) contains the terms for the different strategies and options.

I will now iterate through all of them and identify the number of hits per individual search term. This can be used as a result in itself in interpreting the importance of each term as well as to shorten the search term to be able to use it together with the [TCA search term](tca_corpus/input/search terms/tca_corpus.txt).

### Methods

#### Prepare Search Terms
```{r}
#| label: strategies_options_prepare_searchterms

fn <- file.path("tca_corpus", "data", "strategies_options_terms.rds")
if (!file.exists(fn)) {
    sts <- data.frame(
        "Strategy" = NA,
        "Option" = NA,
        term = params$sts_strategies_options
    )
    sts <- sts[sts$term != "", ]
    for (i in 1:nrow(sts)) {
        if (grepl("^# ", sts$term[i])) {
            strategy <- gsub("^# ", "", sts$term[i])
        } else if (grepl("^## ", sts$term[i])) {
            option <- gsub("^## ", "", sts$term[i])
        } else {
            sts$Strategy[i] <- strategy
            sts$Option[i] <- option
        }
    }
    sts <- sts[!is.na(sts$Strategy), ]
    sts <- sts[!is.na(sts$Option), ]

    sts$Name <- paste0(sts$Strategy, " ||| ", sts$Option)

    strategies_options_terms <- split(sts$term, sts$Name)

    saveRDS(strategies_options_terms, file = fn)
}
```


#### Run the search terms
```{r}
#| label: strategies_options_get_counts
#|

fn <- file.path("tca_corpus", "data", "strategies_options.rds")
if (!file.exists(file.path(fn))) {
    strategies_options_terms <- readRDS(file.path("tca_corpus", "data", "strategies_options_terms.rds"))
    strategies_options <- lapply(
        names(strategies_options_terms),
        function(strategy) {
            message("- ", strategy)
            result <- list()
            result$term <- paste(strategies_options_terms[[strategy]], collapse = " ")
            result$count <- NA
            result$years <- data.frame(publication_year = NA, count = NA)
            result$assess_search_terms <- assess_search_term(
                st = strategies_options_terms[[strategy]],
                AND_term = params$s_1_tca_corpus,
                remove = " OR$",
                excl_others = FALSE,
                verbose = FALSE,
                mc.cores = 1
            ) |>
                dplyr::arrange(desc(count))
            #
            try(
                {
                    result$count <- openalexR::oa_fetch(
                        title_and_abstract.search = IPBES.R::compact(paste0("(", params$s_1_tca_corpus, ") AND (", result$term, ")")),
                        count_only = TRUE,
                        output = "list",
                        verbose = FALSE
                    )$count
                    message("  - ", result$count, " hits")
                    #
                    result$years <- openalexR::oa_fetch(
                        title_and_abstract.search = IPBES.R::compact(paste0("(", params$s_1_tca_corpus, ") AND (", result$term, ")")),
                        group_by = "publication_year",
                        output = "dataframe",
                        verbose = FALSE
                    ) |>
                        dplyr::select(
                            publication_year = key_display_name,
                            count
                        ) |>
                        dplyr::arrange(
                            publication_year
                        )
                },
                silent = FALSE
            )
            return(result)
        }
    )

    # Assign the names to the first and second level list
    names(strategies_options) <- names(strategies_options_terms)

    saveRDS(strategies_options, file = fn)
}
```


#### Download Strategies Corpus (only ids)

```{r}
#| label: download_strategies_corpus
#| eval: false
#|

tic()

strategies_options_terms <- readRDS(file.path("tca_corpus", "data", "strategies_options_terms.rds"))
lapply(
    names(strategies_options_terms),
    function(strategy) {
        message("- ", strategy)
        IPBES.R::corpus_download(
            pages_dir = paste0(params$pages_strategies_dir, "_", make.names(strategy)),
            title_and_abstract_search = IPBES.R::compact(paste0("(", params$s_1_tca_corpus, ") AND (", paste0(strategies_options_terms[[strategy]], collapse = " "), ")")),
            select_fields = c("id"),
            continue = TRUE,
            delete_pages_dir = FALSE,
            dry_run = FALSE,
            verbose = TRUE,
            mc_cores = 8
        )
    }
)

toc()
```


#### Convert Strategies Pages to rds (only ids)

```{r}
#| label: convert_strategies_corpus
#| eval: false
#| 

tic()

fn <- file.path("tca_corpus", "data", "strategies_options_ids.rds")

if (!file.exists(fn)) {
    strategies__options_terms <- readRDS(file.path("tca_corpus", "data", "strategies_options_terms.rds"))
    strategies_options_ids <- lapply(
        names(strategies__options_terms),
        function(strategy) {
            message("- ", strategy)
            paste0(params$pages_strategies_dir, "_", make.names(strategy)) |>
                list.files(
                    pattern = ".rds$",
                    recursive = TRUE,
                    full.names = TRUE
                ) |>
                sapply(
                    FUN = function(f) {
                        readRDS(f)
                    }
                ) |>
                unlist() |>
                as.vector() |>
                unique()
        }
    )
    names(strategies_options_ids) <- names(strategies__options_terms)

    saveRDS(strategies_options_ids, file = fn)
}

toc()
```


#### [x] Convert Strategies Pages in Cases to rds (only ids)

```{r}
#| label: convert_strategies_corpus_cases
#|

fn <- file.path("tca_corpus", "data", "strategies_options_ids_cases.rds")

if (!file.exists(fn)) {
    case_ids <- corpus_read(params$corpus_cases_dir) |>
        dplyr::distinct(id) |>
        collect() |>
        unlist()
    strategies_options_ids <- readRDS(file.path("tca_corpus", "data", "strategies_options_ids.rds"))

    strategies_options_ids_cases <- lapply(
        strategies_options_ids,
        function(strategy) {
            intersect(strategy, case_ids)
        }
    )

    saveRDS(strategies_options_ids_cases, file = fn)
}
```


#### Extract per year from Strategies Pages

```{r}
#| label: extract_strategies_per_year_corpus
#|

tic()

fn <- file.path("tca_corpus", "data", "strategies_options_corpus.rds")

if (!file.exists(fn)) {
    strategies_options_terms <- readRDS(file.path("tca_corpus", "data", "strategies_options_terms.rds"))
    strategies_options_ids <- readRDS(file.path("tca_corpus", "data", "strategies_options_ids.rds"))

    strategies_options_corpus <- lapply(
        names(strategies_options_terms),
        function(strategy) {
            message("- ", strategy)
            years <- paste0(params$pages_strategies_dir, "_", make.names(strategy)) |>
                list.files(
                    pattern = "^set_publication_year=",
                    recursive = FALSE,
                    full.names = FALSE
                ) |>
                gsub(
                    pattern = "set_publication_year=",
                    replacement = ""
                ) |>
                as.integer()
            #
            result <- list()
            result$term <- paste(strategies_options_terms[[strategy]], collapse = " ")
            result$count <- length(strategies_options_ids[[strategy]])


            result$years <- data.frame(
                publication_year = years,
                count = sapply(
                    years,
                    function(year) {
                        paste0(params$pages_strategies_dir, "_", make.names(strategy)) |>
                            list.files(
                                pattern = ".rds$",
                                recursive = TRUE,
                                full.names = TRUE
                            ) |>
                            grep(
                                pattern = paste0("set_publication_year=", year),
                                value = TRUE
                            ) |>
                            sapply(
                                FUN = function(f) {
                                    readRDS(f)
                                }
                            ) |>
                            unlist() |>
                            as.vector() |>
                            unique() |>
                            length()
                    }
                )
            )
            return(result)
        }
    )

    names(strategies_options_corpus) <- names(strategies_options_terms)

    saveRDS(strategies_options_corpus, file = fn)
}

toc()
```


#### Create Strategies Matrix

```{r}
#| label: strategies_options_overlap
#|

fn <- file.path("tca_corpus", "data", "strategies_options_overlap.rds")

if (!file.exists(fn)) {
    strategies_options_ids <- readRDS(file.path("tca_corpus", "data", "strategies_options_ids.rds"))

    overlap_count <- function(x, y) {
        length(intersect(x, y))
    }

    strategies_options_overlap <- outer(
        strategies_options_ids,
        strategies_options_ids,
        Vectorize(overlap_count)
    )

    saveRDS(strategies_options_overlap, file = fn)
    write.csv(strategies_options_overlap, file = gsub("rds$", "csv", fn))
}
```

#### [x] Create Strategies Matrix Cases Only

```{r}
#| label: strategies_options_overlap_cases
#|

fn <- file.path("tca_corpus", "data", "strategies_options_overlap_cases.rds")

if (!file.exists(fn)) {
    strategies_options_overlap_cases <- file.path("tca_corpus", "data", "strategies_options_ids_cases.rds") |>
        readRDS()

    strategies_options_overlap_cases <- sapply(
        names(strategies_options_overlap_cases),
        FUN = function(s1) {
            sapply(
                names(strategies_options_overlap_cases),
                function(s2) {
                    length(intersect(strategies_options_overlap_cases[[s1]], strategies_options_overlap_cases[[s2]]))
                }
            )
        }
    )

    saveRDS(strategies_options_overlap_cases, file = fn)
    write.csv(strategies_options_overlap_cases, file = gsub("rds$", "csv", fn))
}
```


### Results

#### Count of Strategies / Options Table
```{r}
#| label: strategies_options_corpus_counts_data
#|

# data from OpernAlex vie count
# strategies_options_corpus <- readRDS(file.path("tca_corpus", "data", "strategies_options.rds"))

# data from corpora
strategies_options <- readRDS(file.path("tca_corpus", "data", "strategies_options_corpus.rds"))

data <- lapply(
    names(strategies_options),
    function(strategy) {
        data.frame(
            Strategy = strsplit(strategy, " \\|\\|\\| ")[[1]][1],
            Concept = strsplit(strategy, " \\|\\|\\| ")[[1]][2],
            Count = strategies_options[[strategy]]$count,
            Count_until_1992 = sum(strategies_options[[strategy]]$years$count[strategies_options[[strategy]]$years$publication_year <= 1992]),
            Count_after_1992 = sum(strategies_options[[strategy]]$years$count[strategies_options[[strategy]]$years$publication_year > 1992])
        )
    }
) |>
    do.call(what = rbind)

data |>
    IPBES.R::table_dt(
        fn = "strategies_options_corpus_counts",
        fixedColumns = list(leftColumns = 2)
    )
```


#### Plot of the Count of the Strategies / Options split at 1992

This data is corrected for different research oputput before and after 1992 by dividing by the overall reasarch output in that perio as reflected on [OpenAlex](https://openalex.org).

```{r}
#| label: strategies_options_counts_figure
#|

figname <- file.path("tca_corpus", "figures", "strategies_options_time_split")

figs <- list.files(
    path = dirname(figname),
    pattern = basename(figname),
    recursive = TRUE
) |>
    length()

if (figs < 4) {
    # data from OpernAlex vie count
    # strategies_options <- readRDS(file.path("tca_corpus", "data", "strategies_options.rds"))

    # data from corpora
    strategies_options <- readRDS(file.path("tca_corpus", "data", "strategies_options_corpus.rds"))

    oa <- openalexR::oa_fetch(
        search = "",
        group_by = "publication_year",
        output = "dataframe",
        verbose = FALSE
    )
    oa_until_1992 <- sum(oa$count[oa$key <= 1992])
    oa_after_1992 <- sum(oa$count[oa$key > 1992])

    data <- lapply(
        names(strategies_options),
        function(strategy) {
            data.frame(
                Strategy = strsplit(strategy, " \\|\\|\\| ")[[1]][1],
                Concept = strsplit(strategy, " \\|\\|\\| ")[[1]][2],
                Count = strategies_options[[strategy]]$count,
                Count_until_1992 = sum(strategies_options[[strategy]]$years$count[strategies_options[[strategy]]$years$publication_year <= 1992]),
                Count_after_1992 = sum(strategies_options[[strategy]]$years$count[strategies_options[[strategy]]$years$publication_year > 1992])
            )
        }
    ) |>
        do.call(what = rbind) |>
        dplyr::mutate(
            Strategy = paste0(Strategy, " |||| ", Concept),
            Count_until_1992 = Count_until_1992 / oa_until_1992,
            Count_after_1992 = Count_after_1992 / oa_after_1992,
        ) |>
        dplyr::group_by(Strategy) |>
        dplyr::mutate(
            Count_until_1992_p = Count_until_1992 / sum(Count_until_1992 + Count_after_1992),
            Count_after_1992_p = Count_after_1992 / sum(Count_until_1992 + Count_after_1992)
        )


    figure <- data |>
        tidyr::pivot_longer(
            cols = c(Count_until_1992, Count_after_1992),
            names_to = "Period",
            values_to = "Count_year"
        ) |>
        # Create the plot
        ggplot(aes(x = Strategy, y = Count_year, fill = Period)) +
        geom_bar(stat = "identity") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
        labs(x = "Strategy", y = "Count", fill = "Period")


    ggplot2::ggsave(
        paste0(figname, ".pdf"),
        width = 12,
        height = 18,
        figure
    )
    ggplot2::ggsave(
        paste0(figname, ".png"),
        width = 12,
        height = 18,
        figure
    )

    figure <- data |>
        tidyr::pivot_longer(
            cols = c(Count_until_1992_p, Count_after_1992_p),
            names_to = "Period",
            values_to = "Count_p_year"
        ) |>
        # Create the plot
        ggplot(aes(x = Strategy, y = Count_p_year, fill = Period)) +
        geom_bar(stat = "identity") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
        labs(x = "Strategy", y = "Count", fill = "Period")


    ggplot2::ggsave(
        paste0(figname, "_p.pdf"),
        width = 12,
        height = 18,
        figure
    )
    ggplot2::ggsave(
        paste0(figname, "_p.png"),
        width = 12,
        height = 18,
        figure
    )
}
```

![The graph shows the by overall research output corrected publications before and after 1992 for each `Strategy ||| Option`](`r file.path("tca_corpus", "figures", "strategies_options_time_split.png")`)

To download high resolution, [click here](`r file.path("tca_corpus", "figures", "strategies_options_time_split.pdf")`){target="_blank"}

![The second graph shows the proportion of the by overall research output corrected publications before and after 1992 for each `Strategy ||| Option`](`r file.path("tca_corpus", "figures", "strategies_options_time_split_p.png")`)

To download high resolution, [click here](`r file.path("tca_corpus", "figures", "strategies_options_time_split_p.pdf")`){target="_blank"}

#### Assesment of individual terms

These numbers are the number of hits of TCA Corpus AND each individual term of the Option

This is based on the data from OpenAlex using API calls to get the count.

```{r}
#| label: strategies_options_asess
#| output: asis

strategies_options <- readRDS(file.path("tca_corpus", "data", "strategies_options.rds"))

lapply(
    names(strategies_options),
    function(strategy) {
        res <- data.frame(
            Strategy = strsplit(strategy, " \\|\\|\\| ")[[1]][1],
            Concept = strsplit(strategy, " \\|\\|\\| ")[[1]][2],
            Term = strategies_options[[strategy]]$assess_search_terms$term,
            Count = strategies_options[[strategy]]$assess_search_terms$count
        )
    }
) |>
    do.call(what = rbind) |>
    IPBES.R::table_dt(
        fn = "strategies_options_assessment_terms",
        fixedColumns = list(leftColumns = 2)
    )
```


#### Differences between Original and shortened search terms

Due to length constraints, search terms were removed by removing the ones with the mallest number of hits.

The differences the [original search term](./tca_corpus/input/search%20terms/strategies_options.org.md) and the [used search term](./tca_corpus/input/search%20terms/strategies_options.md) is shown below

```{r}
#| label: strategies_options_diff
#|

diffviewer::visual_diff(
    file.path("tca_corpus", "input", "search terms", "strategies_options.org.md"),
    file.path("tca_corpus", "input", "search terms", "strategies_options.md")
)
```

#### Counts over Years for each TCA Corpus AND Strategies / Options

```{r}
#| label: strategies_options_years
#|
# data from OpernAlex vie count
# strategies_options <- readRDS(file.path("tca_corpus", "data", "strategies_options.rds"))

# data from corpora
strategies_options <- readRDS(file.path("tca_corpus", "data", "strategies_options_corpus.rds"))

lapply(
    names(strategies_options),
    function(strategy) {
        data.frame(
            Strategy = strsplit(strategy, " \\|\\|\\| ")[[1]][1],
            Concept = strsplit(strategy, " \\|\\|\\| ")[[1]][2],
            Year = strategies_options[[strategy]]$years$publication_year,
            Count = strategies_options[[strategy]]$years$count
        )
    }
) |>
    do.call(what = rbind) |>
    IPBES.R::table_dt(
        fn = "strategies_options_counts_per_year",
        fixedColumns = list(leftColumns = 2)
    )
```

<!-- #### [ ] Strategies / Options Works in Case Corpus -->

```{r}
#| label: actors_strategies_options_table
#| eval: false

readRDS(file.path("tca_corpus", "data", "strategies_options_cases.rds"))$strategies_options_case |>
    dplyr::mutate(
        p_cases = round(p_cases, 4)
    ) |>
    IPBES.R::table_dt(
        fn = "strategies_options__cases",
        fixedColumns = list(leftColumns = 0)
    )
```


#### Matrix showing overlap of Strategies / Options

The size of the nodes indicates the number of works in each node, while the thicknes of the edges (lines) indicates the proportion of overlapping works of the maximum possible, i.e. $p_{i,j} = \frac{n_{\text{links}(i,j)}}{\min(n_i, n_j)}$

The nodes can be moved and if selected, the links to / from that node are highlighted.

```{r}
#| label: strategies_option_overlap_figure
#|

fn <- file.path("tca_corpus", "figures", "strategies_options_interactive_matrix")

if (!file.exists(paste0(fn, ".html"))) {
    data <- readRDS(file.path("tca_corpus", "data", "strategies_options_overlap.rds"))

    value <- diag(data)

    for (i in 1:ncol(data)) {
        for (j in 1:nrow(data)){
            data[i, j] <- data[i, j] / min(value[i], value[j])
        }
    }

    # Create a graph from the adjacency matrix
    g <- igraph::graph_from_adjacency_matrix(
        data,
        mode = "max",
        weighted = TRUE,
        diag = FALSE
    )

    # Convert the graph to a data frame
    nodes <- data.frame(
        id = igraph::V(g)$name,
        label = igraph::V(g)$name,
        value = value
    ) # |>
    # arrange(desc(value))

    edges <- igraph::as_data_frame(
        g,
        what = "edges"
    )

    names(edges) <- c("from", "to", "width")
    edges$width <- (edges$width / max(edges$width)) * 20

    nodes$strategy <- sapply(strsplit(nodes$id, " \\|\\|\\| "), `[`, 1)
    nodes$option <- sapply(strsplit(nodes$id, " \\|\\|\\| "), `[`, 2)
    nodes$option <- sapply(
        strsplit(nodes$option, "\\. |: "),
        function(x) {
            return(x[length(x)])
        }
    )

    colors <- rainbow(length(nodes$strategy)) # Generate a color for each strategy
    names(colors) <- nodes$strategy
    nodes$color <- colors[nodes$strategy]


    nodes$label <- sapply(
        strsplit(nodes$option, "\\. |: "),
        function(x) {
            return(x[length(x)])
        }
    )

    # Calculate grid positions for the nodes

    node_x <- 1:length(unique(nodes$strategy))
    names(node_x) <- unique(nodes$strategy)
    nodes$x <- node_x[nodes$strategy] * 300

    node_y <- 1:length(unique(nodes$option))
    offset <- 0
    for (strategy in unique(nodes$strategy)) {
        id_s <- nodes$strategy == strategy
        names(node_y) <- unique(nodes$option[id_s])
        nodes$y[id_s] <- (node_y[nodes$option[id_s]] + offset) * 200
        if (offset == 0.4) {
            offset <- 0
        } else {
            offset <- 0.4
        }
    }


    # Center the nodes
    nodes$x <- (nodes$x - max(nodes$x) / 2) / 3
    nodes$y <- (nodes$y - max(nodes$y) / 2) / 3


    # Create the interactive network graph

    p <- visNetwork(
        nodes,
        edges,
        height = "900px",
        width = "1800px"
    ) |>
        visNodes(
            x = nodes$x,
            y = nodes$y,
            fixed = FALSE,
            physics = FALSE,
            font = list(
                size = 10
            )
        ) |>
        visEdges(
            physics = TRUE,
            color = list(
                color = "lightgray",
                highlight = "black"
            )
        ) |>
        visPhysics(
            solver = "barnesHut",
            timestep = 0.1,
            stabilization = list(
                iterations = 1000
            )
        )


    htmlwidgets::saveWidget(
        p,
        file = paste0(fn, ".html")
    )
    webshot::webshot(paste0(fn, ".html"), file = paste0(fn, ".pdf"), delay = 3)

    unlink(paste0(fn, "_files"), recursive = TRUE)
}
```

<iframe src="tca_corpus/figures/strategies_options_interactive_matrix.html" width="100%" height="800px"></iframe>


To open the interactive graph or to download it, [click here](tca_corpus/figures/strategies_options_interactive_matrix.html){target="_blank"}

To download high resolution pdf, [click here](`r file.path("tca_corpus", "figures", "strategies_options_interactive_matrix.pdf")`){target="_blank"}

```{r}
#| label: strategies_option_overlap_table
#|

readRDS(file.path("tca_corpus", "data", "strategies_options_overlap.rds")) |>
    IPBES.R::table_dt(
        fn = "strategies_options_overlap",
        fixedColumns = list(leftColumns = 0)
    )
```

#### [x] Matrix showing overlap of Strategies / Options Cases Only

The size of the nodes indicates the number of works in each node, while the thicknes of the edges (lines) indicates the proportion of overlapping works of the maximum possible, i.e. $p_{i,j} = \frac{n_{\text{links}(i,j)}}{\min(n_i, n_j)}$

The nodes can be moved and if selected, the links to / from that node are highlighted.

```{r}
#| label: strategies_option_overlap_cases_figure
#|

fn <- file.path("tca_corpus", "figures", "strategies_options_interactive_matrix_cases")

if (!file.exists(paste0(fn, ".html"))) {
    data <- readRDS(file.path("tca_corpus", "data", "strategies_options_overlap_cases.rds"))

    value <- diag(data)

    for (i in 1:ncol(data)) {
        for (j in 1:nrow(data)){
            data[i, j] <- data[i, j] / min(value[i], value[j])
        }
    }

    # Create a graph from the adjacency matrix
    g <- igraph::graph_from_adjacency_matrix(
        data,
        mode = "max",
        weighted = TRUE,
        diag = FALSE
    )

    # Convert the graph to a data frame
    nodes <- data.frame(
        id = igraph::V(g)$name,
        label = igraph::V(g)$name,
        value = value
    ) # |>
    # arrange(desc(value))

    edges <- igraph::as_data_frame(
        g,
        what = "edges"
    )

    names(edges) <- c("from", "to", "width")
    edges$width <- (edges$width / max(edges$width)) * 20

    nodes$strategy <- sapply(strsplit(nodes$id, " \\|\\|\\| "), `[`, 1)
    nodes$option <- sapply(strsplit(nodes$id, " \\|\\|\\| "), `[`, 2)
    nodes$option <- sapply(
        strsplit(nodes$option, "\\. |: "),
        function(x) {
            return(x[length(x)])
        }
    )

    colors <- rainbow(length(nodes$strategy)) # Generate a color for each strategy
    names(colors) <- nodes$strategy
    nodes$color <- colors[nodes$strategy]


    nodes$label <- sapply(
        strsplit(nodes$option, "\\. |: "),
        function(x) {
            return(x[length(x)])
        }
    )

    # Calculate grid positions for the nodes

    node_x <- 1:length(unique(nodes$strategy))
    names(node_x) <- unique(nodes$strategy)
    nodes$x <- node_x[nodes$strategy] * 300

    node_y <- 1:length(unique(nodes$option))
    offset <- 0
    for (strategy in unique(nodes$strategy)) {
        id_s <- nodes$strategy == strategy
        names(node_y) <- unique(nodes$option[id_s])
        nodes$y[id_s] <- (node_y[nodes$option[id_s]] + offset) * 200
        if (offset == 0.4) {
            offset <- 0
        } else {
            offset <- 0.4
        }
    }


    # Center the nodes
    nodes$x <- (nodes$x - max(nodes$x) / 2) / 3
    nodes$y <- (nodes$y - max(nodes$y) / 2) / 3


    # Create the interactive network graph

    p <- visNetwork(
        nodes,
        edges,
        height = "900px",
        width = "1800px"
    ) |>
        visNodes(
            x = nodes$x,
            y = nodes$y,
            fixed = FALSE,
            physics = FALSE,
            font = list(
                size = 10
            )
        ) |>
        visEdges(
            physics = TRUE,
            color = list(
                color = "lightgray",
                highlight = "black"
            )
        ) |>
        visPhysics(
            solver = "barnesHut",
            timestep = 0.1,
            stabilization = list(
                iterations = 1000
            )
        )


    htmlwidgets::saveWidget(
        p,
        file = paste0(fn, ".html")
    )
    webshot::webshot(paste0(fn, ".html"), file = paste0(fn, ".pdf"), delay = 3)

    unlink(paste0(fn, "_files"), recursive = TRUE)
}
```

<iframe src="tca_corpus/figures/strategies_options_interactive_matrix_cases.html" width="100%" height="800px"></iframe>


To open the interactive graph or to download it, [click here](tca_corpus/figures/strategies_options_interactive_matrix_cases.html){target="_blank"}

To download high resolution pdf, [click here](`r file.path("tca_corpus", "figures", "strategies_options_interactive_matrix_cases.pdf")`){target="_blank"}

```{r}
#| label: strategies_option_overlap_cases_cases_table
#|

readRDS(file.path("tca_corpus", "data", "strategies_options_overlap_cases.rds")) |>
    IPBES.R::table_dt(
        fn = "strategies_options_overlap_cases",
        fixedColumns = list(leftColumns = 1)
    )
```

## Actors for transformative change 

The file [actors.md](tca_corpus/input/search terms/actors.md) contains the terms for the different actors.

I will now iterate through all of them and identify the number of hits per individual search term. This can be used as a result in itself in interpreting the importance of each term as well as to shorten the search term to be able to use it together with the [TCA search term](tca_corpus/input/search terms/tca_corpus.txt).

### Methods

#### Prepare Search Terms
```{r}
#| label: actors_prepare_searchterms

fn <- file.path("tca_corpus", "data", "actors_terms.rds")
if (!file.exists(fn)) {

     sts <- data.frame(
        "Sector" = NA,
        "Actor_Group" = NA,
        term = params$sts_actors
    )
    sts <- sts[sts$term != "", ]
    for (i in 1:nrow(sts)) {
        if (grepl("^# ", sts$term[i])) {
            sector <- gsub("^# ", "", sts$term[i])
        } else if (grepl("^## ", sts$term[i])) {
            actor_group <- gsub("^## ", "", sts$term[i])
        } else {
            sts$Sector[i] <- sector
            sts$Actor_Group[i] <- actor_group
        }
    }
    sts <- sts[!is.na(sts$Sector), ]
    sts <- sts[!is.na(sts$Actor_Group), ]

    sts$Name <- paste0(sts$Sector, " ||| ", sts$Actor_Group)

    actors_terms <- split(sts$term, sts$Name)
    #############

    saveRDS(actors_terms, file = fn)
}
```

#### Run the search terms
```{r}
#| label: actors_get_counts

fn <- file.path("tca_corpus", "data", "actors.rds")
if (!file.exists(file.path(fn))) {
    actors_terms <- readRDS(file.path("tca_corpus", "data", "actors_terms.rds"))
    actors_options <- lapply(
        names(actors_terms),
        function(actor) {
            message("- ", actor)
            result <- list()
            result$term <- paste(actors_terms[[actor]], collapse = " ")
            result$count <- NA
            result$years <- data.frame(publication_year = NA, count = NA)
            result$assess_search_terms <- assess_search_term(
                st = actors_terms[[actor]],
                AND_term = params$s_1_tca_corpus,
                remove = " OR$",
                excl_others = FALSE,
                verbose = FALSE,
                mc.cores = params$mc.cores
            ) |>
                dplyr::arrange(desc(count))
            #
            try(
                {
                    result$count <- openalexR::oa_fetch(
                        title_and_abstract.search = IPBES.R::compact(paste0("(", params$s_1_tca_corpus, ") AND (", result$term, ")")),
                        count_only = TRUE,
                        output = "list",
                        verbose = FALSE
                    )$count
                    message("  - ", result$count, " hits")
                    #
                    result$years <- openalexR::oa_fetch(
                        title_and_abstract.search = IPBES.R::compact(paste0("(", params$s_1_tca_corpus, ") AND (", result$term, ")")),
                        group_by = "publication_year",
                        output = "dataframe",
                        verbose = FALSE
                    ) |>
                        dplyr::select(
                            publication_year = key_display_name,
                            count
                        ) |>
                        dplyr::arrange(
                            publication_year
                        )
                },
                silent = FALSE
            )
            return(result)
        }
    )

    # Assign the names to the first and second level list
    names(actors_options) <- names(actors_terms)

    saveRDS(actors_options, file = fn)
}
```

#### Download Actors Corpus (only ids)

```{r}
#| label: download_actors_corpus
#| eval: false
#|

tic()

actors_terms <- readRDS(file.path("tca_corpus", "data", "actors_terms.rds"))
lapply(
    names(actors_terms),
    function(actor) {
        message("- ", actor)
        IPBES.R::corpus_download(
            pages_dir = paste0(params$pages_actors_dir, "_", make.names(actor)),
            title_and_abstract_search = IPBES.R::compact(paste0("(", params$s_1_tca_corpus, ") AND (", paste0(actors_terms[[actor]], collapse = " "), ")")),
            select_fields = c("id"),
            continue = TRUE,
            delete_pages_dir = FALSE,
            dry_run = FALSE,
            verbose = FALSE,
            mc_cores = 8
        )
    }
)

toc()
```

#### Convert Actors Pages to rds (only ids)

```{r}
#| label: convert_actors_corpus
#|

tic()

fn <- file.path("tca_corpus", "data", "actors_ids.rds")

if (!file.exists(fn)) {
    actors_terms <- readRDS(file.path("tca_corpus", "data", "actors_terms.rds"))
    actors_ids <- lapply(
        names(actors_terms),
        function(actor) {
            message("- ", actor)
            paste0(params$pages_actors_dir, "_", make.names(actor)) |>
                list.files(
                    pattern = ".rds$",
                    recursive = TRUE,
                    full.names = TRUE
                ) |>
                sapply(
                    FUN = function(f) {
                        readRDS(f)
                    }
                ) |>
                unlist() |>
                as.vector() |>
                unique()
        }
    )
    names(actors_ids) <- names(actors_terms)

    saveRDS(actors_ids, file = fn)
}

toc()
```


#### [x] Convert Actors Pages in Cases to rds (only ids)

```{r}
#| label: convert_avtors_corpus_cases
#|

fn <- file.path("tca_corpus", "data", "actors_ids_cases.rds")

if (!file.exists(fn)) {
    case_ids <- corpus_read(params$corpus_cases_dir) |>
        dplyr::distinct(id) |>
        collect() |>
        unlist()
    actors_ids <- readRDS(file.path("tca_corpus", "data", "actors_ids.rds"))

    actors_ids_cases <- lapply(
        actors_ids,
        function(actor) {
            intersect(actor, case_ids)
        }
    )

    saveRDS(actors_ids_cases, file = fn)
}
```


#### Extract per year from Actors Pages

```{r}
#| label: extract_actors_per_year_corpus
#|

tic()

fn <- file.path("tca_corpus", "data", "actors_corpus.rds")

if (!file.exists(fn)) {
    actors_terms <- readRDS(file.path("tca_corpus", "data", "actors_terms.rds"))
    actors_ids <- readRDS(file.path("tca_corpus", "data", "actors_ids.rds"))

    actors_corpus <- lapply(
        names(actors_terms),
        function(actor) {
            message("- ", actor)
            years <- paste0(params$pages_actors_dir, "_", make.names(actor)) |>
                list.files(
                    pattern = "^set_publication_year=",
                    recursive = FALSE,
                    full.names = FALSE
                ) |>
                gsub(
                    pattern = "set_publication_year=",
                    replacement = ""
                ) |>
                as.integer()
            #
            result <- list()
            result$term <- paste(actors_terms[[actor]], collapse = " ")
            result$count <- length(actors_ids[[actor]])


            result$years <- data.frame(
                publication_year = years,
                count = sapply(
                    years,
                    function(year) {
                        paste0(params$pages_actors_dir, "_", make.names(actor)) |>
                            list.files(
                                pattern = ".rds$",
                                recursive = TRUE,
                                full.names = TRUE
                            ) |>
                            grep(
                                pattern = paste0("set_publication_year=", year),
                                value = TRUE
                            ) |>
                            sapply(
                                FUN = function(f) {
                                    readRDS(f)
                                }
                            ) |>
                            unlist() |>
                            as.vector() |>
                            unique() |>
                            length()
                    }
                )
            )
            return(result)
        }
    )

    names(actors_corpus) <- names(actors_terms)

    saveRDS(actors_corpus, file = fn)
}

toc()
```

#### Create Actors Matrix

```{r}
#| label: actors_overlap
#|

fn <- file.path("tca_corpus", "data", "actors_overlap.rds")

if (!file.exists(fn)) {
    actors_ids <- readRDS(file.path("tca_corpus", "data", "actors_ids.rds"))

    overlap_count <- function(x, y) {
        length(intersect(x, y))
    }

    actors_overlap <- outer(
        actors_ids,
        actors_ids,
        Vectorize(overlap_count)
    )

    saveRDS(actors_overlap, file = fn)
    write.csv(actors_overlap, file = gsub("rds$", "csv", fn))
}
```


<!-- #### [ ] Actors in Case Corpus -->

```{r}
#| label: cases_actors_corpus
#| eval: false

fn <- file.path("tca_corpus", "data", "actors_case.rds")

if (!file.exists(fn)) {
    case_ids <- corpus_read(params$corpus_cases_dir) |>
        dplyr::distinct(id) |>
        collect() |>
        unlist()
    actors_ids <- readRDS(file.path("tca_corpus", "data", "actors_ids.rds"))

    actors_case <- lapply(
        names(actors_ids),
        function(actor) {
            message("- ", actor)
            count <- sum(actors_ids[[actor]] %in% case_ids)
            data.frame(
                actor = actor,
                count_cases = count,
                count_tca = length(actors_ids[[actor]]),
                p_cases = count / length(actors_ids[[actor]])
            )
        }
    ) |>
        do.call(what = rbind)

    actors_cases <- list(
        timestamp = Sys.time(),
        actors_case = actors_case
    )

    saveRDS(actors_cases, file = fn)
}
```
#### [x] Create Actors Matrix Cases Only

```{r}
#| label: actors_overlap_cases
#|

fn <- file.path("tca_corpus", "data", "actors_overlap_cases.rds")

if (!file.exists(fn)) {
    actors_ids <- readRDS(file.path("tca_corpus", "data", "actors_ids_cases.rds"))

    overlap_count <- function(x, y) {
        length(intersect(x, y))
    }

    actors_overlap <- outer(
        actors_ids,
        actors_ids,
        Vectorize(overlap_count)
    )

    saveRDS(actors_overlap, file = fn)
    write.csv(actors_overlap, file = gsub("rds$", "csv", fn))
    
    rm(actors_overlap)
}
```


### Results

#### Count of Actors Table
```{r}
#| label: actors_counts_data
#|

# data from OpernAlex vie count
# actors <- readRDS(file.path("tca_corpus", "data", "actors.rds"))

# data from corpora
actors <- readRDS(file.path("tca_corpus", "data", "actors_corpus.rds"))

data <- lapply(
    names(actors),
    function(actor) {
        data.frame(
            Actor = actor,
            Count = actors[[actor]]$count,
            Count_until_1992 = sum(actors[[actor]]$years$count[actors[[actor]]$years$publication_year <= 1992]),
            Count_after_1992 = sum(actors[[actor]]$years$count[actors[[actor]]$years$publication_year > 1992])
        )
    }
) |>
    do.call(what = rbind)

data |>
    IPBES.R::table_dt(
        fn = "strategies_options_counts",
        fixedColumns = list(leftColumns = 2)
    )
```


#### Assesment of individual terms

These numbers are the number of hits of TCA Corpus AND each individual term of the Option

```{r}
#| label: actors_asess
#| output: asis

actors <- readRDS(file.path("tca_corpus", "data", "actors.rds"))

lapply(
    names(actors),
    function(actor) {
        res <- data.frame(
            Actor = actor,
            Term = actors[[actor]]$assess_search_terms$term,
            Count = actors[[actor]]$assess_search_terms$count
        )
    }
) |>
    do.call(what = rbind) |>
    IPBES.R::table_dt(
        fn = "actors_assessment_terms",
        fixedColumns = list(leftColumns = 2)
    )
```


#### Differences between Original and shortened search terms

Due to length constraints, search terms were removed by removing the ones with the mallest number of hits.

The differences the [original search term](./tca_corpus/input/search%20terms/actors.org.md) and the [used search term](./tca_corpus/input/search%20terms/actors.md) is shown below

```{r}
#| label: diff_actors
#|

diffviewer::visual_diff(
    file.path("tca_corpus", "input", "search terms", "actors.org.md"),
    file.path("tca_corpus", "input", "search terms", "actors.md")
)
```


#### Counts over Years for each TCA Corpus AND Actors

```{r}
#| label: actors_years
#|

# data from OpernAlex vie count
# actors <- readRDS(file.path("tca_corpus", "data", "actors.rds"))

# data from corpora
actors <- readRDS(file.path("tca_corpus", "data", "actors_corpus.rds"))

lapply(
    names(actors),
    function(actor) {
        data.frame(
            Actor = actor,
            Year = actors[[actor]]$years$publication_year,
            Count = actors[[actor]]$years$count
        )
    }
) |>
    do.call(what = rbind) |>
    IPBES.R::table_dt(
        fn = "actors_counts_per_year",
        fixedColumns = list(leftColumns = 2)
    )
```


#### Plot of the Count of the Actors split at 1992

This data is corrected for different research oputput before and after 1992 by dividing by the overall reasarch output in that perio as reflected on [OpenAlex](https://openalex.org).

```{r}
#| label: actors_counts_figure
#|

figname <- file.path("tca_corpus", "figures", "actors_time_split")

figs <- list.files(
    path = dirname(figname),
    pattern = basename(figname),
    recursive = TRUE
) |> length()

if (figs < 4) {
    # data from OpernAlex vie count
    # actors <- readRDS(file.path("tca_corpus", "data", "actors.rds"))

    # data from corpora
    actors <- readRDS(file.path("tca_corpus", "data", "actors_corpus.rds"))


    oa <- openalexR::oa_fetch(
        search = "",
        group_by = "publication_year",
        output = "dataframe",
        verbose = FALSE
    )
    oa_until_1992 <- sum(oa$count[oa$key <= 1992])
    oa_after_1992 <- sum(oa$count[oa$key > 1992])

    data <- lapply(
        names(actors),
        function(actor) {
            data.frame(
                Actor = actor,
                Count = actors[[actor]]$count,
                Count_until_1992 = sum(actors[[actor]]$years$count[actors[[actor]]$years$publication_year <= 1992]),
                Count_after_1992 = sum(actors[[actor]]$years$count[actors[[actor]]$years$publication_year > 1992])
            )
        }
    ) |>
        do.call(what = rbind) |>
        dplyr::mutate(
            Count_until_1992 = Count_until_1992 / oa_until_1992,
            Count_after_1992 = Count_after_1992 / oa_after_1992,
        ) |>
        dplyr::group_by(
            Actor
        ) |>
        dplyr::mutate(
            Count_until_1992_p = Count_until_1992 / sum(Count_until_1992 + Count_after_1992),
            Count_after_1992_p = Count_after_1992 / sum(Count_until_1992 + Count_after_1992)
        )


    figure <- data |>
        tidyr::pivot_longer(
            cols = c(Count_until_1992, Count_after_1992),
            names_to = "Period",
            values_to = "Count_year"
        ) |>
        # Create the plot
        ggplot(aes(x = Actor, y = Count_year, fill = Period)) +
        geom_bar(stat = "identity") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
        labs(x = "Actors", y = "Count", fill = "Period")


    ggplot2::ggsave(
        paste0(figname, ".pdf"),
        width = 12,
        height = 18,
        figure
    )
    ggplot2::ggsave(
        paste0(figname, ".png"),
        width = 12,
        height = 18,
        figure
    )

    figure <- data |>
        tidyr::pivot_longer(
            cols = c(Count_until_1992_p, Count_after_1992_p),
            names_to = "Period",
            values_to = "Count_p_year"
        ) |>
        # Create the plot
        ggplot(aes(x = Actor, y = Count_p_year, fill = Period)) +
        geom_bar(stat = "identity") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
        labs(x = "Actor", y = "Count", fill = "Period")


    ggplot2::ggsave(
        paste0(figname, "_p.pdf"),
        width = 12,
        height = 18,
        figure
    )
    ggplot2::ggsave(
        paste0(figname, "_p.png"),
        width = 12,
        height = 18,
        figure
    )
}

```

![The graph shows the by overall research output corrected publications before and after 1992 for each `Actor`](`r file.path("tca_corpus", "figures", "actors_time_split.png")`)

To download high resolution, [click here](`r file.path("tca_corpus", "figures", "actors_time_split.pdf")`){target="_blank"}

![The second graph shows the proportion of the by overall research output corrected publications before and after 1992 for each `Actor`](`r file.path("tca_corpus", "figures", "actors_time_split_p.png")`)

To download high resolution, [click here](`r file.path("tca_corpus", "figures", "actors_time_split_p.pdf")`){target="_blank"}

<!-- #### [ ] Actors Works in Case Corpus -->

```{r}
#| label: actors_cases_table
#| eval: false

readRDS(file.path("tca_corpus", "data", "actors_case.rds"))$actors_case |>
    dplyr::mutate(
        p_cases = round(p_cases, 4)
    ) |>
    IPBES.R::table_dt(
        fn = "actors_cases",
        fixedColumns = list(leftColumns = 0)
    )
```


#### Matrix showing overlap of Actors

The size of the nodes indicates the number of works in each node, while the thicknes of the edges (lines) indicates the proportion of overlapping works of the maximum possible, i.e. $p_{i,j} = \frac{n_{\text{links}(i,j)}}{\min(n_i, n_j)}$


The nodes can be moved and if selected, the links to / from that node are highlighted.

```{r}
#| label: actors_overlap_figure
#|

fn <- file.path("tca_corpus", "figures", "actors_interactive_matrix")

if (!file.exists(paste0(fn, ".html"))) {
    data <- readRDS(file.path("tca_corpus", "data", "actors_overlap.rds"))

    value <- diag(data)

    for (i in 1:ncol(data)) {
        for (j in 1:nrow(data)) {
            data[i, j] <- data[i, j] / min(value[i], value[j])
        }
    }


    # Create a graph from the adjacency matrix
    g <- igraph::graph_from_adjacency_matrix(
        data,
        mode = "max",
        weighted = TRUE,
        diag = FALSE
    )

    # Convert the graph to a data frame
    nodes <- data.frame(
        id = igraph::V(g)$name,
        label = igraph::V(g)$name,
        value = value
    )
    edges <- igraph::as_data_frame(
        g,
        what = "edges"
    )

    names(edges) <- c("from", "to", "width")
    edges$width <- (edges$width / max(edges$width)) * 20

    nodes$sector <- sapply(strsplit(nodes$label, " \\|\\|\\| "), `[`, 1)
    nodes$actor_group <- sapply(strsplit(nodes$label, " \\|\\|\\| "), `[`, 2)

    colors <- rainbow(length(nodes$sector)) # Generate a color for each sector
    names(colors) <- nodes$sector
    nodes$color <- colors[nodes$sector]

    nodes$label <- nodes$option

    # Calculate grid positions for the nodes
    node_x <- 1:length(unique(nodes$sector))
    names(node_x) <- unique(nodes$sector)
    nodes$x <- node_x[nodes$sector] * 300

    node_y <- 1:length(unique(nodes$actor_group))
    offset <- 0
    for (sector in unique(nodes$sector)) {
        id_s <- nodes$sector == sector
        names(node_y) <- unique(nodes$actor_group[id_s])
        nodes$y[id_s] <- (node_y[nodes$actor_group[id_s]] + offset) * 200
        if (offset == 0.5) {
            offset <- 0
        } else {
            offset <- 0.5
        }
    }

    # Center the nodes
    nodes$x <- nodes$x - max(nodes$x) / 2
    nodes$y <- nodes$y - max(nodes$y) / 2

    nodes$label <- nodes$actor_group

    # Create the interactive network graph

    p <- visNetwork(
        nodes,
        edges,
        height = "900px",
        width = "1800px"
    ) |>
        visNodes(
            x = nodes$x,
            y = nodes$y,
            fixed = FALSE,
            physics = FALSE,
            font = list(
                size = 24
            )
        ) |>
        visEdges(
            physics = TRUE,
            color = list(
                color = "lightgray",
                highlight = "black"
            )
        ) |>
        visPhysics(
            solver = "barnesHut",
            timestep = 0.1,
            stabilization = list(
                iterations = 1000
            )
        )

    htmlwidgets::saveWidget(
        p,
        file = paste0(fn, ".html")
    )
    webshot::webshot(paste0(fn, ".html"), file = paste0(fn, ".pdf"), delay = 3)
    unlink(paste0(fn, "_files"), recursive = TRUE)
}
```

<iframe src="tca_corpus/figures/actors_interactive_matrix.html" width="100%" height="800px"></iframe>

To open the interactive graph or to download it, [click here](tca_corpus/figures/actors_interactive_matrix.html){target="_blank"}

To download high resolution pdf, [click here](`r file.path("tca_corpus", "figures", "actors_interactive_matrix.pdf")`){target="_blank"}


```{r}
#| label: actors_overlap_table
#|

readRDS(file.path("tca_corpus", "data", "actors_overlap.rds")) |>
    IPBES.R::table_dt(
        fn = "actors_overlap",
        fixedColumns = list(leftColumns = 0)
    )
```


#### [x] Matrix showing overlap of Actors Cases Only

The size of the nodes indicates the number of works in each node, while the thicknes of the edges (lines) indicates the proportion of overlapping works of the maximum possible, i.e. $p_{i,j} = \frac{n_{\text{links}(i,j)}}{\min(n_i, n_j)}$


The nodes can be moved and if selected, the links to / from that node are highlighted.

```{r}
#| label: actors_overlap_cases_figure
#|

fn <- file.path("tca_corpus", "figures", "actors_interactive_matrix_cases")

if (!file.exists(paste0(fn, ".html"))) {
    data <- readRDS(file.path("tca_corpus", "data", "actors_overlap_cases.rds"))

    value <- diag(data)

    for (i in 1:ncol(data)) {
        for (j in 1:nrow(data)) {
            data[i, j] <- data[i, j] / min(value[i], value[j])
        }
    }


    # Create a graph from the adjacency matrix
    g <- igraph::graph_from_adjacency_matrix(
        data,
        mode = "max",
        weighted = TRUE,
        diag = FALSE
    )

    # Convert the graph to a data frame
    nodes <- data.frame(
        id = igraph::V(g)$name,
        label = igraph::V(g)$name,
        value = value
    )
    edges <- igraph::as_data_frame(
        g,
        what = "edges"
    )

    names(edges) <- c("from", "to", "width")
    edges$width <- (edges$width / max(edges$width)) * 20

    nodes$sector <- sapply(strsplit(nodes$label, " \\|\\|\\| "), `[`, 1)
    nodes$actor_group <- sapply(strsplit(nodes$label, " \\|\\|\\| "), `[`, 2)

    colors <- rainbow(length(nodes$sector)) # Generate a color for each sector
    names(colors) <- nodes$sector
    nodes$color <- colors[nodes$sector]

    nodes$label <- nodes$option

    # Calculate grid positions for the nodes
    node_x <- 1:length(unique(nodes$sector))
    names(node_x) <- unique(nodes$sector)
    nodes$x <- node_x[nodes$sector] * 300

    node_y <- 1:length(unique(nodes$actor_group))
    offset <- 0
    for (sector in unique(nodes$sector)) {
        id_s <- nodes$sector == sector
        names(node_y) <- unique(nodes$actor_group[id_s])
        nodes$y[id_s] <- (node_y[nodes$actor_group[id_s]] + offset) * 200
        if (offset == 0.5) {
            offset <- 0
        } else {
            offset <- 0.5
        }
    }

    # Center the nodes
    nodes$x <- nodes$x - max(nodes$x) / 2
    nodes$y <- nodes$y - max(nodes$y) / 2

    nodes$label <- nodes$actor_group

    # Create the interactive network graph

    p <- visNetwork(
        nodes,
        edges,
        height = "900px",
        width = "1800px"
    ) |>
        visNodes(
            x = nodes$x,
            y = nodes$y,
            fixed = FALSE,
            physics = FALSE,
            font = list(
                size = 24
            )
        ) |>
        visEdges(
            physics = TRUE,
            color = list(
                color = "lightgray",
                highlight = "black"
            )
        ) |>
        visPhysics(
            solver = "barnesHut",
            timestep = 0.1,
            stabilization = list(
                iterations = 1000
            )
        )

    htmlwidgets::saveWidget(
        p,
        file = paste0(fn, ".html")
    )
    webshot::webshot(paste0(fn, ".html"), file = paste0(fn, ".pdf"), delay = 3)
    unlink(paste0(fn, "_files"), recursive = TRUE)
}
```

<iframe src="tca_corpus/figures/actors_interactive_matrix_cases.html" width="100%" height="800px"></iframe>

To open the interactive graph or to download it, [click here](tca_corpus/figures/actors_interactive_matrix_cases.html){target="_blank"}

To download high resolution pdf, [click here](`r file.path("tca_corpus", "figures", "actors_interactive_matrix_cases.pdf")`){target="_blank"}


```{r}
#| label: actors_overlap_cases_table
#|

readRDS(file.path("tca_corpus", "data", "actors_overlap_cases.rds")) |>
    IPBES.R::table_dt(
        fn = "actors_overlap",
        fixedColumns = list(leftColumns = 1)
    )
```


## Actors x Stategies / Options

```{r}
#| label: actors_strategies_options_overlap_data

fn <- file.path("tca_corpus", "data", "actors_strategies_options_overlap.rds")

if (!file.exists(fn)) {
    actors_ids <- file.path("tca_corpus", "data", "actors_ids.rds") |> readRDS()
    strategies_options_ids <- file.path("tca_corpus", "data", "strategies_options_ids.rds") |> readRDS()
    sapply(actors_ids, function(actor) {
        sapply(strategies_options_ids, function(strategy_option) {
            length(intersect(actor, strategy_option))
        })
    }) |>
        saveRDS(file = fn)
}
```

```{r}
#| label: actors_strategies_options_overlap_table
readRDS(file.path("tca_corpus", "data", "actors_strategies_options_overlap.rds")) |>
    IPBES.R::table_dt(
        fn = "actors_strategies_options_overlap",
        fixedColumns = list(leftColumns = 1)
    )
```

The colour indicates s indicates the proportion of overlapping works of the maximum possible, i.e. $p_{i,j} = \frac{n_{\text{links}(i,j)}}{\min(n_i, n_j)}$

```{r}
#| label: actors_strategies_options_overlap_figure
#|

fn <- file.path("tca_corpus", "figures", "actors_strategies_options_matrix")

n <- list.files(
    path = dirname(fn),
    pattern = basename(fn),
    recursive = TRUE
) |> length()

plot_heatmap <- function(data, actors_ids, strategies_options_ids, plot_data = NULL) {
    if (is.null(plot_data)) {
        actors_count <- sapply(
            actors_ids,
            length
        )
        strategies_options_count <- sapply(
            strategies_options_ids,
            length
        )

        for (i in 1:length(strategies_options_count)) {
            for (j in 1:length(actors_count)) {
                data[i, j] <- data[i, j] / min(strategies_options_count[i], actors_count[j])
            }
        }
    } else {
        data <- plot_data
    }

    sector <- strsplit(colnames(data), split = "\\ \\|\\|\\|\\ ") |> sapply(FUN = "[[", 1)
    strategy <- strsplit(rownames(data), split = "\\ \\|\\|\\|\\ ") |> sapply(FUN = "[[", 1)

    colnames(data) <- strsplit(colnames(data), split = "\\ \\|\\|\\|\\ ") |>
        sapply(FUN = "[[", 2)
    rownames(data) <- strsplit(rownames(data), split = "\\ \\|\\|\\|\\ ") |>
        sapply(FUN = "[[", 2)

    dat_ggplot <- as.data.frame(as.table(data)) |>
        dplyr::rename("y" = Var1, "x" = Var2, "Proportion" = Freq)

    levels(dat_ggplot$y) <- rownames(data) |>
        gsub(pattern = "Action ", replacement = "")

    p <- dat_ggplot |>
        ggplot(
            aes(x = x, y = y, fill = Proportion)
        ) +
        geom_tile() +
        scale_fill_gradient2(
            low = "#196C71", # Blue color for negative values
            mid = "#f7f7f7", # Light beige color for zero
            high = "#B65719", # Red color for positive values
            midpoint = 0 # Set the midpoint to zero
        ) +
        theme(
            axis.text.x = element_text(angle = 90, hjust = 1)
        ) +
        ggplot2::ylab("Strategies / Actions") +
        ggplot2::xlab("Actors")

    vline <- c(0, diff(as.numeric(factor(sector))) != 0)
    hline <- c(0, diff(as.numeric(factor(strategy))) != 0)

    p <- p + geom_vline(xintercept = 0.5, color = "black")
    for (i in vline * 1:(length(vline))) {
        if (i > 0) {
            p <- p + geom_vline(xintercept = i - 0.5, color = "black")
        }
    }
    p <- p + geom_vline(xintercept = length(vline) + 0.5, color = "black")

    p <- p + geom_hline(yintercept = 0.5, color = "black")
    for (i in hline * 1:(length(hline))) {
        if (i > 0) {
            p <- p + geom_hline(yintercept = i - 0.5, color = "black")
        }
    }
    p <- p + geom_hline(yintercept = length(hline) + 0.5, color = "black")


    text_data <- data.frame(
        text_y = c(24, 19.5, 15, 9, 3.5),
        label = c(
            "1. Place-based actions",
            "2. Changing direct drivers",
            "3. Transforming economic systems",
            "4. Transforming governance systems",
            "5. Shifting vies, values, and paradigms"
        )
    )
    for (i in 1:nrow(text_data)) {
        p <- p +
            geom_text(
                y = text_data$text_y[[i]],
                x = 0.6,
                label = text_data$label[[i]],
                size = 3,
                hjust = 0
            )
    }

    text_data <- data.frame(
        text_x = c(3.5, 8, 11.5, 15.5),
        label = c(
            "Civil society",
            "Private sector",
            "Government sector",
            "Communication and\nknowledge sector"
        )
    )
    for (i in 1:nrow(text_data)) {
        p <- p +
            geom_text(
                y = 25.5,
                x = text_data$text_x[[i]],
                label = text_data$label[[i]],
                size = 3,
                hjust = "middle"
            )
    }

    p <- p +
        scale_y_discrete(
            limits = rev(levels(dat_ggplot$y))
        )

    return(p)
}

if (n < 12) {
    p <- plot_heatmap(
        data = readRDS(file.path("tca_corpus", "data", "actors_strategies_options_overlap.rds")),
        actors_ids = readRDS(file.path("tca_corpus", "data", "actors_ids.rds")),
        strategies_options_ids = readRDS(file.path("tca_corpus", "data", "strategies_options_ids.rds"))
    )

    saveRDS(p, paste0(fn, ".ggp"))

    ggplot2::ggsave(
        plot = p,
        paste0(fn, ".pdf"),
        width = 12,
        height = 6
    )
    ggplot2::ggsave(
        plot = p,
        paste0(fn, ".svg"),
        width = 12,
        height = 6
    )
    ggplot2::ggsave(
        plot = p,
        paste0(fn, ".png"),
        width = 12,
        height = 6
    )
}
```


![](`r file.path("tca_corpus", "figures", "actors_strategies_options_matrix.png")`)

To download high resolution, [click here](tca_corpus/figures/actors_strategies_options_matrix.pdf){target="_blank"}


## [x] Actors x Stategies / Options Cases Only

```{r}
#| label: actors_strategies_options_overlap_cases_data

fn <- file.path("tca_corpus", "data", "actors_strategies_options_overlap_cases.rds")

if (!file.exists(fn)) {
    actors_ids <- file.path("tca_corpus", "data", "actors_ids_cases.rds") |>
        readRDS()

    strategies_options_ids <- file.path("tca_corpus", "data", "strategies_options_ids_cases.rds") |>
        readRDS()

    sapply(actors_ids, function(actor) {
        sapply(strategies_options_ids, function(strategy_option) {
            length(intersect(actor, strategy_option))
        })
    }) |>
        saveRDS(file = fn)
}
```

```{r}
#| label: actors_strategies_options_overlap_cases_table
readRDS(file.path("tca_corpus", "data", "actors_strategies_options_overlap_cases.rds")) |>
    IPBES.R::table_dt(
        fn = "actors_strategies_options_overlap_cases",
        fixedColumns = list(leftColumns = 1)
    )
```

The colour indicates s indicates the proportion of overlapping works of the maximum possible, i.e. $p_{i,j} = \frac{n_{\text{links}(i,j)}}{\min(n_i, n_j)}$

```{r}
#| label: actors_strategies_options_overlap_cases_figure
#|

fn <- file.path("tca_corpus", "figures", "actors_strategies_options_matrix_cases")

n <- list.files(
    path = dirname(fn),
    pattern = basename(fn),
    recursive = TRUE
) |> length()

if (n < 8) {
    p <- plot_heatmap(
        data = readRDS(file.path("tca_corpus", "data", "actors_strategies_options_overlap_cases.rds")),
        actors_ids = readRDS(file.path("tca_corpus", "data", "actors_ids_cases.rds")),
        strategies_options_ids = readRDS(file.path("tca_corpus", "data", "strategies_options_ids_cases.rds"))
    )

    saveRDS(p, paste0(fn, ".ggp"))

    ggplot2::ggsave(
        plot = p,
        paste0(fn, ".pdf"),
        width = 12,
        height = 6
    )
    ggplot2::ggsave(
        plot = p,
        paste0(fn, ".svg"),
        width = 12,
        height = 6
    )
    ggplot2::ggsave(
        plot = p,
        paste0(fn, ".png"),
        width = 12,
        height = 6
    )
}
```


![](`r file.path("tca_corpus", "figures", "actors_strategies_options_matrix_cases.png")`)

To download high resolution, [click here](tca_corpus/figures/actors_strategies_options_matrix_cases.pdf){target="_blank"}


## Differenct Cases / all


The colour indicates s indicates the proportion of overlapping works of the maximum possible, i.e. $p_{i,j} = \frac{n_{\text{links}(i,j)}}{\min(n_i, n_j)}$

```{r}
#| label: actors_strategies_options_overlap_difference_figure
#|

fn <- file.path("tca_corpus", "figures", "actors_strategies_options_matrix_difference")

n <- list.files(
    path = dirname(fn),
    pattern = basename(fn),
    recursive = TRUE
) |> length()

if (n < 4) {
    data <- readRDS(file.path("tca_corpus", "data", "actors_strategies_options_overlap.rds"))

    actors_count <- sapply(
        readRDS(file.path("tca_corpus", "data", "actors_ids.rds")),
        length
    )
    strategies_options_count <- sapply(
        readRDS(file.path("tca_corpus", "data", "strategies_options_ids.rds")),
        length
    )

    for (i in 1:length(strategies_options_count)) {
        for (j in 1:length(actors_count)) {
            data[i, j] <- data[i, j] / min(strategies_options_count[i], actors_count[j])
        }
    }

    rm(actors_count, strategies_options_count)

    ####

    data_cases <- readRDS(file.path("tca_corpus", "data", "actors_strategies_options_overlap_cases.rds"))

    actors_count <- sapply(
        readRDS(file.path("tca_corpus", "data", "actors_ids_cases.rds")),
        length
    )
    strategies_options_count <- sapply(
        readRDS(file.path("tca_corpus", "data", "strategies_options_ids_cases.rds")),
        length
    )

    for (i in 1:length(strategies_options_count)) {
        for (j in 1:length(actors_count)) {
            data_cases[i, j] <- data_cases[i, j] / min(strategies_options_count[i], actors_count[j])
        }
    }
    rm(actors_count, strategies_options_count)

    ####

    data <- data - data_cases
    rm(data_cases)

    ####


    p <- plot_heatmap(
        plot_data = data
    )

    saveRDS(p, paste0(fn, ".ggp"))

    ggplot2::ggsave(
        plot = p,
        paste0(fn, ".pdf"),
        width = 12,
        height = 6
    )
    ggplot2::ggsave(
        plot = p,
        paste0(fn, ".svg"),
        width = 12,
        height = 6
    )
    ggplot2::ggsave(
        plot = p,
        paste0(fn, ".png"),
        width = 12,
        height = 6
    )
}
```


![](`r file.path("tca_corpus", "figures", "actors_strategies_options_matrix_difference.png")`)

To download high resolution, [click here](tca_corpus/figures/actors_strategies_options_matrix_difference.pdf){target="_blank"}

## Combined Actors x Stategies Panel

```{r}
#| label: actors_strategies_options_combined_panel
#|

p_tca <- readRDS(file.path("tca_corpus", "figures", "actors_strategies_options_matrix.ggp")) +
    theme(
        legend.position = "none"
    )
p_cases <- readRDS(file.path("tca_corpus", "figures", "actors_strategies_options_matrix_cases.ggp")) +
    theme(
        axis.title.y = element_blank(),
        axis.text.y = element_blank()
    )
p_diff <- readRDS(file.path("tca_corpus", "figures", "actors_strategies_options_matrix_difference.ggp"))

p <- (p_tca | p_cases) /
    p_diff +
    plot_layout(
        heights = c(0.5, 1)
    )

####

fn <- file.path("tca_corpus", "figures", "actors_strategies_options_combined_panel")

saveRDS(
    p, 
    paste0(fn, ".ggp")
)

ggplot2::ggsave(
    plot = p,
    paste0(fn, ".pdf"),
    width = 17,
    height = 18
)
ggplot2::ggsave(
    plot = p,
    paste0(fn, ".svg"),
    width = 17,
    height = 18
)
ggplot2::ggsave(
    plot = p,
    paste0(fn, ".png"),
    width = 17,
    height = 18
)

```

![](`r file.path("tca_corpus", "figures", "actors_strategies_options_combined_panel.png")`)

To download high resolution pdf, [click here](tca_corpus/figures/actors_strategies_options_combined_panel.pdf){target="_blank"}

To download high resolution svg, [click here](tca_corpus/figures/actors_strategies_options_combined_panel.pdf){target="_blank"}

## Works not mentioning Actors and Strategies / Options

```{r}
#| label: works_not_mentioning_actors_strategies_options
#| 

ids <- list(
    strategies = file.path("tca_corpus", "data", "strategies_options_ids.rds") |>
        readRDS() |>
        unlist(),
    actors = file.path("tca_corpus", "data", "actors_ids.rds") |>
        readRDS() |>
        unlist(),
    strategies_cases = file.path("tca_corpus", "data", "strategies_options_ids_cases.rds") |>
        readRDS() |>
        unlist(),
    actors_cases = file.path("tca_corpus", "data", "actors_ids_cases.rds") |>
        readRDS() |>
        unlist()
)

list(
    data.frame(
        type = "TCA Corpus not in Strategies / Options",
        not_in_corpus = params$corpus_dir |>
            corpus_read() |>
            distinct(id) |>
            filter(!id %in% ids$strategies) |>
            collect() |>
            nrow(),
        corpus_size = params$corpus_dir |>
            corpus_read() |>
            distinct(id) |>
            collect() |>
            nrow()
    ),
    data.frame(
        type = "TCA Corpus not in Actors",
        not_in_corpus = params$corpus_dir |>
            corpus_read() |>
            distinct(id) |>
            filter(!id %in% ids$actors) |>
            collect() |>
            nrow(),
        corpus_size = params$corpus_dir |>
            corpus_read() |>
            distinct(id) |>
            collect() |>
            nrow()
    ),
    data.frame(
        type = "TCA Corpus not in neither",
        not_in_corpus = params$corpus_dir |>
            corpus_read() |>
            distinct(id) |>
            filter(!id %in% c(ids$strategies, ids$actors)) |>
            collect() |>
            nrow(),
        corpus_size = params$corpus_dir |>
            corpus_read() |>
            distinct(id) |>
            collect() |>
            nrow()
    ),
    data.frame(
        type = "Cases Corpus not in Strategies / Options",
        not_in_corpus = params$corpus_cases_dir |>
            corpus_read() |>
            distinct(id) |>
            filter(!id %in% ids$strategies_cases) |>
            collect() |>
            nrow(),
        corpus_size = params$corpus_cases_dir |>
            corpus_read() |>
            distinct(id) |>
            collect() |>
            nrow()
    ),
    data.frame(
        type = "Cases Corpus not in Actors",
        not_in_corpus = params$corpus_cases_dir |>
            corpus_read() |>
            distinct(id) |>
            filter(!id %in% ids$actors_cases) |>
            collect() |>
            nrow(),
        corpus_size = params$corpus_cases_dir |>
            corpus_read() |>
            distinct(id) |>
            collect() |>
            nrow()
    ),
    data.frame(
        type = "Cases Corpus not in neither",
        not_in_corpus = params$corpus_dir |>
            corpus_read() |>
            distinct(id) |>
            filter(!id %in% c(ids$strategies, ids$actors)) |>
            collect() |>
            nrow(),
        corpus_size = params$corpus_cases_dir |>
            corpus_read() |>
            distinct(id) |>
            collect() |>
            nrow()
    )
) |>
    do.call(what = rbind) |>
    mutate(
        proportion = not_in_corpus / corpus_size
    ) |>
    knitr::kable()
```

# Tries

## Dendrograms
### Strategies Dendrogram

```{r}
#| label: strategies_options_tree_data
#|
sts <- data.frame(
    "Strategy" = NA,
    "Option" = NA,
    term = params$sts_strategies_options |>
        gsub(pattern = "\"", replacement = "") |>
        gsub(pattern = " OR$", replacement = "")
)

sts <- sts[sts$term != "", ]
for (i in 1:nrow(sts)) {
    if (grepl("^# ", sts$term[i])) {
        strategy <- gsub("^# ", "", sts$term[i]) |>
            gsub(pattern = "\"", replacement = "") |>
            gsub(pattern = " OR$", replacement = "")
    } else if (grepl("^## ", sts$term[i])) {
        option <- gsub("^## ", "", sts$term[i]) |>
            gsub(pattern = "\"", replacement = "") |>
            gsub(pattern = " OR$", replacement = "")
    } else {
        sts$Strategy[i] <- strategy
        sts$Option[i] <- option
    }
}
sts <- sts[!is.na(sts$Strategy), ]
sts <- sts[!is.na(sts$Option), ]
```

```{r}
#| label: strategies_options_tree
#| eval: true
#|


fn <- file.path("tca_corpus", "figures", "strategies_options_dendrogram_interactive.html")
if (!file.exists(fn)) {
    p <- collapsibleTree::collapsibleTree(
        sts,
        hierarchy = c("Strategy", "Option", "term")
    )

    htmlwidgets::saveWidget(
        p,
        file = fn
    )

    unlink(file.path("tca_corpus", "figures", "strategies_options_dendrogram_interactive_files"), recursive = TRUE)
}
```

<iframe src="tca_corpus/figures/strategies_options_dendrogram_interactive.html" width="100%" height="800px"></iframe>

To open the interactive graph or to download it, [click here](tca_corpus/figures/strategies_options_dendrogram_interactive.html){target="_blank"}

```{r}
#| label: strategies_options_static_tree_plot
#| fig.height: 10
#| fig.width: 10


if (length(list.files(file.path("tca_corpus", "figures"), pattern = "strategies_options_dendrogram_circular")) < 2) {
    strategies_options_tree <- data.frame(
        from = c(rep("x", length(sts$Strategy)), sts$Strategy),
        to = c(sts$Strategy, sts$Option)
    ) |>
        dplyr::distinct()

    vertices <- data.frame(
        names = unique(c(strategies_options_tree$from, strategies_options_tree$to)),
        to = unique(c(strategies_options_tree$from, strategies_options_tree$to))
    )
    vertices$color <- ifelse(vertices$names %in% sts$Strategy, "blue", "red")
    vertices$hjust <- 0.5
    vertices$vjust <- 0
    vertices$vjust[grep("5)", vertices$to)] <- -0.3
    # vertices$hjust[grep("5)", vertices$to)] <- 1
    vertices$vjust[grep("1)", vertices$to)] <- 1
    vertices$to[grep("5)", vertices$to)] <- gsub(", and", ",\nand", vertices$to[grep("5)", vertices$to)])
    vertices$to[grep("5)", vertices$to)] <- gsub("ion and", "ion\nand", vertices$to[grep("5)", vertices$to)])


    tree <- igraph::graph_from_data_frame(
        d = strategies_options_tree,
        vertices = vertices
    )


    p <- ggraph::ggraph(
        tree,
        layout = "dendrogram",
        circular = TRUE
    ) +
        ggraph::geom_edge_diagonal() +
        ggraph::geom_node_label(
            ggplot2::aes(
                label = to,
                colour = color,
                vjust = vjust,
                hjust = hjust
            ),
            angle = 0
        ) +
        ggplot2::ylim(-1, 1) +
        ggplot2::xlim(-1.5, 1.5)

    ggplot2::ggsave(
        plot = p,
        file.path("tca_corpus", "figures", "strategies_options_dendrogram_circular.pdf"),
        width = 12,
        height = 6
    )
    ggplot2::ggsave(
        plot = p,
        file.path("tca_corpus", "figures", "strategies_options_dendrogram_circular.png"),
        width = 12,
        height = 6
    )
}
```

![](`r file.path("tca_corpus", "figures", "strategies_options_dendrogram_circular.png")`)

To download high resolution, [click here](tca_corpus/figures/strategies_options_dendrogram_circular.pdf`){target="_blank"}

### Actors Dendrogram

```{r}
#| label: actors_tree_data
#|
sts <- data.frame(
    "Sector" = NA,
    "Actor_Group" = NA,
    term = params$sts_actors |>
        gsub(pattern = "\"", replacement = "") |>
        gsub(pattern = " OR$", replacement = "")
)

sts <- sts[sts$term != "", ]
for (i in 1:nrow(sts)) {
    if (grepl("^# ", sts$term[i])) {
        sector <- gsub("^# ", "", sts$term[i]) |>
            gsub(pattern = "\"", replacement = "") |>
            gsub(pattern = " OR$", replacement = "")
    } else if (grepl("^## ", sts$term[i])) {
        actor <- gsub("^## ", "", sts$term[i]) |>
            gsub(pattern = "\"", replacement = "") |>
            gsub(pattern = " OR$", replacement = "")
    } else {
        sts$Sector[i] <- sector
        sts$Actor_Group[i] <- actor
    }
}
sts <- sts[!is.na(sts$Sector), ]
sts <- sts[!is.na(sts$Actor_Group), ]
```

```{r}
#| label: actors_tree
#| eval: true
#|


fn <- file.path("tca_corpus", "figures", "actors_dendrogram_interactive.html")
if (!file.exists(fn)) {
    p <- collapsibleTree::collapsibleTree(
        sts,
        hierarchy = c("Sector", "Actor_Group", "term")
    )

    htmlwidgets::saveWidget(
        p,
        file = fn
    )

    unlink(file.path("tca_corpus", "figures", "actors_dendrogram_interactive_files"), recursive = TRUE)
}
```

<iframe src="tca_corpus/figures/actors_dendrogram_interactive.html" width="100%" height="800px"></iframe>

To open the interactive graph or to download it, [click here](tca_corpus/figures/actors_dendrogram_interactive.html){target="_blank"}

```{r}
#| label: actors_static_tree_plot
#| fig.height: 10
#| fig.width: 10


if (length(list.files(file.path("tca_corpus", "figures"), pattern = "actors_dendrogram_circular")) < 2) {
    actors_tree <- data.frame(
        from = c(rep("x", length(sts$Sector)), sts$Sector),
        to = c(sts$Sector, sts$Actor_Group)
    ) |>
        dplyr::distinct()

    vertices <- data.frame(
        names = unique(c(actors_tree$from, actors_tree$to)),
        to = unique(c(actors_tree$from, actors_tree$to))
    )
    vertices$color <- ifelse(vertices$names %in% sts$Sector, "blue", "red")
    vertices$hjust <- 0.5
    vertices$vjust <- 0
    vertices$vjust[grep("5)", vertices$to)] <- -0.3
    # vertices$hjust[grep("5)", vertices$to)] <- 1
    vertices$vjust[grep("1)", vertices$to)] <- 1
    vertices$to[grep("5)", vertices$to)] <- gsub(", and", ",\nand", vertices$to[grep("5)", vertices$to)])
    vertices$to[grep("5)", vertices$to)] <- gsub("ion and", "ion\nand", vertices$to[grep("5)", vertices$to)])


    tree <- igraph::graph_from_data_frame(
        d = actors_tree,
        vertices = vertices
    )


    p <- ggraph::ggraph(
        tree,
        layout = "dendrogram",
        circular = TRUE
    ) +
        ggraph::geom_edge_diagonal() +
        ggraph::geom_node_label(
            ggplot2::aes(
                label = to,
                colour = color,
                vjust = vjust,
                hjust = hjust
            ),
            angle = 0
        ) +
        ggplot2::ylim(-1, 1) +
        ggplot2::xlim(-1.5, 1.5)

    ggplot2::ggsave(
        plot = p,
        file.path("tca_corpus", "figures", "actors_dendrogram_circular.pdf"),
        width = 12,
        height = 6
    )
    ggplot2::ggsave(
        plot = p,
        file.path("tca_corpus", "figures", "actors_dendrogram_circular.png"),
        width = 12,
        height = 6
    )
}
```

![](`r file.path("tca_corpus", "figures", "actors_dendrogram_circular.png")`)

To download high resolution, [click here](tca_corpus/figures/actors_dendrogram_circular.pdf`){target="_blank"}