Commit

update vignettes
BERENZ committed May 4, 2024
1 parent 9f461d4 commit 9a3a4cc
Showing 8 changed files with 159 additions and 25 deletions.
3 changes: 3 additions & 0 deletions .Rproj.user/shared/notebooks/paths
@@ -1,4 +1,5 @@
/Users/berenz/Downloads/Template of Abstract in Latex.tex="A4C7846D"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/DESCRIPTION="019D16E4"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/controls.R="5BC637B7"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_annoy.R="684202BA"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_hnsw.R="A4FAA5A3"
@@ -17,3 +18,5 @@
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/tests/tinytest.R="D6BBCDC1"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v1-deduplication.Rmd="9D34DD44"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v2-reclin.Rmd="289A4D2F"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v3-evaluation.Rmd="E778A54F"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v4-integration.Rmd="E3EFC8F1"
4 changes: 3 additions & 1 deletion DESCRIPTION
@@ -30,5 +30,7 @@ Suggests:
tinytest,
reclin2,
knitr,
rmarkdown
rmarkdown,
fastLink,
RecordLinkage
VignetteBuilder: knitr
29 changes: 18 additions & 11 deletions R/blocking.R
@@ -157,36 +157,46 @@ blocking <- function(x,
if (.Platform$OS.type == "unix") {
x_tokens <- text2vec::itoken_parallel(
iterable = x,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles),
tokenizer = function(x) tokenizers::tokenize_character_shingles(x,
n = control_txt$n_shingles,
lowercase = control_txt$lowercase,
strip_non_alphanum = control_txt$strip_non_alphanum),
n_chunks = control_txt$n_chunks,
progressbar = verbose)
} else {
x_tokens <- text2vec::itoken(
iterable = x,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles),
tokenizer = function(x) tokenizers::tokenize_character_shingles(x,
n = control_txt$n_shingles,
lowercase = control_txt$lowercase,
strip_non_alphanum = control_txt$strip_non_alphanum),
n_chunks = control_txt$n_chunks,
progressbar = verbose)
}


x_voc <- text2vec::create_vocabulary(x_tokens)
x_vec <- text2vec::vocab_vectorizer(x_voc)
x_dtm <- text2vec::create_dtm(x_tokens, x_vec)


if (is.null(y_default)) {
y_dtm <- x_dtm
} else {
if (.Platform$OS.type == "unix") {
y_tokens <- text2vec::itoken_parallel(
iterable = y,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles),
tokenizer = function(x) tokenizers::tokenize_character_shingles(x,
n = control_txt$n_shingles,
lowercase = control_txt$lowercase,
strip_non_alphanum = control_txt$strip_non_alphanum),
n_chunks = control_txt$n_chunks,
progressbar = verbose)
} else {
y_tokens <- text2vec::itoken(
iterable = y,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles),
tokenizer = function(x) tokenizers::tokenize_character_shingles(x,
n = control_txt$n_shingles,
lowercase = control_txt$lowercase,
strip_non_alphanum = control_txt$strip_non_alphanum),
n_chunks = control_txt$n_chunks,
progressbar = verbose)
}
@@ -197,15 +207,13 @@ blocking <- function(x,
}
}


colnames_xy <- intersect(colnames(x_dtm), colnames(y_dtm))

if (verbose %in% 1:2) {
cat(sprintf("===== starting search (%s, x, y: %d, %d, t: %d) =====\n",
ann, nrow(x_dtm), nrow(y_dtm), length(colnames_xy)))
}


x_df <- switch(ann,
"nnd" = method_nnd(x = x_dtm[, colnames_xy],
y = y_dtm[, colnames_xy],
@@ -267,11 +275,10 @@ blocking <- function(x,

x_df[, `:=`(block, x_block[names(x_block) %in% x_df$query_g])]


## if true are given
if (!is.null(true_blocks)) {

setDT(true_blocks)
setDT(true_blocks) ## move it somewhere else

pairs_to_eval <- x_df[y %in% true_blocks$y, c("x", "y", "block")]
pairs_to_eval[true_blocks, on = c("x", "y"), both := TRUE]
@@ -306,7 +313,7 @@ blocking <- function(x,

}

## consider using RcppAlgos::comboGeneral(nrow(pairs_to_eval_long), 2, nThreads=n_threads)
#consider using RcppAlgos::comboGeneral(nrow(pairs_to_eval_long), 2, nThreads=n_threads)
candidate_pairs <- utils::combn(nrow(pairs_to_eval_long), 2)

same_block <- pairs_to_eval_long$block_id[candidate_pairs[1, ]] == pairs_to_eval_long$block_id[candidate_pairs[2, ]]
16 changes: 11 additions & 5 deletions R/controls.R
@@ -69,19 +69,25 @@ controls_ann <- function(
#' @author Maciej Beręsewicz
#'
#' @description
#' Controls for text data used in the \code{blocking} functions
#' Controls for text data used in the \code{blocking} functions, passed to [tokenizers::tokenize_character_shingles].
#'
#' @param n_shingles length of shingles (default 2L), passed to [tokenizers::tokenize_character_shingles],
#' @param n_chunks passed to (default 10L) [tokenizers::tokenize_character_shingles].
#' @param n_shingles length of shingles (default `2L`),
#' @param n_chunks number of chunks (default `10L`),
#' @param lowercase should the characters be made lowercase? (default `TRUE`)
#' @param strip_non_alphanum should punctuation and white space be stripped? (default `TRUE`)
#'
#' @returns Returns a list with parameters.
#'
#' @export
controls_txt <- function(
n_shingles = 2L,
n_chunks = 10L
n_chunks = 10L,
lowercase = TRUE,
strip_non_alphanum = TRUE
) {

list(n_shingles = n_shingles,
n_chunks = n_chunks)
n_chunks = n_chunks,
lowercase = lowercase,
strip_non_alphanum = strip_non_alphanum)
}
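
For illustration, a minimal sketch of how the new text controls could be used. The `control_txt` argument name is an assumption here, inferred from the `control_txt$lowercase` and `control_txt$strip_non_alphanum` calls in `R/blocking.R` above:

```r
library(blocking)

## text-processing controls that keep the original case and punctuation
## when creating character shingles (both arguments are new in this commit)
ctrl <- controls_txt(n_shingles = 2L,
                     lowercase = FALSE,
                     strip_non_alphanum = FALSE)
ctrl

## the resulting list could then be passed to blocking(), e.g.
## blocking(x = census$txt, y = cis$txt, control_txt = ctrl)
```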
4 changes: 2 additions & 2 deletions R/methods.R
@@ -10,7 +10,7 @@ print.blocking <- function(x,...) {
cat("Blocking based on the", x$method, "method.\n")
cat("Number of blocks: ", length(unique(block_ids)), ".\n",sep="")
cat("Number of columns used for blocking: ", NROW(x$colnames), ".\n",sep="")
cat("Reduction ratio: ", round(rr, 4), ".\n",sep="")
cat("Reduction ratio: ", sprintf("%.4f", rr), ".\n",sep="")

cat("========================================================\n")
cat("Distribution of the size of the blocks:")
@@ -20,7 +20,7 @@ print.blocking <- function(x,...) {
if (!is.null(x$metrics)) {
cat("========================================================\n")
cat("Evaluation metrics (standard):\n" )
print(round(x$metrics*100, 4))
sprintf("%.4f", x$metrics*100)

}
invisible(x)
17 changes: 13 additions & 4 deletions man/controls_txt.Rd

Some generated files are not rendered by default.

8 changes: 6 additions & 2 deletions vignettes/v2-reclin.Rmd
@@ -1,6 +1,10 @@
---
title: "Blocking records for record linkage"
author: "Maciej Beręsewicz"
execute:
warning: false
message: false
lang: en
output:
html_vignette:
df_print: kable
@@ -88,7 +92,6 @@ cis[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap,
```



# Linking datasets

## Using basic functionalities of `blocking` package
@@ -110,6 +113,7 @@ Example pairs
```{r}
head(result1$result, n= 10)
```

Let's look at the first pair. Clearly there is a typo in `pername1`, but all other variables are the same, so this appears to be a match.

```{r}
@@ -127,7 +131,7 @@ cis[3901, ]

## Assessing the quality

For some records we have information on the correct linkage. We can use this information to assess our approach.
For some records we have information on the correct linkage. We can use this information to assess our approach, but note that quality assessment is described in detail in a separate vignette.

```{r}
matches <- merge(x = census[, .(x=1:.N, person_id)],
103 changes: 103 additions & 0 deletions vignettes/v4-integration.Rmd
@@ -1,6 +1,10 @@
---
title: "Integration with existing packages"
author: "Maciej Beręsewicz"
execute:
warning: false
message: false
lang: en
output:
html_vignette:
df_print: kable
@@ -20,6 +24,105 @@ knitr::opts_chunk$set(
)
```

# Setup

```{r setup}
library(blocking)
library(reclin2)
library(fastLink)
library(RecordLinkage)
```

# Data

In this example we will use the same datasets as in the *Blocking records for record linkage* vignette.

```{r}
census <- read.csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/census.csv")
cis <- read.csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/cis.csv")
setDT(census)
setDT(cis)
census[is.na(dob_day), dob_day := ""]
census[is.na(dob_mon), dob_mon := ""]
census[is.na(dob_year), dob_year := ""]
cis[is.na(dob_day), dob_day := ""]
cis[is.na(dob_mon), dob_mon := ""]
cis[is.na(dob_year), dob_year := ""]
census[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)]
cis[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)]
census[, x:=1:.N]
cis[, y:=1:.N]
```

# Integration with the `reclin2` package

The package contains the `pair_ann` function, which aims at integration with the `reclin2` package. This function works as follows:

```{r}
pair_ann(x = census[1:1000],
y = cis[1:1000],
on = "txt",
deduplication = FALSE)
```

This provides information on the total number of pairs. It can be further included in the `reclin2` package pipeline.

```{r}
pair_ann(x = census[1:1000],
y = cis[1:1000],
on = "txt",
deduplication = FALSE,
ann = "hnsw") |>
compare_pairs(on = "txt", comparators = list(cmp_jarowinkler())) |>
score_simple("score", on = "txt") |>
select_threshold("threshold", score = "score", threshold = 0.75) |>
link(selection = "threshold") |>
head()
```

# Usage with `fastLink` package

In order to use the results with the `fastLink` package, you need to add block information to the datasets and create appropriate subsets, just as the `blockData` function does.

```{r}
blocks <- blocking(x = census$txt[1:1000],
y = cis$txt[1:1000],
#verbose = 1,
seed = 2024)
census[blocks$result, on = "x", block:=as.integer(i.block)]
cis[blocks$result, on = "y", block:=as.integer(i.block)]
```

Then you can use the `blockData` function.

```{r}
blocked_data <- blockData(dfA = cis[1:1000],
dfB = census[1:1000],
varnames = "block")
```

```{r}
blocked_data$block.3
```

```{r}
dfA_block1 <- cis[1:1000][blocked_data$block.3$dfA.inds,]
dfB_block1 <- census[1:1000][blocked_data$block.3$dfB.inds,]
```
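
These block-level subsets could then be linked with `fastLink()`. A rough sketch, where the variable selection and string-distance settings are illustrative assumptions rather than part of the vignette:

```r
## link the records of block 3 only; varnames must exist in both subsets
fl_block3 <- fastLink(dfA = dfA_block1,
                      dfB = dfB_block1,
                      varnames = c("pername1", "pername2", "sex", "enumcap", "enumpc"),
                      stringdist.match = c("pername1", "pername2", "enumcap", "enumpc"))
summary(fl_block3)
```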


# Usage with `RecordLinkage` package

The same can be done with the `RecordLinkage` package:

```{r}
pairs <- RecordLinkage::compare.linkage(dataset1 = cis[1:500],
dataset2 = census[1:500],
blockfld = list(12),
strcmp = c("pername1", "pername2", "sex", "enumcap", "enumpc"))
summary(pairs)
```
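
As a possible next step, not shown in the commit, the comparison patterns could be weighted and classified with standard `RecordLinkage` tools; the threshold below is an arbitrary assumption:

```r
## compute EpiLink weights for the compared pairs and classify them
pairs_w <- epiWeights(pairs)
links <- epiClassify(pairs_w, threshold.upper = 0.75)
summary(links)
```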


