Skip to content

Commit

Permalink
first vignette
Browse files Browse the repository at this point in the history
  • Loading branch information
BERENZ committed Nov 5, 2023
1 parent 031c2d2 commit 7379a16
Show file tree
Hide file tree
Showing 10 changed files with 116 additions and 16 deletions.
2 changes: 1 addition & 1 deletion .Rproj.user/E3DB6272/pcs/files-pane.pper
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@
"ascending": true
}
],
"path": "~/git/nauka/ncn-foreigners/software/blocking/inst/tinytest"
"path": "~/git/nauka/ncn-foreigners/software/blocking/R"
}
2 changes: 1 addition & 1 deletion .Rproj.user/E3DB6272/pcs/source-pane.pper
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"activeTab": 0,
"activeTab": 1,
"activeTabSourceWindow0": 0
}
4 changes: 2 additions & 2 deletions .Rproj.user/E3DB6272/pcs/windowlayoutstate.pper
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
"windowheight": 806
},
"right": {
"splitterpos": 397,
"topwindowstate": "NORMAL",
"splitterpos": 512,
"topwindowstate": "MINIMIZE",
"panelheight": 768,
"windowheight": 806
}
Expand Down
2 changes: 1 addition & 1 deletion .Rproj.user/E3DB6272/pcs/workbench-pane.pper
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"TabSet1": 3,
"TabSet2": 0,
"TabSet2": 3,
"TabZoom": {}
}
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,6 @@ importFrom(mlpack,knn)
importFrom(mlpack,lsh)
importFrom(text2vec,create_dtm)
importFrom(text2vec,create_vocabulary)
importFrom(text2vec,itoken)
importFrom(text2vec,itoken_parallel)
importFrom(text2vec,vocab_vectorizer)
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
5. initial support for the `reclin2` package.
6. class `blocking` introduced.
7. s3method for printing.
8. first vignette added.
31 changes: 24 additions & 7 deletions R/blocking.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#' Imports
#' @importFrom text2vec itoken
#' @importFrom text2vec itoken_parallel
#' @importFrom text2vec create_vocabulary
#' @importFrom text2vec vocab_vectorizer
Expand Down Expand Up @@ -108,11 +109,20 @@ blocking <- function(x,
if (verbose %in% 1:2) cat("===== creating tokens =====\n")

## tokens for x
l_tokens <- text2vec::itoken(
iterable = x,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles),
n_chunks = control_txt$n_chunks,
progressbar = verbose)
if (.Platform$OS.type == "unix") {
l_tokens <- text2vec::itoken_parallel(
iterable = x,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles),
n_chunks = control_txt$n_chunks,
progressbar = verbose)
} else {
l_tokens <- text2vec::itoken(
iterable = x,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles),
n_chunks = control_txt$n_chunks,
progressbar = verbose)
}


l_voc <- text2vec::create_vocabulary(l_tokens)
l_vec <- text2vec::vocab_vectorizer(l_voc)
Expand All @@ -122,12 +132,19 @@ blocking <- function(x,
if (is.null(y_default)) {
l_dtm_y <- l_dtm
} else {
l_tokens_y <- text2vec::itoken(
if (.Platform$OS.type == "unix") {
l_tokens_y <- text2vec::itoken_parallel(
iterable = y,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles),
n_chunks = control_txt$n_chunks,
progressbar = verbose)

} else {
l_tokens_y <- text2vec::itoken(
iterable = y,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles),
n_chunks = control_txt$n_chunks,
progressbar = verbose)
}
l_voc_y <- text2vec::create_vocabulary(l_tokens_y)
l_vec_y <- text2vec::vocab_vectorizer(l_voc_y)
l_dtm_y <- text2vec::create_dtm(l_tokens_y, l_vec_y)
Expand Down
2 changes: 1 addition & 1 deletion R/methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ print.blocking <- function(x,...) {
cat("Number of columns used for blocking: ", NROW(x$colnames), ".\n",sep="")
cat("Distribution of the size of the blocks:")

print(table(table(unique(blocks_summ))))
print(table(table(blocks_summ)))

invisible(x)
}
2 changes: 1 addition & 1 deletion blocking.Rproj
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ StripTrailingWhitespace: Yes
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
PackageRoxygenize: rd,collate,namespace,vignette
85 changes: 83 additions & 2 deletions vignettes/v1-deduplication.Rmd
Original file line number Diff line number Diff line change
@@ -1,19 +1,100 @@
---
title: "v1-deduplication"
title: "Blocking records for deduplication"
author: "Maciej Beręsewicz"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{v1-deduplication}
%\VignetteIndexEntry{Blocking records for deduplication}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
warning = FALSE,
message = FALSE,
comment = "#>"
)
```

# Setup

Read required packages

```{r setup}
library(blocking)
library(reclin2)
library(data.table)
```

Read the `RLdata500` data used in the [RecordLinkage](https://CRAN.R-project.org/package=RecordLinkage) package from the [dblink](https://github.com/cleanzr/dblink) Github repository.

```{r}
df <- fread("https://raw.githubusercontent.com/cleanzr/dblink/dc3dd0daf55f8a303863423817a0f0042b3c275a/examples/RLdata500.csv")
head(df)
```
This dataset contains `r nrow(df)` rows with `r NROW(unique(df$ent_id))` unique entities.

# Blocking for deduplication

Now we create a new column that concatenates the information in each row.

```{r}
df[, id_count :=.N, ent_id] ## how many times a given unit occurs
df[is.na(fname_c2), fname_c2:=""]
df[is.na(lname_c2), lname_c2:=""]
df[, bm:=sprintf("%02d", bm)] ## add leading zeros to month
df[, bd:=sprintf("%02d", bd)] ## add leading zeros to day
df[, txt:=tolower(paste0(fname_c1,fname_c2,lname_c1,lname_c2,by,bm,bd))]
head(df)
```

In the next step we use the newly created column in the `blocking` function. If we specify `verbose`, we get information about the progress.

```{r}
df_blocks <- blocking(x = df$txt, ann = "hnsw", verbose=TRUE)
```

Results are as follows:

+ based on `RcppHNSW` we created 133 blocks,
+ it was based on 429 columns (2 character shingles),
+ we have 46 blocks of 2 elements, 43 blocks of 3 elements, ..., 1 block of 17 elements.

```{r}
df_blocks
```
Structure of the object is as follows:

+ `result` - a data.table with identifiers and block IDs,
+ `method` - the method used,
+ `metrics` - based on the `igraph::compare` methods for comparing graphs (here NULL),
+ `colnames` - column names used for the comparison.

```{r}
str(df_blocks,1)
```
The resulting data.table has three columns:

+ `x` - Reference dataset (i.e. `df`) -- this may not contain all units of `df`,
+ `y` - query (each row of `df`) -- this will return all units of `df`,
+ `block` -- the block ID.

```{r}
head(df_blocks$result)
```

We add block information to the final dataset.

```{r}
df_block_result <- copy(df_blocks$result[order(y),])
df[, block_id := df_block_result$block]
head(df)
```

Finally, we can check in how many blocks the same entities (`ent_id`) are observed. In our example, all the same entities are in the same blocks.

```{r}
df[, .(uniq_blocks = uniqueN(block_id)), .(ent_id)][, .N, uniq_blocks]
```

0 comments on commit 7379a16

Please sign in to comment.