Skip to content

Commit

Permalink
first vignette
Browse files Browse the repository at this point in the history
  • Loading branch information
BERENZ committed Nov 5, 2023
1 parent 031c2d2 commit 7379a16
Show file tree
Hide file tree
Showing 10 changed files with 116 additions and 16 deletions.
2 changes: 1 addition & 1 deletion .Rproj.user/E3DB6272/pcs/files-pane.pper
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@
"ascending": true
}
],
"path": "~/git/nauka/ncn-foreigners/software/blocking/inst/tinytest"
"path": "~/git/nauka/ncn-foreigners/software/blocking/R"
}
2 changes: 1 addition & 1 deletion .Rproj.user/E3DB6272/pcs/source-pane.pper
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"activeTab": 0,
"activeTab": 1,
"activeTabSourceWindow0": 0
}
4 changes: 2 additions & 2 deletions .Rproj.user/E3DB6272/pcs/windowlayoutstate.pper
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
"windowheight": 806
},
"right": {
"splitterpos": 397,
"topwindowstate": "NORMAL",
"splitterpos": 512,
"topwindowstate": "MINIMIZE",
"panelheight": 768,
"windowheight": 806
}
Expand Down
2 changes: 1 addition & 1 deletion .Rproj.user/E3DB6272/pcs/workbench-pane.pper
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"TabSet1": 3,
"TabSet2": 0,
"TabSet2": 3,
"TabZoom": {}
}
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,6 @@ importFrom(mlpack,knn)
importFrom(mlpack,lsh)
importFrom(text2vec,create_dtm)
importFrom(text2vec,create_vocabulary)
importFrom(text2vec,itoken)
importFrom(text2vec,itoken_parallel)
importFrom(text2vec,vocab_vectorizer)
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
5. initial support for the `reclin2` package.
6. class `blocking` introduced.
7. s3method for printing.
8. first vignette added.
31 changes: 24 additions & 7 deletions R/blocking.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#' Imports
#' @importFrom text2vec itoken
#' @importFrom text2vec itoken_parallel
#' @importFrom text2vec create_vocabulary
#' @importFrom text2vec vocab_vectorizer
Expand Down Expand Up @@ -108,11 +109,20 @@ blocking <- function(x,
if (verbose %in% 1:2) cat("===== creating tokens =====\n")

## tokens for x
l_tokens <- text2vec::itoken(
iterable = x,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles),
n_chunks = control_txt$n_chunks,
progressbar = verbose)
if (.Platform$OS.type == "unix") {
l_tokens <- text2vec::itoken_parallel(
iterable = x,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles),
n_chunks = control_txt$n_chunks,
progressbar = verbose)
} else {
l_tokens <- text2vec::itoken(
iterable = x,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles),
n_chunks = control_txt$n_chunks,
progressbar = verbose)
}


l_voc <- text2vec::create_vocabulary(l_tokens)
l_vec <- text2vec::vocab_vectorizer(l_voc)
Expand All @@ -122,12 +132,19 @@ blocking <- function(x,
if (is.null(y_default)) {
l_dtm_y <- l_dtm
} else {
l_tokens_y <- text2vec::itoken(
if (.Platform$OS.type == "unix") {
l_tokens_y <- text2vec::itoken_parallel(
iterable = y,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles),
n_chunks = control_txt$n_chunks,
progressbar = verbose)

} else {
l_tokens_y <- text2vec::itoken(
iterable = y,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles),
n_chunks = control_txt$n_chunks,
progressbar = verbose)
}
l_voc_y <- text2vec::create_vocabulary(l_tokens_y)
l_vec_y <- text2vec::vocab_vectorizer(l_voc_y)
l_dtm_y <- text2vec::create_dtm(l_tokens_y, l_vec_y)
Expand Down
2 changes: 1 addition & 1 deletion R/methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ print.blocking <- function(x,...) {
cat("Number of columns used for blocking: ", NROW(x$colnames), ".\n",sep="")
cat("Distribution of the size of the blocks:")

print(table(table(unique(blocks_summ))))
print(table(table(blocks_summ)))

invisible(x)
}
2 changes: 1 addition & 1 deletion blocking.Rproj
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ StripTrailingWhitespace: Yes
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
PackageRoxygenize: rd,collate,namespace,vignette
85 changes: 83 additions & 2 deletions vignettes/v1-deduplication.Rmd
Original file line number Diff line number Diff line change
@@ -1,19 +1,100 @@
---
title: "v1-deduplication"
title: "Blocking records for deduplication"
author: "Maciej Beręsewicz"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{v1-deduplication}
%\VignetteIndexEntry{Blocking records for deduplication}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
warning = FALSE,
message = FALSE,
comment = "#>"
)
```

# Setup

Read required packages

```{r setup}
library(blocking)
library(reclin2)
library(data.table)
```

Read the `RLdata500` data used in the [RecordLinkage](https://CRAN.R-project.org/package=RecordLinkage) package from the [dblink](https://github.com/cleanzr/dblink) Github repository.

```{r}
df <- fread("https://raw.githubusercontent.com/cleanzr/dblink/dc3dd0daf55f8a303863423817a0f0042b3c275a/examples/RLdata500.csv")
head(df)
```
This dataset contains `r nrow(df)` rows with `r NROW(unique(df$ent_id))` unique entities.

# Blocking for deduplication

Now we create a new column that concatenates the information in each row.

```{r}
df[, id_count :=.N, ent_id] ## how many times a given unit occurs
df[is.na(fname_c2), fname_c2:=""]
df[is.na(lname_c2), lname_c2:=""]
df[, bm:=sprintf("%02d", bm)] ## add leading zeros to month
df[, bd:=sprintf("%02d", bd)] ## add leading zeros to day
df[, txt:=tolower(paste0(fname_c1,fname_c2,lname_c1,lname_c2,by,bm,bd))]
head(df)
```

In the next step we use the newly created column in the `blocking` function. If we specify `verbose`, we get information about the progress.

```{r}
df_blocks <- blocking(x = df$txt, ann = "hnsw", verbose=TRUE)
```

Results are as follows:

+ based on `RcppHNSW` we created 133 blocks,
+ it was based on 429 columns (2 character shingles),
+ we have 46 blocks of 2 elements, 43 blocks of 3 elements, ..., 1 block of 17 elements.

```{r}
df_blocks
```
Structure of the object is as follows:

+ `result` - a data.table with identifiers and block IDs,
+ `method` - the method used,
+ `metrics` - based on the `igraph::compare` methods for comparing graphs (here NULL),
+ `colnames` - column names used for the comparison.

```{r}
str(df_blocks,1)
```
The resulting data.table has three columns:

+ `x` - Reference dataset (i.e. `df`) -- this may not contain all units of `df`,
+ `y` - query (each row of `df`) -- this will return all units of `df`,
+ `block` -- the block ID.

```{r}
head(df_blocks$result)
```

We add block information to the final dataset.

```{r}
df_block_result <- copy(df_blocks$result[order(y),])
df[, block_id := df_block_result$block]
head(df)
```

Finally, we can check in how many blocks the same entities (`ent_id`) are observed. In our example, all the same entities are in the same blocks.

```{r}
df[, .(uniq_blocks = uniqueN(block_id)), .(ent_id)][, .N, uniq_blocks]
```

0 comments on commit 7379a16

Please sign in to comment.