Skip to content

Commit

Permalink
more robust args checking, new vignettes templates, sparse matrix for…
Browse files Browse the repository at this point in the history
… hnsw
  • Loading branch information
BERENZ committed Nov 18, 2023
1 parent f43a01f commit c3458c7
Show file tree
Hide file tree
Showing 18 changed files with 176 additions and 59 deletions.
2 changes: 1 addition & 1 deletion .Rproj.user/E3DB6272/pcs/source-pane.pper
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"activeTab": 0,
"activeTab": 1,
"activeTabSourceWindow0": 0
}
6 changes: 3 additions & 3 deletions .Rproj.user/E3DB6272/pcs/windowlayoutstate.pper
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
{
"left": {
"splitterpos": 309,
"splitterpos": 216,
"topwindowstate": "NORMAL",
"panelheight": 768,
"windowheight": 806
},
"right": {
"splitterpos": 453,
"topwindowstate": "NORMAL",
"splitterpos": 477,
"topwindowstate": "MAXIMIZE",
"panelheight": 768,
"windowheight": 806
}
Expand Down
2 changes: 1 addition & 1 deletion .Rproj.user/E3DB6272/pcs/workbench-pane.pper
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"TabSet1": 3,
"TabSet2": 0,
"TabSet2": 3,
"TabZoom": {}
}
2 changes: 2 additions & 0 deletions .Rproj.user/E3DB6272/sources/prop/INDEX
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,5 @@
~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Fvignettes%2Fv1-deduplication.Rmd="0DDC09E2"
~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Fvignettes%2Fv2-reclin.Rmd="59CC71B7"
~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Fvignettes%2Fv3-evaluation.Rmd="412A89DC"
~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Fvignettes%2Fv4-integration.Rmd="F09D0DE8"
~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Fvignettes%2Fv5-bigdata.Rmd="738E79A5"
2 changes: 2 additions & 0 deletions .Rproj.user/shared/notebooks/paths
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,5 @@
/Users/berenz/git/nauka/ncn-foreigners/software/blocking/vignettes/v1-deduplication.Rmd="50261E1C"
/Users/berenz/git/nauka/ncn-foreigners/software/blocking/vignettes/v2-reclin.Rmd="101F7D66"
/Users/berenz/git/nauka/ncn-foreigners/software/blocking/vignettes/v3-evaluation.Rmd="2AE4AF89"
/Users/berenz/git/nauka/ncn-foreigners/software/blocking/vignettes/v4-integration.Rmd="F2A99A37"
/Users/berenz/git/nauka/ncn-foreigners/software/blocking/vignettes/v5-bigdata.Rmd="A8E5A993"
47 changes: 34 additions & 13 deletions R/blocking.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
#'
#' @examples
#'
#' ## general example
#' ## an example using RcppHNSW
#' df_example <- data.frame(txt = c("jankowalski", "kowalskijan", "kowalskimjan",
#' "kowaljan", "montypython", "pythonmonty", "cyrkmontypython", "monty"))
#'
Expand All @@ -57,15 +57,27 @@
#'
#' result
#'
#' ## an example with true blocks
#' ## an example using RcppAnnoy
#'
#' result_annoy <- blocking(x = df_example$txt,
#' ann = "annoy",
#' distance = "angular")
#'
#' result_annoy
#'
#' ## an example using mlpack::lsh
#'
#' result_lsh <- blocking(x = df_example$txt,
#' ann = "lsh")
#'
#' result_lsh
#' @export
blocking <- function(x,
y = NULL,
deduplication = TRUE,
on = NULL,
on_blocking = NULL,
ann = c("hnsw", "lsh", "annoy", "kd", "nnd"),
ann = c("hnsw", "annoy", "lsh", "kd", "nnd"),
distance = c("cosine", "euclidean", "l2", "ip", "manhatan", "hamming", "angular"),
ann_write = NULL,
ann_colnames = NULL,
Expand All @@ -77,13 +89,29 @@ blocking <- function(x,
control_txt = controls_txt(),
control_ann = controls_ann()) {

## checks
## defaults
if (missing(verbose)) verbose <- 0
if (missing(ann)) ann <- "hnsw"
if (missing(distance)) distance <- switch(ann,
"hnsw"="cosine",
"annoy"="angular",
"lsh"=NULL,
"kd"=NULL)

stopifnot("Only character or matrix x is supported" = is.character(x) | is.matrix(x))
if (!is.null(ann_write)) {
stopifnot("Path provided in the `ann_write` is incorrect" = file.exists(ann_write) )
}
#stopifnot("Distance for Annoy should be `euclidean, manhatan, hamming, angular`" =
# distance %in% c("euclidean", "manhatan", "hamming", "angular") & ann == "annoy")

if (ann == "hnsw") {
stopifnot("Distance for HNSW should be `l2, euclidean, cosine, ip`" =
distance %in% c("l2", "euclidean", "cosine", "ip"))
}

if (ann == "annoy") {
stopifnot("Distance for Annoy should be `euclidean, manhatan, hamming, angular`" =
distance %in% c("euclidean", "manhatan", "hamming", "angular"))
}


if (!is.null(true_blocks)) {
Expand All @@ -100,13 +128,6 @@ blocking <- function(x,
}
}

## defaults
if (missing(verbose)) verbose <- 0
if (missing(ann)) ann <- "hnsw"

## this this should be done depending on the distance

if (missing(distance)) distance <- "cosine"

if (!is.null(y)) {
deduplication <- FALSE
Expand Down
3 changes: 2 additions & 1 deletion R/controls.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ controls_ann <- function(
projections = 10,
tables = 30),

annoy = list(n_trees = 250),
annoy = list(n_trees = 250,
build_on_disk = FALSE),

kd = list(algorithm = "dual_tree",
epsilon = 0,
Expand Down
6 changes: 6 additions & 0 deletions R/method_annoy.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@ method_annoy <- function(x,

l_ind$setSeed(seed)

if (control$annoy$build_on_disk) {
temp_annoy <- tempfile(pattern="annoy", fileext="tree")
cat("Building index on disk:", temp_annoy, "\n")
l_ind$onDiskBuild(temp_annoy)

}
if (verbose) l_ind$setVerbose(1)

## index - this does not require dense matrix (sparse can be used?)
Expand Down
34 changes: 6 additions & 28 deletions R/method_hnsw.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,42 +48,20 @@ method_hnsw <- function(x,
l_ind$setNumThreads(n_threads)
l_ind$setGrainSize(control$hnsw$grain_size)

## add items from a sparse matrix in a batches
if (verbose) {
pb <- utils::txtProgressBar(style = 3)
}
starts <- seq(1, nrow(x), 1000) ## by 1000 batches

for (i in 1:NROW(starts)) {
## check if last element is used
l_ind$addItems(as.matrix(x[starts[i]:(starts[i]+999),]))
if (exists("pb")) utils::setTxtProgressBar(pb,i)
for (i in 1:nrow(x)) {
l_ind$addItem(x[i,])
}
if (exists("pb")) close(pb)

## query based on sparse data in batches
l_ind$setEf(control$hnsw$ef_s)

## this should be changed to loop
## add items from a sparse matrix in a batches

# if (verbose) {
# pb <- utils::txtProgressBar(style = 3)
# }
starts <- seq(1, nrow(x), 1000) ## by 1000 batches

l_1nn <- list()
l_1nn_m <- list()

for (i in 1:NROW(starts)) {
## check if last element is used
l_1nn_m[[i]] <- l_ind$getAllNNsList(as.matrix(y[starts[i]:(starts[i]+999),]), k, TRUE)$item
#if (exists("pb")) utils::setTxtProgressBar(pb,i)
for (i in 1:nrow(y)) {
l_1nn_m[[i]] <- l_ind$getNNsList(y[i,], k, TRUE)
}

#if (exists("pb")) close(pb)

l_1nn$idx <- do.call('rbind',l_1nn_m)
l_1nn <- list(idx = do.call("rbind",lapply(l_1nn_m, "[[", "item")),
dist = do.call("rbind",lapply(l_1nn_m, "[[", "distance")))

} else {
x <- as.matrix(x)
Expand Down
13 changes: 13 additions & 0 deletions inst/tinytest/test_blocking.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
source("test_data.R")



expect_silent(
blocking(x = df_example$txt)
)
Expand Down Expand Up @@ -31,6 +33,17 @@ expect_silent(
)


# test parameters ---------------------------------------------------------

# Each ANN backend accepts only its own set of distances. Pin the expected
# message so that an unrelated failure (e.g. a missing fixture) cannot
# satisfy these expectations by accident.
expect_error(
  blocking(x = df_example$txt, ann = "hnsw", distance = "manhatan"),
  pattern = "Distance for HNSW"
)

expect_error(
  blocking(x = df_example$txt, ann = "annoy", distance = "cosine"),
  pattern = "Distance for Annoy"
)


# testing evaluation matrices ---------------------------------------------

result <- blocking(x = df_example$txt)
Expand Down
1 change: 1 addition & 0 deletions inst/tinytest/test_hnsw.R
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,4 @@ expect_stdout(
ann_write = ".")
)


18 changes: 15 additions & 3 deletions man/blocking.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/controls_ann.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 19 additions & 1 deletion vignettes/v1-deduplication.Rmd
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
---
title: "Blocking records for deduplication"
author: "Maciej Beręsewicz"
output: rmarkdown::html_vignette
output:
html_vignette:
df_print: kable
toc: true
number_sections: true
fig_width: 6
fig_height: 4
vignette: >
%\VignetteIndexEntry{Blocking records for deduplication}
%\VignetteEngine{knitr::rmarkdown}
Expand Down Expand Up @@ -97,6 +103,7 @@ We add block information to the final dataset.
```{r}
df_block_result <- copy(df_blocks$result[order(y),])
df[, block_id := df_block_result$block]
df[, block_dist := df_block_result$dist]
head(df)
```

Expand All @@ -105,4 +112,15 @@ Finally, we can check in how many blocks the same entities (`ent_id`) are observ
```{r}
df[, .(uniq_blocks = uniqueN(block_id)), .(ent_id)][, .N, uniq_blocks]
```
Compare the distances for blocks that contain different (1) and the same (2) units.

```{r}
boxplot(block_dist ~ id_count, data=df, xlab = "Block type", ylab = "Distances")
```

```{r}
# Overlay the kernel densities of the block distances for the two block types.
# The x-axis range is fixed once by plot(); lines() only draws onto the
# existing plot, so it takes no xlim (passing one raises a
# '"xlim" is not a graphical parameter' warning when the vignette is knitted).
plot(density(df[id_count == 2]$block_dist), col = "blue", xlim = c(0, 0.8),
     main = "Distribution of distances between\nclusters type (1=red, 2=blue)")
lines(density(df[id_count == 1]$block_dist), col = "red")
```

13 changes: 10 additions & 3 deletions vignettes/v2-reclin.Rmd
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
---
title: "v2-reclin"
output: rmarkdown::html_vignette
title: "Blocking records for record linkage"
author: "Maciej Beręsewicz"
output:
html_vignette:
df_print: kable
toc: true
number_sections: true
fig_width: 6
fig_height: 4
vignette: >
%\VignetteIndexEntry{v2-reclin}
%\VignetteIndexEntry{Blocking records for record linkage}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
Expand Down
14 changes: 10 additions & 4 deletions vignettes/v3-evaluation.Rmd
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
---
title: "v3-evaluation"
output: rmarkdown::html_vignette
title: "Evaluation of blocking procedures"
author: "Maciej Beręsewicz"
output:
html_vignette:
df_print: kable
toc: true
number_sections: true
fig_width: 6
fig_height: 4
vignette: >
%\VignetteIndexEntry{v3-evaluation}
%\VignetteIndexEntry{Evaluation of blocking procedures}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
Expand Down
25 changes: 25 additions & 0 deletions vignettes/v4-integration.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
---
title: "Integration with existing packages"
author: "Maciej Beręsewicz"
output:
html_vignette:
df_print: kable
toc: true
number_sections: true
fig_width: 6
fig_height: 4
vignette: >
%\VignetteIndexEntry{Integration with existing packages}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
```

```{r setup}
library(blocking)
```
Loading

0 comments on commit c3458c7

Please sign in to comment.