Skip to content

Commit

Permalink
further work on accuracy metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
BERENZ committed May 8, 2024
1 parent 51c5431 commit 9638fc2
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 50 deletions.
1 change: 0 additions & 1 deletion .Rproj.user/shared/notebooks/paths
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
/Users/berenz/Downloads/Template of Abstract in Latex.tex="A4C7846D"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/.gitignore="C912F95E"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/DESCRIPTION="019D16E4"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/controls.R="5BC637B7"
Expand Down
43 changes: 27 additions & 16 deletions R/blocking.R
Original file line number Diff line number Diff line change
Expand Up @@ -279,38 +279,49 @@ blocking <- function(x,
## if true are given
if (!is.null(true_blocks)) {

setDT(true_blocks) ## move it somewhere else
setDT(true_blocks)

#true_blocks_copy <- copy(true_blocks)

pairs_to_eval <- x_df[y %in% true_blocks$y, c("x", "y", "block")]
pairs_to_eval[true_blocks, on = c("x", "y"), both := TRUE]
pairs_to_eval[is.na(both), both := FALSE]
pairs_to_eval[true_blocks, on = c("x", "y"), both := 0L]
pairs_to_eval[is.na(both), both := -1L]

true_blocks[pairs_to_eval, on = c("x", "y"), both := TRUE]
true_blocks[is.na(both), both := FALSE]
true_blocks[pairs_to_eval, on = c("x", "y"), both := 0L]
true_blocks[is.na(both), both := 1L]
true_blocks[, block:=block+max(pairs_to_eval$block)]

pairs_to_eval <- rbind(pairs_to_eval, true_blocks[both == FALSE, .(x,y,block)], fill = TRUE)
pairs_to_eval <- rbind(pairs_to_eval, true_blocks[both == 1L, .(x,y,block, both)])

if (!deduplication) {

pairs_to_eval[, row_id := 1:.N]
pairs_to_eval[, x2:=x+max(y)]
pairs_to_eval_long <- melt(pairs_to_eval[, .(y, x2, block, both)], id.vars = c("block", "both"))
pairs_to_eval_long[!is.na(both), block_id := .GRP, block]
block_id_max <- max(pairs_to_eval_long$block_id, na.rm = T)
pairs_to_eval_long[is.na(both), block_id:=block_id_max + rleid(block)]
pairs_to_eval_long[both == TRUE | is.na(both), true_id := .GRP, block]
true_id_max <- max(pairs_to_eval_long$true_id, na.rm = T)
pairs_to_eval_long[both==FALSE, true_id := true_id_max+rleid(block)]

pairs_to_eval_long <- melt(pairs_to_eval[, .(y, x2, row_id, block, both)], id.vars = c("row_id", "block", "both"))
pairs_to_eval_long[both == 0L, ":="(block_id = .GRP, true_id = .GRP), block]

block_id_max <- max(pairs_to_eval_long$block_id, na.rm = TRUE)
pairs_to_eval_long[both == -1L, block_id:= block_id_max + .GRP, row_id]
block_id_max <- max(pairs_to_eval_long$block_id, na.rm = TRUE)
pairs_to_eval_long[both == 1L & is.na(block_id), block_id := block_id_max + rleid(row_id)]

true_id_max <- max(pairs_to_eval_long$true_id, na.rm = TRUE)
pairs_to_eval_long[both == 1L, true_id:= true_id_max + .GRP, row_id]
true_id_max <- max(pairs_to_eval_long$true_id, na.rm = TRUE)
pairs_to_eval_long[both == -1L & is.na(true_id), true_id := true_id_max + rleid(row_id)]

} else {

pairs_to_eval_long <- melt(pairs_to_eval[, .(y, x, block, both)], id.vars = c("block", "both"))
## this does not work yet
pairs_to_eval[, row_id := 1:.N]
pairs_to_eval_long <- melt(pairs_to_eval[, .(y, x, row_id, block, both)], id.vars = c("block", "both"))
pairs_to_eval_long[!is.na(both), block_id := .GRP, block]
block_id_max <- max(pairs_to_eval_long$block_id, na.rm = T)
pairs_to_eval_long[is.na(both), block_id:=block_id_max + rleid(block)]
pairs_to_eval_long[is.na(both), block_id:= block_id_max + .GRP, row_id]
pairs_to_eval_long[both == TRUE | is.na(both), true_id := .GRP, block]
true_id_max <- max(pairs_to_eval_long$true_id, na.rm = T)
pairs_to_eval_long[both==FALSE, true_id := true_id_max+rleid(block)]
pairs_to_eval_long[both==FALSE, true_id := true_id_max + .GRP, row_id]

}

Expand Down
29 changes: 0 additions & 29 deletions inst/tinytest/test_blocking.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,35 +77,6 @@ expect_error(
)


# testing evaluation matrices ---------------------------------------------

result <- blocking(x = df_example$txt)

expect_silent(
blocking(x = df_example$txt,
true_blocks = result$result[, c("x", "y", "block")])
)

expect_error(
blocking(x = df_example$txt,
true_blocks = result$result)
)

expect_equal(
blocking(x = df_example$txt,
true_blocks = result$result[, c("x", "y", "block")])$metrics,
c(recall = 1, precision = 1, fpr = 0, fnr = 0, accuracy = 1, specificity = 1)
)

# check if true_block is a vector

# expect_silent(
# blocking(x = df_example$txt,
# #true_blocks = result$result$block)
# true_blocks = result$result[, c("x", "y", "block")])
# )


## printing

expect_silent(
Expand Down
27 changes: 27 additions & 0 deletions inst/tinytest/test_true_blocks.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# testing evaluation matrices ---------------------------------------------

result <- blocking(x = df_example$txt)

# expect_silent(
# blocking(x = df_example$txt,
# true_blocks = result$result[, c("x", "y", "block")])
# )
#
# expect_error(
# blocking(x = df_example$txt,
# true_blocks = result$result)
# )

# expect_equal(
# blocking(x = df_example$txt,
# true_blocks = result$result[, c("x", "y", "block")])$metrics,
# c(recall = 1, precision = 1, fpr = 0, fnr = 0, accuracy = 1, specificity = 1)
# )

# check if true_block is a vector

# expect_silent(
# blocking(x = df_example$txt,
# #true_blocks = result$result$block)
# true_blocks = result$result[, c("x", "y", "block")])
# )
9 changes: 5 additions & 4 deletions vignettes/v2-reclin.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,8 @@ all records in the cis).
```

```{r}
census <- read.csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/census.csv")
cis <- read.csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/cis.csv")
setDT(census)
setDT(cis)
census <- fread("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/census.csv")
cis <- fread("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/cis.csv")
```

+ `census` object has `r nrow(census)` rows and `r ncol(census)`,
Expand All @@ -81,6 +79,9 @@ head(cis)
We need to create new columns that concatanates variables from `pername1` to `enumpc`. In the first step we replace `NA`s with `''`.

```{r}
census[, ":="(dob_day=as.character(dob_day), dob_mon=as.character(dob_mon), dob_year=as.character(dob_year))]
cis[, ":="(dob_day=as.character(dob_day), dob_mon=as.character(dob_mon),dob_year=as.character(dob_year))]
census[is.na(dob_day), dob_day := ""]
census[is.na(dob_mon), dob_mon := ""]
census[is.na(dob_year), dob_year := ""]
Expand Down
8 changes: 8 additions & 0 deletions vignettes/v3-evaluation.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ knitr::opts_chunk$set(
)
```

# Setup

```{r setup}
library(blocking)
```

# Methodology

The package implements the following measures of assessing the quality of blocking

#

0 comments on commit 9638fc2

Please sign in to comment.