From 9638fc252130d6043d51bc5f6258c155588e943a Mon Sep 17 00:00:00 2001 From: BERENZ Date: Wed, 8 May 2024 16:04:37 +0200 Subject: [PATCH] further work on accuracy metrics --- .Rproj.user/shared/notebooks/paths | 1 - R/blocking.R | 43 +++++++++++++++++++----------- inst/tinytest/test_blocking.R | 29 -------------------- inst/tinytest/test_true_blocks.R | 27 +++++++++++++++++++ vignettes/v2-reclin.Rmd | 9 ++++--- vignettes/v3-evaluation.Rmd | 8 ++++++ 6 files changed, 67 insertions(+), 50 deletions(-) diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths index 4fb4c7c..5afd9ce 100644 --- a/.Rproj.user/shared/notebooks/paths +++ b/.Rproj.user/shared/notebooks/paths @@ -1,4 +1,3 @@ -/Users/berenz/Downloads/Template of Abstract in Latex.tex="A4C7846D" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/.gitignore="C912F95E" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/DESCRIPTION="019D16E4" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/controls.R="5BC637B7" diff --git a/R/blocking.R b/R/blocking.R index 4883cb1..b01b6b2 100644 --- a/R/blocking.R +++ b/R/blocking.R @@ -279,38 +279,49 @@ blocking <- function(x, ## if true are given if (!is.null(true_blocks)) { - setDT(true_blocks) ## move it somewhere else + setDT(true_blocks) + + #true_blocks_copy <- copy(true_blocks) pairs_to_eval <- x_df[y %in% true_blocks$y, c("x", "y", "block")] - pairs_to_eval[true_blocks, on = c("x", "y"), both := TRUE] - pairs_to_eval[is.na(both), both := FALSE] + pairs_to_eval[true_blocks, on = c("x", "y"), both := 0L] + pairs_to_eval[is.na(both), both := -1L] - true_blocks[pairs_to_eval, on = c("x", "y"), both := TRUE] - true_blocks[is.na(both), both := FALSE] + true_blocks[pairs_to_eval, on = c("x", "y"), both := 0L] + true_blocks[is.na(both), both := 1L] true_blocks[, block:=block+max(pairs_to_eval$block)] - pairs_to_eval <- rbind(pairs_to_eval, true_blocks[both == FALSE, .(x,y,block)], fill = TRUE) + pairs_to_eval <- rbind(pairs_to_eval, true_blocks[both == 1L, .(x,y,block, both)]) if (!deduplication) { + pairs_to_eval[, row_id := 1:.N] pairs_to_eval[, x2:=x+max(y)] - pairs_to_eval_long <- melt(pairs_to_eval[, .(y, x2, block, both)], id.vars = c("block", "both")) - pairs_to_eval_long[!is.na(both), block_id := .GRP, block] - block_id_max <- max(pairs_to_eval_long$block_id, na.rm = T) - pairs_to_eval_long[is.na(both), block_id:=block_id_max + rleid(block)] - pairs_to_eval_long[both == TRUE | is.na(both), true_id := .GRP, block] - true_id_max <- max(pairs_to_eval_long$true_id, na.rm = T) - pairs_to_eval_long[both==FALSE, true_id := true_id_max+rleid(block)] + + pairs_to_eval_long <- melt(pairs_to_eval[, .(y, x2, row_id, block, both)], id.vars = c("row_id", "block", "both")) + pairs_to_eval_long[both == 0L, ":="(block_id = .GRP, true_id = .GRP), block] + + block_id_max <- max(pairs_to_eval_long$block_id, na.rm = TRUE) + pairs_to_eval_long[both == -1L, block_id:= block_id_max + .GRP, row_id] + block_id_max <- max(pairs_to_eval_long$block_id, na.rm = TRUE) + pairs_to_eval_long[both == 1L & is.na(block_id), block_id := block_id_max + rleid(row_id)] + + true_id_max <- max(pairs_to_eval_long$true_id, na.rm = TRUE) + pairs_to_eval_long[both == 1L, true_id:= true_id_max + .GRP, row_id] + true_id_max <- max(pairs_to_eval_long$true_id, na.rm = TRUE) + pairs_to_eval_long[both == -1L & is.na(true_id), true_id := true_id_max + rleid(row_id)] } else { - pairs_to_eval_long <- melt(pairs_to_eval[, .(y, x, block, both)], id.vars = c("block", "both")) + ## this does not work yet + pairs_to_eval[, row_id := 1:.N] + pairs_to_eval_long <- melt(pairs_to_eval[, .(y, x, row_id, block, both)], id.vars = c("block", "both")) pairs_to_eval_long[!is.na(both), block_id := .GRP, block] block_id_max <- max(pairs_to_eval_long$block_id, na.rm = T) - pairs_to_eval_long[is.na(both), block_id:=block_id_max + rleid(block)] + pairs_to_eval_long[is.na(both), block_id:= block_id_max + .GRP, row_id] pairs_to_eval_long[both == TRUE | is.na(both), true_id := .GRP, block] true_id_max <- max(pairs_to_eval_long$true_id, na.rm = T) - pairs_to_eval_long[both==FALSE, true_id := true_id_max+rleid(block)] + pairs_to_eval_long[both==FALSE, true_id := true_id_max + .GRP, row_id] } diff --git a/inst/tinytest/test_blocking.R b/inst/tinytest/test_blocking.R index 8583665..f17e00c 100644 --- a/inst/tinytest/test_blocking.R +++ b/inst/tinytest/test_blocking.R @@ -77,35 +77,6 @@ expect_error( ) -# testing evaluation matrices --------------------------------------------- - -result <- blocking(x = df_example$txt) - -expect_silent( - blocking(x = df_example$txt, - true_blocks = result$result[, c("x", "y", "block")]) -) - -expect_error( - blocking(x = df_example$txt, - true_blocks = result$result) -) - -expect_equal( - blocking(x = df_example$txt, - true_blocks = result$result[, c("x", "y", "block")])$metrics, - c(recall = 1, precision = 1, fpr = 0, fnr = 0, accuracy = 1, specificity = 1) -) - -# check if true_block is a vector - -# expect_silent( -# blocking(x = df_example$txt, -# #true_blocks = result$result$block) -# true_blocks = result$result[, c("x", "y", "block")]) -# ) - - ## printing expect_silent( diff --git a/inst/tinytest/test_true_blocks.R b/inst/tinytest/test_true_blocks.R index e69de29..5c34d9d 100644 --- a/inst/tinytest/test_true_blocks.R +++ b/inst/tinytest/test_true_blocks.R @@ -0,0 +1,27 @@ +# testing evaluation matrices --------------------------------------------- + +result <- blocking(x = df_example$txt) + +# expect_silent( +# blocking(x = df_example$txt, +# true_blocks = result$result[, c("x", "y", "block")]) +# ) +# +# expect_error( +# blocking(x = df_example$txt, +# true_blocks = result$result) +# ) + +# expect_equal( +# blocking(x = df_example$txt, +# true_blocks = result$result[, c("x", "y", "block")])$metrics, +# c(recall = 1, precision = 1, fpr = 0, fnr = 0, accuracy = 1, specificity = 1) +# ) + +# check if true_block is a vector + +# expect_silent( +# blocking(x = df_example$txt, +# #true_blocks = result$result$block) +# true_blocks = result$result[, c("x", "y", "block")]) +# ) diff --git a/vignettes/v2-reclin.Rmd b/vignettes/v2-reclin.Rmd index 3e39a40..4ecf612 100644 --- a/vignettes/v2-reclin.Rmd +++ b/vignettes/v2-reclin.Rmd @@ -58,10 +58,8 @@ all records in the cis). ``` ```{r} -census <- read.csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/census.csv") -cis <- read.csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/cis.csv") -setDT(census) -setDT(cis) +census <- fread("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/census.csv") +cis <- fread("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/cis.csv") ``` + `census` object has `r nrow(census)` rows and `r ncol(census)`, @@ -81,6 +79,9 @@ head(cis) We need to create new columns that concatanates variables from `pername1` to `enumpc`. In the first step we replace `NA`s with `''`. ```{r} +census[, ":="(dob_day=as.character(dob_day), dob_mon=as.character(dob_mon), dob_year=as.character(dob_year))] +cis[, ":="(dob_day=as.character(dob_day), dob_mon=as.character(dob_mon),dob_year=as.character(dob_year))] + census[is.na(dob_day), dob_day := ""] census[is.na(dob_mon), dob_mon := ""] census[is.na(dob_year), dob_year := ""] diff --git a/vignettes/v3-evaluation.Rmd b/vignettes/v3-evaluation.Rmd index 6b006d2..de1597e 100644 --- a/vignettes/v3-evaluation.Rmd +++ b/vignettes/v3-evaluation.Rmd @@ -20,6 +20,14 @@ knitr::opts_chunk$set( ) ``` +# Setup + ```{r setup} library(blocking) ``` + +# Methodology + +The package implements the following measures of assessing the quality of blocking + +#