From f4bb05eab9bc9fa1cce09ff9e8c1c8db7db62473 Mon Sep 17 00:00:00 2001 From: LTLA Date: Sat, 10 Aug 2024 02:27:27 -0700 Subject: [PATCH] Further editing. --- R/RcppExports.R | 8 +-- R/findDistance.R | 15 +++--- R/findKNN.R | 8 ++- R/findNeighbors.R | 4 +- R/queryDistance.R | 11 +++-- R/queryKNN.R | 13 +++-- R/queryNeighbors.R | 12 ++--- inst/NEWS.Rd | 3 ++ man/findDistance.Rd | 7 +-- man/findKNN.Rd | 5 +- man/findNeighbors.Rd | 4 +- man/queryDistance.Rd | 10 ++-- man/queryKNN.Rd | 10 ++-- man/queryNeighbors.Rd | 21 +++----- src/RcppExports.cpp | 18 ++++--- src/generics.cpp | 26 ++++++++-- tests/testthat/test-findDistance.R | 63 +++++++++++++++++++++++ tests/testthat/test-findKNN.R | 6 +++ tests/testthat/test-queryDistance.R | 77 +++++++++++++++++++++++++++++ tests/testthat/test-queryKNN.R | 10 +++- vignettes/userguide.Rmd | 13 +++++ 21 files changed, 273 insertions(+), 71 deletions(-) create mode 100644 tests/testthat/test-findDistance.R create mode 100644 tests/testthat/test-queryDistance.R diff --git a/R/RcppExports.R b/R/RcppExports.R index a0d8f44..de4e3ba 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -17,12 +17,12 @@ generic_build <- function(builder, data) { .Call('_BiocNeighbors_generic_build', PACKAGE = 'BiocNeighbors', builder, data) } -generic_find_knn <- function(prebuilt_ptr, num_neighbors, chosen, num_threads, last_distance_only, report_index, report_distance) { - .Call('_BiocNeighbors_generic_find_knn', PACKAGE = 'BiocNeighbors', prebuilt_ptr, num_neighbors, chosen, num_threads, last_distance_only, report_index, report_distance) +generic_find_knn <- function(prebuilt_ptr, num_neighbors, force_variable_neighbors, chosen, num_threads, last_distance_only, report_index, report_distance) { + .Call('_BiocNeighbors_generic_find_knn', PACKAGE = 'BiocNeighbors', prebuilt_ptr, num_neighbors, force_variable_neighbors, chosen, num_threads, last_distance_only, report_index, report_distance) } -generic_query_knn <- function(prebuilt_ptr, query, 
num_neighbors, num_threads, last_distance_only, report_index, report_distance) { - .Call('_BiocNeighbors_generic_query_knn', PACKAGE = 'BiocNeighbors', prebuilt_ptr, query, num_neighbors, num_threads, last_distance_only, report_index, report_distance) +generic_query_knn <- function(prebuilt_ptr, query, num_neighbors, force_variable_neighbors, num_threads, last_distance_only, report_index, report_distance) { + .Call('_BiocNeighbors_generic_query_knn', PACKAGE = 'BiocNeighbors', prebuilt_ptr, query, num_neighbors, force_variable_neighbors, num_threads, last_distance_only, report_index, report_distance) } generic_find_all <- function(prebuilt_ptr, chosen, thresholds, num_threads, report_index, report_distance) { diff --git a/R/findDistance.R b/R/findDistance.R index f99fc1b..175d9cf 100644 --- a/R/findDistance.R +++ b/R/findDistance.R @@ -1,6 +1,6 @@ -#' Find the distance to the k-th nearest neighbor +#' Distance to the k-th nearest neighbor #' -#' Find the distance to the k-th nearest neighbor for each point in a dataset, using a variety of algorithms. +#' Find the distance to the k-th nearest neighbor for each point in a dataset. #' #' @inheritParams findKNN #' @@ -11,7 +11,7 @@ #' @return #' Numeric vector of length equal to the number of points in \code{X} (or \code{subset}, if provided), #' containing the distance from each point to its \code{k}-th nearest neighbor. -#' This is equivalent to but faster than taking the last distance of the output from \code{\link{findKNN}}. +#' This is equivalent to but more memory efficient than using \code{\link{findKNN}} and subsetting to the last distance. 
#' #' @author #' Aaron Lun @@ -34,16 +34,17 @@ NULL #' @export -setMethod("findDistance", c("matrix", "ANY"), function(X, k, get.index=TRUE, get.distance=TRUE, num.threads=1, subset=NULL, ..., BNPARAM=NULL) { +setMethod("findDistance", c("matrix", "ANY"), function(X, k, num.threads=1, subset=NULL, ..., BNPARAM=NULL) { ptr <- buildIndex(X, ..., BNPARAM=BNPARAM) - callGeneric(ptr, k=k, get.index=get.index, get.distance=get.distance, num.threads=num.threads, subset=subset, ...) + callGeneric(ptr, k=k, num.threads=num.threads, subset=subset, ...) }) #' @export -setMethod("findDistance", c("externalptr", "ANY"), function(X, k, get.index=TRUE, get.distance=TRUE, num.threads=1, subset=NULL, ..., BNPARAM=NULL) { +setMethod("findDistance", c("externalptr", "ANY"), function(X, k, num.threads=1, subset=NULL, ..., BNPARAM=NULL) { generic_find_knn( X, - num_neighbors=k, + num_neighbors=as.integer(k), + force_variable_neighbors=is(k, "AsIs"), chosen=subset, num_threads=num.threads, last_distance_only=TRUE, diff --git a/R/findKNN.R b/R/findKNN.R index b8af06e..d4e222a 100644 --- a/R/findKNN.R +++ b/R/findKNN.R @@ -1,6 +1,6 @@ #' Find k-nearest neighbors #' -#' Find the k-nearest neighbors of each point in a dataset, using a variety of algorithms. +#' Find the k-nearest neighbors of each point in a dataset. #' #' @param X A numeric matrix where rows correspond to data points and columns correspond to variables (i.e., dimensions). #' Alternatively, a prebuilt index from \code{\link{buildIndex}}. @@ -8,6 +8,7 @@ #' #' Alternatively, an integer vector of length equal to the number of points in \code{X}, specifying the number of neighbors to identify for each point. #' If \code{subset} is provided, this should have length equal to the length of \code{subset}. +#' Users should wrap this vector in an \link{AsIs} class to distinguish length-1 vectors from integer scalars. 
#' #' All \code{k} should be less than or equal to the number of points in \code{X} minus 1, otherwise the former will be capped at the latter with a warning. #' @param get.index A logical scalar indicating whether the indices of the nearest neighbors should be recorded. @@ -68,6 +69,8 @@ #' @seealso #' \code{\link{buildIndex}}, to build an index ahead of time. #' +#' \code{\link{findDistance}}, to efficiently obtain the distance to the k-th nearest neighbor. +#' #' @aliases #' findKNN,matrix,ANY-method #' findKNN,externalptr,ANY-method @@ -99,7 +102,8 @@ setMethod("findKNN", c("externalptr", "ANY"), function(X, k, get.index=TRUE, get output <- generic_find_knn( X, - num_neighbors=k, + num_neighbors=as.integer(k), + force_variable_neighbors=is(k, "AsIs"), chosen=subset, num_threads=num.threads, last_distance_only=FALSE, diff --git a/R/findNeighbors.R b/R/findNeighbors.R index 6e04989..424e0ee 100644 --- a/R/findNeighbors.R +++ b/R/findNeighbors.R @@ -1,6 +1,6 @@ -#' Find all neighbors in range +#' Find neighbors within a threshold distance #' -#' Find all neighboring data points within a certain distance of each point. +#' Find all neighbors within a threshold distance of each point of a dataset. #' #' @inheritParams findKNN #' @param threshold A positive numeric scalar specifying the maximum distance at which a point is considered a neighbor. diff --git a/R/queryDistance.R b/R/queryDistance.R index 7c4a1aa..9056c58 100644 --- a/R/queryDistance.R +++ b/R/queryDistance.R @@ -1,6 +1,6 @@ -#' Query for the distance to the k-th nearest neighbor +#' Distance to the k-th nearest neighbor to query points #' -#' Query a dataset to determine the distance to the k-th nearest neighbor of each point in another dataset, using a variety of algorithms. +#' Query a reference dataset to determine the distance to the k-th nearest neighbor of each point in a query dataset. 
#' #' @inheritParams queryKNN #' @@ -11,7 +11,7 @@ #' @return #' Numeric vector of length equal to the number of points in \code{query} (or \code{subset}, if provided), #' containing the distance from each point to its \code{k}-th nearest neighbor. -#' This is equivalent to but faster than taking the last distance of the output of \code{\link{queryKNN}}. +#' This is equivalent to but more memory efficient than using \code{\link{queryKNN}} and subsetting to the last distance. #' #' @author #' Aaron Lun @@ -37,7 +37,7 @@ NULL #' @export setMethod("queryDistance", c("matrix", "ANY"), function(X, query, k, num.threads=1, subset=NULL, transposed=FALSE, ..., BNPARAM=NULL) { ptr <- buildIndex(X, transposed=transposed, ..., BNPARAM=BNPARAM) - callGeneric(ptr, query=query, k=k, get.index=get.index, get.distance=get.distance, num.threads=num.threads, subset=subset, transposed=transposed, ...) + callGeneric(ptr, query=query, k=k, num.threads=num.threads, subset=subset, transposed=transposed, ...) }) #' @export @@ -50,7 +50,8 @@ setMethod("queryDistance", c("externalptr", "ANY"), function(X, query, k, num.th generic_query_knn( X, query=query, - num_neighbors=k, + num_neighbors=as.integer(k), + force_variable_neighbors=is(k, "AsIs"), num_threads=num.threads, last_distance_only=TRUE, report_index=FALSE, diff --git a/R/queryKNN.R b/R/queryKNN.R index c65b2ab..ac551cd 100644 --- a/R/queryKNN.R +++ b/R/queryKNN.R @@ -1,12 +1,16 @@ -#' Query for the k-nearest neighbors +#' Query k-nearest neighbors #' -#' Query a dataset for the k-nearest neighbors of points in another dataset, using a variety of algorithms. +#' Query a reference dataset for the k-nearest neighbors of each point in a query dataset. #' +#' @param X The reference dataset to be queried. +#' This should be a numeric matrix where rows correspond to reference points and columns correspond to variables (i.e., dimensions). +#' Alternatively, a prebuilt index from \code{\link{buildIndex}}. 
#' @inheritParams findKNN #' @param k A positive integer scalar specifying the number of nearest neighbors to retrieve. #' #' Alternatively, an integer vector of length equal to the number of points in \code{query}, specifying the number of neighbors to identify for each point. #' If \code{subset} is provided, this should have length equal to the length of \code{subset}. +#' Users should wrap this vector in an \link{AsIs} class to distinguish length-1 vectors from integer scalars. #' #' All \code{k} should be less than or equal to the number of points in \code{X}, otherwise the former will be capped at the latter with a warning. #' @param query A numeric matrix of query points, containing the same number of columns as \code{X}. @@ -53,6 +57,8 @@ #' #' @seealso #' \code{\link{buildIndex}}, to build an index ahead of time. +#' +#' \code{\link{queryDistance}}, to obtain the distance from each query point to its k-th nearest neighbor. #' #' @aliases #' queryKNN,matrix,ANY-method @@ -92,7 +98,8 @@ setMethod("queryKNN", c("externalptr", "ANY"), function(X, query, k, get.index=T output <- generic_query_knn( X, query=query, - num_neighbors=k, + num_neighbors=as.integer(k), + force_variable_neighbors=is(k, "AsIs"), num_threads=num.threads, last_distance_only=FALSE, report_index=!isFALSE(get.index), diff --git a/R/queryNeighbors.R b/R/queryNeighbors.R index c22a6b8..5782100 100644 --- a/R/queryNeighbors.R +++ b/R/queryNeighbors.R @@ -1,14 +1,12 @@ -#' Query neighbors in range +#' Query neighbors within a threshold distance #' -#' Find all neighboring data points within a certain distance of a query point. +#' Find all points in a reference dataset that lie within a threshold distance of each point in a query dataset. #' -#' @inheritParams findKNN -#' @param query A numeric matrix of query points, containing the same number of columns as \code{X}. 
+#' @inheritParams queryKNN #' @param threshold A positive numeric scalar specifying the maximum distance at which a point is considered a neighbor. #' Alternatively, a vector containing a different distance threshold for each query point. -#' @param transposed A logical scalar indicating whether \code{X} and \code{query} are transposed, -#' in which case both matrices are assumed to contain dimensions in the rows and data points in the columns. -#' @param subset An integer, logical or character vector indicating the rows of \code{query} (or columns, if \code{transposed=TRUE}) for which the neighbors should be identified. +#' @param get.index A logical scalar indicating whether the indices of the neighbors should be recorded. +#' @param get.distance A logical scalar indicating whether distances to the neighbors should be recorded. #' #' @details #' This function identifies all points in \code{X} that within \code{threshold} of each point in \code{query}. diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd index 54f095d..c927514 100644 --- a/inst/NEWS.Rd +++ b/inst/NEWS.Rd @@ -16,6 +16,9 @@ The removal of this guarantee makes it easier to extend \pkg{BiocNeighbors} to n \item All functions (\code{findKNN()}, \code{queryNeighbors()}, etc.) will no longer coerce `X` to a matrix, to avoid the headache of S4 dispatch ambiguity. Users should coerce their data into matrix format before supplying it to these functions. + +\item The \code{last=} option in \code{findKNN()} and \code{queryKNN()} has been replaced by the \code{findDistance()} and \code{queryDistance()} functions. +This provides a much more intuitive method for the typical use of \code{last=}, i.e., to obtain the distance to the k-th nearest neighbor. 
}} \section{Version 1.10.0}{\itemize{ diff --git a/man/findDistance.Rd b/man/findDistance.Rd index ba77a5d..ccadcbf 100644 --- a/man/findDistance.Rd +++ b/man/findDistance.Rd @@ -6,7 +6,7 @@ \alias{findDistance,externalptr,ANY-method} \alias{findDistance,matrix-method} \alias{findDistance,externalptr-method} -\title{Find the distance to the k-th nearest neighbor} +\title{Distance to the k-th nearest neighbor} \usage{ findDistance(X, k, num.threads = 1, subset = NULL, ..., BNPARAM = NULL) } @@ -18,6 +18,7 @@ Alternatively, a prebuilt index from \code{\link{buildIndex}}.} Alternatively, an integer vector of length equal to the number of points in \code{X}, specifying the number of neighbors to identify for each point. If \code{subset} is provided, this should have length equal to the length of \code{subset}. +Users should wrap this vector in an \link{AsIs} class to distinguish length-1 vectors from integer scalars. All \code{k} should be less than or equal to the number of points in \code{X} minus 1, otherwise the former will be capped at the latter with a warning.} @@ -35,10 +36,10 @@ Ignored if \code{x} contains a prebuilt index.} \value{ Numeric vector of length equal to the number of points in \code{X} (or \code{subset}, if provided), containing the distance from each point to its \code{k}-th nearest neighbor. -This is equivalent to but faster than taking the last distance of the output from \code{\link{findKNN}}. +This is equivalent to but more memory efficient than using \code{\link{findKNN}} and subsetting to the last distance. } \description{ -Find the distance to the k-th nearest neighbor for each point in a dataset, using a variety of algorithms. +Find the distance to the k-th nearest neighbor for each point in a dataset. } \details{ If multiple queries are to be performed to the same \code{X}, it may be beneficial to build the index from \code{X} with \code{\link{buildIndex}}. 
diff --git a/man/findKNN.Rd b/man/findKNN.Rd index 908c57a..9cb3faf 100644 --- a/man/findKNN.Rd +++ b/man/findKNN.Rd @@ -29,6 +29,7 @@ Alternatively, a prebuilt index from \code{\link{buildIndex}}.} Alternatively, an integer vector of length equal to the number of points in \code{X}, specifying the number of neighbors to identify for each point. If \code{subset} is provided, this should have length equal to the length of \code{subset}. +Users should wrap this vector in an \link{AsIs} class to distinguish length-1 vectors from integer scalars. All \code{k} should be less than or equal to the number of points in \code{X} minus 1, otherwise the former will be capped at the latter with a warning.} @@ -86,7 +87,7 @@ The \eqn{i}-th vector contains the distances of neighboring points in \code{X} t } } \description{ -Find the k-nearest neighbors of each point in a dataset, using a variety of algorithms. +Find the k-nearest neighbors of each point in a dataset. } \details{ If multiple queries are to be performed to the same \code{X}, it may be beneficial to build the index from \code{X} with \code{\link{buildIndex}}. @@ -101,6 +102,8 @@ head(out$distance) } \seealso{ \code{\link{buildIndex}}, to build an index ahead of time. + +\code{\link{findDistance}}, to efficiently obtain the distance to the k-th nearest neighbor. } \author{ Aaron Lun diff --git a/man/findNeighbors.Rd b/man/findNeighbors.Rd index 8e088a0..b548004 100644 --- a/man/findNeighbors.Rd +++ b/man/findNeighbors.Rd @@ -8,7 +8,7 @@ \alias{findNeighbors,matrix-method} \alias{findNeighbors,externalptr-method} \alias{findNeighbors,missing-method} -\title{Find all neighbors in range} +\title{Find neighbors within a threshold distance} \usage{ findNeighbors( X, @@ -61,7 +61,7 @@ The \code{i}-th entry contains the number of neighbors of \eqn{i} within \code{t If \code{subset} is not \code{NULL}, each entry of the above vector/lists corresponds to a point in the subset, in the same order as supplied in \code{subset}. 
} \description{ -Find all neighboring data points within a certain distance of each point. +Find all neighbors within a threshold distance of each point of a dataset. } \details{ This function identifies all points in \code{X} that within \code{threshold} of each point in \code{X}. diff --git a/man/queryDistance.Rd b/man/queryDistance.Rd index bcf4869..8a765ae 100644 --- a/man/queryDistance.Rd +++ b/man/queryDistance.Rd @@ -6,7 +6,7 @@ \alias{queryDistance,externalptr,ANY-method} \alias{queryDistance,matrix-method} \alias{queryDistance,externalptr-method} -\title{Query for the distance to the k-th nearest neighbor} +\title{Distance to the k-th nearest neighbor to query points} \usage{ queryDistance( X, @@ -20,7 +20,8 @@ queryDistance( ) } \arguments{ -\item{X}{A numeric matrix where rows correspond to data points and columns correspond to variables (i.e., dimensions). +\item{X}{The reference dataset to be queried. +This should be a numeric matrix where rows correspond to reference points and columns correspond to variables (i.e., dimensions). Alternatively, a prebuilt index from \code{\link{buildIndex}}.} \item{query}{A numeric matrix of query points, containing the same number of columns as \code{X}.} @@ -29,6 +30,7 @@ Alternatively, a prebuilt index from \code{\link{buildIndex}}.} Alternatively, an integer vector of length equal to the number of points in \code{query}, specifying the number of neighbors to identify for each point. If \code{subset} is provided, this should have length equal to the length of \code{subset}. +Users should wrap this vector in an \link{AsIs} class to distinguish length-1 vectors from integer scalars. 
All \code{k} should be less than or equal to the number of points in \code{X}, otherwise the former will be capped at the latter with a warning.} @@ -48,10 +50,10 @@ Ignored if \code{x} contains a prebuilt index.} \value{ Numeric vector of length equal to the number of points in \code{query} (or \code{subset}, if provided), containing the distance from each point to its \code{k}-th nearest neighbor. -This is equivalent to but faster than taking the last distance of the output of \code{\link{queryKNN}}. +This is equivalent to but more memory efficient than using \code{\link{queryKNN}} and subsetting to the last distance. } \description{ -Query a dataset to determine the distance to the k-th nearest neighbor of each point in another dataset, using a variety of algorithms. +Query a reference dataset to determine the distance to the k-th nearest neighbor of each point in a query dataset. } \details{ If multiple queries are to be performed to the same \code{X}, it may be beneficial to build the index from \code{X} with \code{\link{buildIndex}}. diff --git a/man/queryKNN.Rd b/man/queryKNN.Rd index 0428e87..1172008 100644 --- a/man/queryKNN.Rd +++ b/man/queryKNN.Rd @@ -8,7 +8,7 @@ \alias{queryKNN,matrix-method} \alias{queryKNN,externalptr-method} \alias{queryKNN,missing-method} -\title{Query for the k-nearest neighbors} +\title{Query k-nearest neighbors} \usage{ queryKNN( X, @@ -24,7 +24,8 @@ queryKNN( ) } \arguments{ -\item{X}{A numeric matrix where rows correspond to data points and columns correspond to variables (i.e., dimensions). +\item{X}{The reference dataset to be queried. +This should be a numeric matrix where rows correspond to reference points and columns correspond to variables (i.e., dimensions). 
Alternatively, a prebuilt index from \code{\link{buildIndex}}.} \item{query}{A numeric matrix of query points, containing the same number of columns as \code{X}.} @@ -33,6 +34,7 @@ Alternatively, a prebuilt index from \code{\link{buildIndex}}.} Alternatively, an integer vector of length equal to the number of points in \code{query}, specifying the number of neighbors to identify for each point. If \code{subset} is provided, this should have length equal to the length of \code{subset}. +Users should wrap this vector in an \link{AsIs} class to distinguish length-1 vectors from integer scalars. All \code{k} should be less than or equal to the number of points in \code{X}, otherwise the former will be capped at the latter with a warning.} @@ -92,7 +94,7 @@ The \eqn{i}-th vector contains the distances of neighboring points in \code{X} t } } \description{ -Query a dataset for the k-nearest neighbors of points in another dataset, using a variety of algorithms. +Query a reference dataset for the k-nearest neighbors of each point in a query dataset. } \details{ If multiple queries are to be performed to the same \code{X}, it may be beneficial to build the index from \code{X} with \code{\link{buildIndex}}. @@ -108,6 +110,8 @@ head(out$distance) } \seealso{ \code{\link{buildIndex}}, to build an index ahead of time. + +\code{\link{queryDistance}}, to obtain the distance from each query point to its k-th nearest neighbor. 
} \author{ Aaron Lun diff --git a/man/queryNeighbors.Rd b/man/queryNeighbors.Rd index f3eeec6..9ca4f70 100644 --- a/man/queryNeighbors.Rd +++ b/man/queryNeighbors.Rd @@ -8,7 +8,7 @@ \alias{queryNeighbors,matrix-method} \alias{queryNeighbors,externalptr-method} \alias{queryNeighbors,missing-method} -\title{Query neighbors in range} +\title{Query neighbors within a threshold distance} \usage{ queryNeighbors( X, @@ -24,7 +24,8 @@ queryNeighbors( ) } \arguments{ -\item{X}{A numeric matrix where rows correspond to data points and columns correspond to variables (i.e., dimensions). +\item{X}{The reference dataset to be queried. +This should be a numeric matrix where rows correspond to reference points and columns correspond to variables (i.e., dimensions). Alternatively, a prebuilt index from \code{\link{buildIndex}}.} \item{query}{A numeric matrix of query points, containing the same number of columns as \code{X}.} @@ -32,21 +33,13 @@ Alternatively, a prebuilt index from \code{\link{buildIndex}}.} \item{threshold}{A positive numeric scalar specifying the maximum distance at which a point is considered a neighbor. Alternatively, a vector containing a different distance threshold for each query point.} -\item{get.index}{A logical scalar indicating whether the indices of the nearest neighbors should be recorded. -Setting this to \code{FALSE} improves efficiency if the indices are not of interest. +\item{get.index}{A logical scalar indicating whether the indices of the neighbors should be recorded.} -Alternatively, if \code{k} is an integer scalar, this may be a string containing \code{"normal"} or \code{"transposed"}. -The former is the same as \code{TRUE}, while the latter returns the index matrix in transposed format.} - -\item{get.distance}{A logical scalar indicating whether distances to the nearest neighbors should be recorded. -Setting this to \code{FALSE} improves efficiency if the distances are not of interest. 
- -Alternatively, if \code{k} is an integer scalar, this may be a string containing \code{"normal"} or \code{"transposed"}. -The former is the same as \code{TRUE}, while the latter returns the distance matrix in transposed format.} +\item{get.distance}{A logical scalar indicating whether distances to the neighbors should be recorded.} \item{num.threads}{Integer scalar specifying the number of threads to use for the search.} -\item{subset}{An integer, logical or character vector indicating the rows of \code{query} (or columns, if \code{transposed=TRUE}) for which the neighbors should be identified.} +\item{subset}{An integer, logical or character vector indicating the rows of \code{query} (or columns, if \code{transposed=TRUE}) for which the nearest neighbors should be identified.} \item{transposed}{A logical scalar indicating whether \code{X} and \code{query} are transposed, in which case both matrices are assumed to contain dimensions in the rows and data points in the columns.} @@ -75,7 +68,7 @@ The \code{i}-th entry contains the number of neighbors of \eqn{i} within \code{t If \code{subset} is not \code{NULL}, each entry of the above vector/lists refers to a point in the subset, in the same order as supplied in \code{subset}. } \description{ -Find all neighboring data points within a certain distance of a query point. +Find all points in a reference dataset that lie within a threshold distance of each point in a query dataset. } \details{ This function identifies all points in \code{X} that within \code{threshold} of each point in \code{query}. 
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index a6c380d..27859fd 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -56,34 +56,36 @@ BEGIN_RCPP END_RCPP } // generic_find_knn -SEXP generic_find_knn(SEXP prebuilt_ptr, Rcpp::IntegerVector num_neighbors, Rcpp::Nullable chosen, int num_threads, bool last_distance_only, bool report_index, bool report_distance); -RcppExport SEXP _BiocNeighbors_generic_find_knn(SEXP prebuilt_ptrSEXP, SEXP num_neighborsSEXP, SEXP chosenSEXP, SEXP num_threadsSEXP, SEXP last_distance_onlySEXP, SEXP report_indexSEXP, SEXP report_distanceSEXP) { +SEXP generic_find_knn(SEXP prebuilt_ptr, Rcpp::IntegerVector num_neighbors, bool force_variable_neighbors, Rcpp::Nullable chosen, int num_threads, bool last_distance_only, bool report_index, bool report_distance); +RcppExport SEXP _BiocNeighbors_generic_find_knn(SEXP prebuilt_ptrSEXP, SEXP num_neighborsSEXP, SEXP force_variable_neighborsSEXP, SEXP chosenSEXP, SEXP num_threadsSEXP, SEXP last_distance_onlySEXP, SEXP report_indexSEXP, SEXP report_distanceSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::traits::input_parameter< SEXP >::type prebuilt_ptr(prebuilt_ptrSEXP); Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type num_neighbors(num_neighborsSEXP); + Rcpp::traits::input_parameter< bool >::type force_variable_neighbors(force_variable_neighborsSEXP); Rcpp::traits::input_parameter< Rcpp::Nullable >::type chosen(chosenSEXP); Rcpp::traits::input_parameter< int >::type num_threads(num_threadsSEXP); Rcpp::traits::input_parameter< bool >::type last_distance_only(last_distance_onlySEXP); Rcpp::traits::input_parameter< bool >::type report_index(report_indexSEXP); Rcpp::traits::input_parameter< bool >::type report_distance(report_distanceSEXP); - rcpp_result_gen = Rcpp::wrap(generic_find_knn(prebuilt_ptr, num_neighbors, chosen, num_threads, last_distance_only, report_index, report_distance)); + rcpp_result_gen = Rcpp::wrap(generic_find_knn(prebuilt_ptr, num_neighbors, 
force_variable_neighbors, chosen, num_threads, last_distance_only, report_index, report_distance)); return rcpp_result_gen; END_RCPP } // generic_query_knn -SEXP generic_query_knn(SEXP prebuilt_ptr, Rcpp::NumericMatrix query, Rcpp::IntegerVector num_neighbors, int num_threads, bool last_distance_only, bool report_index, bool report_distance); -RcppExport SEXP _BiocNeighbors_generic_query_knn(SEXP prebuilt_ptrSEXP, SEXP querySEXP, SEXP num_neighborsSEXP, SEXP num_threadsSEXP, SEXP last_distance_onlySEXP, SEXP report_indexSEXP, SEXP report_distanceSEXP) { +SEXP generic_query_knn(SEXP prebuilt_ptr, Rcpp::NumericMatrix query, Rcpp::IntegerVector num_neighbors, bool force_variable_neighbors, int num_threads, bool last_distance_only, bool report_index, bool report_distance); +RcppExport SEXP _BiocNeighbors_generic_query_knn(SEXP prebuilt_ptrSEXP, SEXP querySEXP, SEXP num_neighborsSEXP, SEXP force_variable_neighborsSEXP, SEXP num_threadsSEXP, SEXP last_distance_onlySEXP, SEXP report_indexSEXP, SEXP report_distanceSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::traits::input_parameter< SEXP >::type prebuilt_ptr(prebuilt_ptrSEXP); Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type query(querySEXP); Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type num_neighbors(num_neighborsSEXP); + Rcpp::traits::input_parameter< bool >::type force_variable_neighbors(force_variable_neighborsSEXP); Rcpp::traits::input_parameter< int >::type num_threads(num_threadsSEXP); Rcpp::traits::input_parameter< bool >::type last_distance_only(last_distance_onlySEXP); Rcpp::traits::input_parameter< bool >::type report_index(report_indexSEXP); Rcpp::traits::input_parameter< bool >::type report_distance(report_distanceSEXP); - rcpp_result_gen = Rcpp::wrap(generic_query_knn(prebuilt_ptr, query, num_neighbors, num_threads, last_distance_only, report_index, report_distance)); + rcpp_result_gen = Rcpp::wrap(generic_query_knn(prebuilt_ptr, query, num_neighbors, 
force_variable_neighbors, num_threads, last_distance_only, report_index, report_distance)); return rcpp_result_gen; END_RCPP } @@ -156,8 +158,8 @@ static const R_CallMethodDef CallEntries[] = { {"_BiocNeighbors_exhaustive_builder", (DL_FUNC) &_BiocNeighbors_exhaustive_builder, 1}, {"_BiocNeighbors_find_mutual_nns", (DL_FUNC) &_BiocNeighbors_find_mutual_nns, 2}, {"_BiocNeighbors_generic_build", (DL_FUNC) &_BiocNeighbors_generic_build, 2}, - {"_BiocNeighbors_generic_find_knn", (DL_FUNC) &_BiocNeighbors_generic_find_knn, 7}, - {"_BiocNeighbors_generic_query_knn", (DL_FUNC) &_BiocNeighbors_generic_query_knn, 7}, + {"_BiocNeighbors_generic_find_knn", (DL_FUNC) &_BiocNeighbors_generic_find_knn, 8}, + {"_BiocNeighbors_generic_query_knn", (DL_FUNC) &_BiocNeighbors_generic_query_knn, 8}, {"_BiocNeighbors_generic_find_all", (DL_FUNC) &_BiocNeighbors_generic_find_all, 6}, {"_BiocNeighbors_generic_query_all", (DL_FUNC) &_BiocNeighbors_generic_query_all, 6}, {"_BiocNeighbors_hnsw_builder", (DL_FUNC) &_BiocNeighbors_hnsw_builder, 4}, diff --git a/src/generics.cpp b/src/generics.cpp index ccd319b..f925aa5 100644 --- a/src/generics.cpp +++ b/src/generics.cpp @@ -41,7 +41,16 @@ Rcpp::List format_range_output(const std::vector >& results) } //[[Rcpp::export(rng=false)]] -SEXP generic_find_knn(SEXP prebuilt_ptr, Rcpp::IntegerVector num_neighbors, Rcpp::Nullable chosen, int num_threads, bool last_distance_only, bool report_index, bool report_distance) { +SEXP generic_find_knn( + SEXP prebuilt_ptr, + Rcpp::IntegerVector num_neighbors, + bool force_variable_neighbors, + Rcpp::Nullable chosen, + int num_threads, + bool last_distance_only, + bool report_index, + bool report_distance) +{ BiocNeighbors::PrebuiltPointer cast(prebuilt_ptr); if (!R_ExternalPtrAddr(SEXP(cast))) { throw std::runtime_error("null pointer to a prebuilt index"); @@ -81,7 +90,7 @@ SEXP generic_find_knn(SEXP prebuilt_ptr, Rcpp::IntegerVector num_neighbors, Rcpp bool is_k_variable = false; int const_k = 0; std::vector 
variable_k; - if (num_neighbors.size() != 1) { + if (num_neighbors.size() != 1 || force_variable_neighbors) { is_k_variable = true; if (static_cast(num_neighbors.size()) != num_output) { throw std::runtime_error("length of 'k' must be equal to the number of points in the index or 'subset'"); @@ -196,7 +205,16 @@ SEXP generic_find_knn(SEXP prebuilt_ptr, Rcpp::IntegerVector num_neighbors, Rcpp } //[[Rcpp::export(rng=false)]] -SEXP generic_query_knn(SEXP prebuilt_ptr, Rcpp::NumericMatrix query, Rcpp::IntegerVector num_neighbors, int num_threads, bool last_distance_only, bool report_index, bool report_distance) { +SEXP generic_query_knn( + SEXP prebuilt_ptr, + Rcpp::NumericMatrix query, + Rcpp::IntegerVector num_neighbors, + bool force_variable_neighbors, + int num_threads, + bool last_distance_only, + bool report_index, + bool report_distance) +{ BiocNeighbors::PrebuiltPointer cast(prebuilt_ptr); if (!R_ExternalPtrAddr(SEXP(cast))) { throw std::runtime_error("null pointer to a prebuilt index"); @@ -223,7 +241,7 @@ SEXP generic_query_knn(SEXP prebuilt_ptr, Rcpp::NumericMatrix query, Rcpp::Integ bool is_k_variable = false; int const_k = 0; std::vector variable_k; - if (num_neighbors.size() != 1) { + if (num_neighbors.size() != 1 || force_variable_neighbors) { is_k_variable = true; if (static_cast(num_neighbors.size()) != nquery) { throw std::runtime_error("length of 'k' must be equal to the number of points in the index or 'subset'"); diff --git a/tests/testthat/test-findDistance.R b/tests/testthat/test-findDistance.R new file mode 100644 index 0000000..d154669 --- /dev/null +++ b/tests/testthat/test-findDistance.R @@ -0,0 +1,63 @@ +# library(testthat); library(BiocNeighbors); source("setup.R"); source("test-findDistance.R") + +set.seed(888888) + +test_that("findDistance works with basic options", { + Y <- matrix(rnorm(10000), ncol=20) + + dist <- findDistance(Y, k=8) + ref <- findKNN(Y, k=8) + expect_identical(ref$distance[,8], dist) + + # Respects alternative methods. 
+ adist <- findDistance(Y, k=8, BNPARAM=AnnoyParam()) + expect_false(identical(dist, adist)) +}) + +test_that("findDistance works in parallel", { + Y <- matrix(rnorm(10000), ncol=20) + + out <- findDistance(Y, k=8) + dist <- findDistance(Y, k=8, num.threads=2) + expect_identical(out, dist) +}) + +test_that("findDistance works with subsets", { + Y <- matrix(rnorm(10000), ncol=20) + + out <- findDistance(Y, k=8) + dist <- findDistance(Y, k=8, subset=1:10) + expect_identical(out[1:10], dist) + + expect_warning(dist <- findDistance(Y[0,,drop=FALSE], k=8), "capped") + expect_identical(length(dist), 0L) +}) + +test_that("findDistance works with variable k", { + Y <- matrix(rnorm(10000), ncol=20) + + k <- rep(c(4, 10), length.out=nrow(Y)) + out <- findDistance(Y, k=k) + + keep <- k == 4 + ref <- findDistance(Y, k=4) + expect_identical(ref[keep], out[keep]) + + keep <- k == 10 + ref <- findDistance(Y, k=10) + expect_identical(ref[keep], out[keep]) + + # AsIs forced variable makes no difference here. + out <- findDistance(Y, k=I(10), subset=1) + ref <- findDistance(Y, k=10, subset=1) + expect_identical(out, ref) +}) + +test_that("findDistance works with prebuilt indices", { + Y <- matrix(rnorm(10000), ncol=20) + + built <- buildIndex(Y, k=8) + dist <- findDistance(Y, k=8) + predist <- findDistance(built, k=8) + expect_identical(dist, predist) +}) diff --git a/tests/testthat/test-findKNN.R b/tests/testthat/test-findKNN.R index adda359..f246fe8 100644 --- a/tests/testthat/test-findKNN.R +++ b/tests/testthat/test-findKNN.R @@ -59,6 +59,12 @@ test_that("findKNN works with variable k", { ref <- findKNN(Y, k=10) expect_identical(do.call(rbind, out$index[keep]), ref$index[keep,]) expect_identical(do.call(rbind, out$distance[keep]), ref$distance[keep,]) + + # The AsIs forced variable works.
+ out <- findKNN(Y, k=I(10), subset=1) + ref <- findKNN(Y, k=10, subset=1) + expect_identical(out$index[[1]], ref$index[1,]) + expect_identical(out$distance[[1]], ref$distance[1,]) }) test_that("findKNN works with prebuilt indices", { diff --git a/tests/testthat/test-queryDistance.R b/tests/testthat/test-queryDistance.R new file mode 100644 index 0000000..e7ed561 --- /dev/null +++ b/tests/testthat/test-queryDistance.R @@ -0,0 +1,77 @@ +# library(testthat); library(BiocNeighbors); source("setup.R"); source("test-queryDistance.R") + +set.seed(77777) + +test_that("queryDistance works with basic options", { + Y <- matrix(rnorm(10000), ncol=20) + Z <- matrix(rnorm(2000), ncol=20) + + out <- queryDistance(Y, Z, k=8) + ref <- queryKNN(Y, Z, k=8) + expect_equal(ref$distance[,8], out) + + # Respects alternative methods. + adist <- queryDistance(Y, Z, k=8, BNPARAM=AnnoyParam()) + expect_false(identical(out, adist)) +}) + +test_that("queryDistance works in parallel", { + Y <- matrix(rnorm(10000), ncol=20) + Z <- matrix(rnorm(2000), ncol=20) + + out <- queryDistance(Y, Z, k=8) + pout <- queryDistance(Y, Z, k=8, num.threads=2) + expect_equal(out, pout) +}) + +test_that("queryDistance works with subsets", { + Y <- matrix(rnorm(10000), ncol=20) + Z <- matrix(rnorm(2000), ncol=20) + + out <- queryDistance(Y, Z, k=8) + sout <- queryDistance(Y, Z, subset=1:10, k=8) + expect_equal(out[1:10], sout) + + expect_warning(out <- queryDistance(Y[0,,drop=FALSE], Z, k=8), "capped") + expect_identical(out, numeric(nrow(Z))) +}) + +test_that("queryDistance works with variable k", { + Y <- matrix(rnorm(10000), ncol=20) + Z <- matrix(rnorm(2000), ncol=20) + + k <- rep(c(4, 10), length.out=nrow(Z)) + out <- queryDistance(Y, Z, k=k) + + keep <- k == 4 + ref <- queryDistance(Y, Z, k=4) + expect_identical(out[keep], ref[keep]) + + keep <- k == 10 + ref <- queryDistance(Y, Z, k=10) + expect_identical(out[keep], ref[keep]) + + # AsIs has no effect here.
+ out <- queryDistance(Y, Z, k=10, subset=1) + ref <- queryDistance(Y, Z, k=I(10), subset=1) + expect_identical(out, ref) +}) + +test_that("queryDistance works with prebuilt indices", { + Y <- matrix(rnorm(10000), ncol=20) + Z <- matrix(rnorm(2000), ncol=20) + + built <- buildIndex(Y, k=8) + out <- queryDistance(Y, Z, k=8) + preout <- queryDistance(built, Z, k=8) + expect_identical(out, preout) +}) + +test_that("queryDistance works when inputs are transposed", { + Y <- matrix(rnorm(10000), ncol=20) + Z <- matrix(rnorm(2000), ncol=20) + + out <- queryDistance(Y, Z, k=8) + tout <- queryDistance(t(Y), t(Z), k=8, transposed=TRUE) + expect_identical(out, tout) +}) diff --git a/tests/testthat/test-queryKNN.R b/tests/testthat/test-queryKNN.R index 8f8cb1a..96b8600 100644 --- a/tests/testthat/test-queryKNN.R +++ b/tests/testthat/test-queryKNN.R @@ -44,8 +44,8 @@ test_that("queryKNN works with subsets", { expect_equal(out, sout) expect_warning(out <- queryKNN(Y[0,,drop=FALSE], Z, k=8), "capped") - expect_identical(ncol(out$index), 0L) - expect_identical(ncol(out$distance), 0L) + expect_identical(dim(out$index), c(nrow(Z), 0L)) + expect_identical(dim(out$distance), c(nrow(Z), 0L)) }) test_that("queryKNN works with variable k", { @@ -64,6 +64,12 @@ test_that("queryKNN works with variable k", { ref <- queryKNN(Y, Z, k=10) expect_identical(do.call(rbind, out$index[keep]), ref$index[keep,]) expect_identical(do.call(rbind, out$distance[keep]), ref$distance[keep,]) + + # Wrapping a length-1 'k' in I() forces the variable-k (list) output format.
+ out <- queryKNN(Y, Z, k=I(10), subset=1) + ref <- queryKNN(Y, Z, k=10, subset=1) + expect_identical(out$index[[1]], ref$index[1,]) + expect_identical(out$distance[[1]], ref$distance[1,]) }) test_that("queryKNN works with prebuilt indices", { diff --git a/vignettes/userguide.Rmd b/vignettes/userguide.Rmd index 9b6dac5..3027320 100644 --- a/vignettes/userguide.Rmd +++ b/vignettes/userguide.Rmd @@ -75,6 +75,19 @@ vout <- findKNN(data, k=10, BNPARAM=VptreeParam(distance="Manhattan")) str(vout) ``` +If the number of neighbors differs for each point, we can supply an integer vector to `k=` instead. +This yields a list of vectors containing the neighbors and their distances to each point. + +```{r} +var.k <- sample(10, nrow(data), replace=TRUE) + +# Wrap 'k' in I() so that even a length-1 vector is treated as variable 'k' rather than as a scalar. +var.out <- findKNN(data, k=I(var.k)) + +head(var.out$index) +head(var.out$distance) +``` + `queryKNN()` is a related function that will find the `k`-nearest neighbors in one dataset based on query points in another dataset. Here, the rows of the output matrices correspond to rows of our `query` matrix.