Merge pull request #247 from immunomind/dev

immunomind · May 31, 2022 · 2fdb2d0 · 2fdb2d0
2 parents a6a5059 + 2b6d707
commit 2fdb2d0
Show file tree

Hide file tree

Showing 63 changed files with 335 additions and 340 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: immunarch
 Type: Package
 Title: Bioinformatics Analysis of T-Cell and B-Cell Immune Repertoires
-Version: 0.6.8
+Version: 0.6.9
 Authors@R: c(
     person("Vadim I.", "Nazarov", , "support@immunomind.io", c("aut", "cre")),
     person("Vasily O.", "Tsvetkov", , role = "aut"),
@@ -83,6 +83,6 @@ Suggests:
     rmarkdown
 VignetteBuilder: knitr
 Encoding: UTF-8
-RoxygenNote: 7.1.2
+RoxygenNote: 7.2.0
 LazyData: true
 LazyDataCompression: xz
diff --git a/R/align_lineage.R b/R/align_lineage.R
@@ -1,6 +1,6 @@
-#' This function aligns all sequences incliding germline that belong to one clonal lineage and one cluster.
-#' After clustering, building clonal lineage and germline, the next step is to analyze the degree of mutation
-#' and maturity of each clonal lineage. This allows you to find high mature cells and cells with a large
+#' This function aligns all sequences (including germline) that belong to one clonal lineage and one cluster.
+#' After clustering and building the clonal lineage and germline, the next step is to analyze the degree of mutation
+#' and maturity of each clonal lineage. This allows for finding highly mature cells and cells with a large
 #' number of offspring. The phylogenetic analysis will find mutations that increase the affinity of BCR.
 #' Making alignment of the sequence is the first step towards sequence analysis including BCR.
 #'
@@ -34,7 +34,7 @@
 #' (will be saved in output table only if .verbose_output parameter is set to TRUE).
 #'
 #' @param .prepare_threads Number of threads to prepare results table.
-#' High number can cause heavy memory usage!
+#' Please note that a high number can cause heavy memory usage!
 #'
 #' @param .align_threads Number of threads for lineage alignment.
 #'
@@ -47,7 +47,7 @@
 #' increases memory usage. If FALSE, only aligned clusters and columns required for repClonalFamily() calculation
 #' will be included in the output.
 #'
-#' @param .nofail Return NA instead of stopping if Clustal W is not installed.
+#' @param .nofail Will return NA instead of stopping if Clustal W is not installed.
 #' Used to avoid raising errors in examples on computers where Clustal W is not installed.
 #'
 #' @return

diff --git a/R/clonality.R b/R/clonality.R
@@ -34,10 +34,10 @@
 #' such as 10, 100 and so on.
 #'
 #' Set \code{"rare"} to estimate relative abundance for the groups of rare clonotypes
-#' with low counts. Use \code{".bound"} to define the boundaries of clonotype groups.
+#' with low counts. Use \code{".bound"} to define the thresholds of clonotype groups.
 #'
 #' @param .perc A single numerical value ranging from 0 to 100.
-#' @param .clone.types A named numerical vector with the boundaries of the half-closed
+#' @param .clone.types A named numerical vector with the thresholds of the half-closed
 #' intervals that mark off clonal groups.
 #' @param .head A numerical vector with ranges of the top clonotypes.
 #' @param .bound A numerical vector with ranges of abundance for the rare clonotypes in

diff --git a/R/clustering.R b/R/clustering.R
@@ -8,7 +8,7 @@
 #' @importFrom factoextra hcut fviz_nbclust
 #' @importFrom stats kmeans as.dist cmdscale dist
 #'
-#' @description Cluster the data with one of the following methods:
+#' @description Clusters the data with one of the following methods:
 #'
 #' - \code{immunr_hclust} clusters the data using the hierarchical clustering from \link[factoextra]{hcut};
 #'
@@ -26,7 +26,7 @@
 #'
 #' @param .data Matrix or data frame with features, distance matrix or output from \link{repOverlapAnalysis} or \link{geneUsageAnalysis} functions.
 #'
-#' @param .k The number of clusters to create, passed as \code{k} to \link[factoextra]{hcut} or as \code{centers} to \link{kmeans}.
+#' @param .k The number of clusters to create, passed as \code{k} to \link[factoextra]{hcut} or as \code{centers} to \link{kmeans}.
 #'
 #' @param .k.max Limits the maximum number of clusters. It is passed as \code{k.max} to \link{fviz_nbclust} for \code{immunr_hclust} and \code{immunr_kmeans}.
 #'
@@ -41,15 +41,15 @@
 #' @param .dist If TRUE then ".data" is expected to be a distance matrix. If FALSE then the euclidean distance is computed for the input objects.
 #'
 #' @return
-#' \code{immunr_hclust} - list with two elements. First element is an output from \link{hcut}.
-#' Second element is an output from \link{fviz_nbclust}
+#' \code{immunr_hclust} - list with two elements. The first element is an output from \link{hcut}.
+#' The second element is an output from \link{fviz_nbclust}.
 #'
-#' \code{immunr_kmeans} - list with three elements. First element is an output from \link{kmeans}.
-#' Second element is an output from \link{fviz_nbclust}.
-#' Third element is the input dataset \code{.data}.
+#' \code{immunr_kmeans} - list with three elements. The first element is an output from \link{kmeans}.
+#' The second element is an output from \link{fviz_nbclust}.
+#' The third element is the input dataset \code{.data}.
 #'
-#' \code{immunr_dbscan} - list with two elements. First element is an output from \link{dbscan}.
-#' Second element is the input dataset \code{.data}.
+#' \code{immunr_dbscan} - list with two elements. The first element is an output from \link{dbscan}.
+#' The second element is the input dataset \code{.data}.
 #'
 #' @examples
 #' data(immdata)

diff --git a/R/data_docs.R b/R/data_docs.R
@@ -68,8 +68,8 @@ AA_TABLE_REVERSED <- AA_TABLE_REVERSED[order(names(AA_TABLE_REVERSED))]
 #'
 #' @description A dataset with single chain TCR data for testing and examplatory purposes.
 #'
-#' @format A list of two elements. First element ("data") is a list with data frames with clonotype tables.
-#' Second element ("meta") is a metadata table.
+#' @format A list of two elements. The first element ("data") is a list with data frames with clonotype tables.
+#' The second element ("meta") is a metadata table.
 #' \describe{
 #'   \item{data}{List of immune repertoire data frames.}
 #'   \item{meta}{Metadata}
@@ -84,9 +84,9 @@ AA_TABLE_REVERSED <- AA_TABLE_REVERSED[order(names(AA_TABLE_REVERSED))]
 #'
 #' @description A dataset with BCR data for testing and examplatory purposes.
 #'
-#' @format A list of two elements. First element ("data") is a list of 1 element named "full_clones"
+#' @format A list of two elements. The first element ("data") is a list of 1 element named "full_clones"
 #' that contains immune repertoire data frame.
-#' Second element ("meta") is empty metadata table.
+#' The second element ("meta") is an empty metadata table.
 #' \describe{
 #'   \item{data}{List of immune repertoire data frames.}
 #'   \item{meta}{Metadata}
@@ -101,7 +101,7 @@ AA_TABLE_REVERSED <- AA_TABLE_REVERSED[order(names(AA_TABLE_REVERSED))]
 #'
 #' @description A dataset with paired chain IG data for testing and examplatory purposes.
 #'
-#' @format A list of four elements.
+#' @format A list of four elements:
 #' "data" is a list with data frames with clonotype tables.
 #' "meta" is a metadata table.
 #' "bc_patients" is a list of barcodes corresponding to specific patients.

diff --git a/R/dimensions.R b/R/dimensions.R
@@ -17,7 +17,7 @@ default_scale_fun <- function(x) {
 #'
 #' @aliases immunr_pca immunr_mds immunr_tsne
 #'
-#' @description Collect a set of principal variables, reducing the number of not important variables
+#' @description Collects a set of principal variables, reducing the number of not important variables
 #' to analyse. Dimensionality reduction makes data analysis algorithms work faster and
 #' sometimes more accurate, since it also reduces noise in the data. Currently available
 #' methods are:
@@ -44,13 +44,13 @@ default_scale_fun <- function(x) {
 #' @param .perp The perplexity parameter for \link[Rtsne]{Rtsne}. Sepcifies the number
 #' of neighbours each data point must have in the resulting plot.
 #'
-#' @param .raw If TRUE then return non-processed output from dimensionality reduction
+#' @param .raw If TRUE then returns the non-processed output from dimensionality reduction
 #' algorithms. Pass FALSE if you want to visualise results.
 #'
-#' @param .orig If TRUE then return the original result from algorithms. Pass FALSE
+#' @param .orig If TRUE then returns the original result from algorithms. Pass FALSE
 #' if you want to visualise results.
 #'
-#' @param .dist If TRUE then assume ".data" is a distance matrix.
+#' @param .dist If TRUE then assumes that ".data" is a distance matrix.
 #'
 #' @param ... Other parameters passed to \link[Rtsne]{Rtsne}.
 #'

diff --git a/R/distance.R b/R/distance.R
@@ -18,17 +18,17 @@
 #'
 #' Every object must have columns in the immunarch compatible format \link{immunarch_data_format}
 #'
-#' @param .col A string that specifies the column name to be processed. Default value is 'CDR3.nt'.
+#' @param .col A string that specifies the column name to be processed. The default value is 'CDR3.nt'.
 #'
 #' @param .method Character value or user-defined function.
 #'
-#' @param .group_by Character vector of column names to group sequence by. Default value is c("V.first", "J.first"). Columns "V.first" and "J.first" containing first genes without allele suffixes are calculated automatically from "V.name" and "J.name" if absent in the data. Pass NA for no grouping options.
+#' @param .group_by Character vector of column names to group sequence by. The default value is c("V.first", "J.first"). Columns "V.first" and "J.first" containing first genes without allele suffixes are calculated automatically from "V.name" and "J.name" if absent in the data. Pass NA for no grouping options.
 #'
-#' @param .group_by_seqLength If TRUE  - add grouping by sequence length of .col argument
+#' @param .group_by_seqLength If TRUE  - adds grouping by sequence length of .col argument
 #'
 #' @param ... Extra arguments for user-defined function.
 #'
-#' Default value is \code{'hamming'} for Hamming distance which counts the number of character substitutions that turns b into a.
+#' The default value is \code{'hamming'} for Hamming distance which counts the number of character substitutions that turn b into a.
 #' If a and b have different number of characters the distance is Inf.
 #'
 #' Other possible values are:

diff --git a/R/diversity.R b/R/diversity.R
@@ -3,7 +3,7 @@ if (getRversion() >= "2.15.1") {
 }
 
 
-#' Main function for immune repertoire diversity estimation
+#' The main function for immune repertoire diversity estimation
 #'
 #' @concept diversity
 #'
@@ -16,8 +16,7 @@ if (getRversion() >= "2.15.1") {
 #' @importFrom rlang sym
 #'
 #' @description
-#' This is a utility function to estimate the diversity of species or objects
-#' in the given distribution.
+#' This is a utility function to estimate the diversity of species or objects in the given distribution.
 #'
 #' Note: functions will check if .data is a distribution of a random variable (sum == 1) or not.
 #' To force normalisation and / or to prevent this, set .do.norm to TRUE (do normalisation)
@@ -35,7 +34,7 @@ if (getRversion() >= "2.15.1") {
 #'
 #' Note: each connection must represent a separate repertoire.
 #'
-#' @param .method Pick a method used for estimation out of a following list: chao1,
+#' @param .method Picks a method used for estimation out of the following list: chao1,
 #' hill, div, gini.simp, inv.simp, gini, raref, d50, dxx.
 #' @param .col A string that specifies the column(s) to be processed. Pass one of the
 #' following strings, separated by the plus sign: "nt" for nucleotide sequences,
@@ -51,11 +50,11 @@ if (getRversion() >= "2.15.1") {
 #' @param .extrapolation An integer. An upper limit for the number of clones to extrapolate to.
 #' Pass 0 (zero) to turn extrapolation subroutines off.
 #' @param .perc Set the percent to dXX index measurement.
-#' @param .norm Normalise rarefaction curves.
-#' @param .verbose If TRUE then output progress.
-#' @param .do.norm One of the three values - NA, TRUE or FALSE. If NA then check for distrubution (sum(.data) == 1)
-#' and normalise if needed with the given laplace correction value. if TRUE then do normalisation and laplace
-#' correction. If FALSE then don't do normalisaton and laplace correction.
+#' @param .norm Normalises rarefaction curves.
+#' @param .verbose If TRUE then outputs progress.
+#' @param .do.norm One of the three values - NA, TRUE or FALSE. If NA then checks for distribution (sum(.data) == 1)
+#' and normalises if needed with the given laplace correction value. If TRUE then does normalisation and laplace
+#' correction. If FALSE then does neither normalisation nor laplace correction.
 #' @param .laplace A numeric value, which is used as a pseudocount for Laplace
 #' smoothing.
 #'
@@ -362,7 +361,7 @@ rarefaction <- function(.data, .step = NA, .quantile = c(.025, .975),
   }
 
   if (.verbose) {
-    pb <- set_pb(sum(sapply(1:length(.data), function(i) {
+    pb <- set_pb(sum(sapply(seq_along(.data), function(i) {
       bc.vec <- .data[[i]]
       bc.sum <- sum(.data[[i]])
       sizes <- seq(.step, bc.sum, .step)
@@ -373,10 +372,13 @@ rarefaction <- function(.data, .step = NA, .quantile = c(.025, .975),
     })))
   }
 
-  muc.list <- lapply(1:length(.data), function(i) {
+  muc.list <- lapply(seq_along(.data), function(i) {
     Sobs <- length(.data[[i]])
     bc.vec <- .data[[i]]
-    Sest <- chao1(bc.vec)
+    Sest <- chao1(bc.vec)[1]
+    if (is.na(Sest)) {
+      Sest <- Sobs
+    }
     n <- sum(bc.vec)
     sizes <- seq(.step, n, .step)
     # if (sizes[length(sizes)] != n) {
@@ -389,11 +391,11 @@ rarefaction <- function(.data, .step = NA, .quantile = c(.025, .975),
       alphas <- sapply(freqs, function(k) .alpha(n, k, sz))
 
       # poisson
-      Sind <- sum(sapply(1:length(freqs), function(k) (1 - alphas[k]) * counts[k]))
-      if (Sest[1] == Sobs) {
+      Sind <- sum(sapply(seq_along(freqs), function(k) (1 - alphas[k]) * counts[k]))
+      if (Sest == Sobs) {
         SD <- 0
       } else {
-        SD <- sqrt(sum(sapply(1:length(freqs), function(k) (1 - alphas[k])^2 * counts[k])) - Sind^2 / Sest[1])
+        SD <- sqrt(sum(sapply(seq_along(freqs), function(k) (1 - alphas[k])^2 * counts[k])) - Sind^2 / Sest[1])
       }
       t <- Sind - Sobs
       if (t != 0) {
@@ -419,7 +421,7 @@ rarefaction <- function(.data, .step = NA, .quantile = c(.025, .975),
       )
       if (length(sizes) != 1) {
         ex.res <- t(sapply(sizes, function(sz) {
-          f0 <- Sest[1] - Sobs
+          f0 <- Sest - Sobs
           f1 <- counts["1"]
           if (is.na(f1) || f0 == 0) {
             Sind <- Sobs

diff --git a/R/dynamics.R b/R/dynamics.R
@@ -8,7 +8,7 @@
 #' @aliases trackClonotypes
 #'
 #' @description
-#' Track the temporal dynamics of clonotypes in repertoires. For example, tracking across multiple
+#' Tracks the temporal dynamics of clonotypes in repertoires. For example, tracking across multiple
 #' time points after vaccination.
 #'
 #' @param .data The data to process. It can be a \link{data.frame}, a
@@ -25,12 +25,12 @@
 #'
 #' @param .which An argument that regulates which clonotypes to choose for tracking. There are three options for this argument:
 #'
-#' 1) pass a list with two elements \code{list(X, Y)}, where \code{X} is the name or the index of a target repertoire from ".data", and
+#' 1) pass a list with two elements \code{list(X, Y)}, where \code{X} is the name or the index of a target repertoire from ".data", and
 #' \code{Y} is the number of the most abundant clonotypes to take from \code{X}.
 #'
-#' 2) pass a character vector of sequences to take from all data frames;
+#' 2) pass a character vector of sequences to take from all data frames;
 #'
-#' 3) pass a data frame (data table, database) with one or more columns - first for sequences, and other for gene segments (if applicable).
+#' 3) pass a data frame (data table, database) with one or more columns - the first for sequences, and the others for gene segments (if applicable).
 #'
 #' See the "Examples" below with examples for each option.
 #'
@@ -40,7 +40,7 @@
 #' sequences with Joining genes, or any combination of the above.
 #' Used only if ".which" has option 1) or option 2).
 #'
-#' @param .norm Logical. If TRUE then use Proportion instead of the number of Clones per clonotype to store
+#' @param .norm Logical. If TRUE then uses Proportion instead of the number of Clones per clonotype to store
 #' in the function output.
 #'
 #' @description

diff --git a/R/filters.R b/R/filters.R
@@ -8,37 +8,37 @@
 #' @importFrom tidyselect starts_with
 #'
 #' @param .data The data to be processed. Must be the list of 2 elements:
-#' data table and metadata table.
+#' a data table and a metadata table.
 #' @param .method Method of filtering. Implemented methods:
 #' by.meta, by.repertoire (by.rep), by.clonotype (by.cl)
 #' Default value: 'by.clonotype'.
 #' @param .query Filtering query. It's a named list of filters that will be applied
 #' to data.
 #' Possible values for names in this list are dependent on filter methods:
-#' - by.meta: filter by metadata. Names in the named list are metadata column headers.
-#' - by.repertoire: filter by number of clonotypes or total number of clones in sample.
+#' - by.meta: filters by metadata. Names in the named list are metadata column headers.
+#' - by.repertoire: filters by the number of clonotypes or total number of clones in sample.
 #' Possible names in the named list are "n_clonotypes" and "n_clones".
-#' - by.clonotype: filter by data in all samples. Names in the named list are
+#' - by.clonotype: filters by data in all samples. Names in the named list are
 #' data column headers.
 #' Elements of the named list for each of the filters are filtering options.
 #' Possible values for filtering options:
-#' - include("STR1", "STR2", ...): keep only rows with matching values.
+#' - include("STR1", "STR2", ...): keeps only rows with matching values.
 #' Available for methods: "by.meta", "by.clonotype".
-#' - exclude("STR1", "STR2", ...): remove rows with matching values.
+#' - exclude("STR1", "STR2", ...): removes rows with matching values.
 #' Available for methods: "by.meta", "by.clonotype".
-#' - lessthan(value): keep rows/samples with numeric values less than specified.
+#' - lessthan(value): keeps rows/samples with numeric values less than specified.
 #' Available for methods: "by.meta", "by.repertoire", "by.clonotype".
-#' - morethan(value): keep rows/samples with numeric values more than specified.
+#' - morethan(value): keeps rows/samples with numeric values more than specified.
 #' Available for methods: "by.meta", "by.repertoire", "by.clonotype".
-#' - interval(from, to): keep rows/samples with numeric values that fits in this interval.
+#' - interval(from, to): keeps rows/samples with numeric values that fit in this interval.
 #' from is inclusive, to is exclusive.
 #' Available for methods: "by.meta", "by.repertoire", "by.clonotype".
 #' Default value: 'list(CDR3.aa = exclude("partial", "out_of_frame"))'.
 #' @param .match Matching method for "include" and "exclude" options in query.
 #' Possible values:
-#' - exact: match only the exact specified string;
-#' - startswith: match all strings starting with the specified substring;
-#' - substring: match all strings containing the specified substring.
+#' - exact: matches only the exact specified string;
+#' - startswith: matches all strings starting with the specified substring;
+#' - substring: matches all strings containing the specified substring.
 #' Default value: 'exact'.
 #'
 #' @examples
@@ -227,9 +227,15 @@ filter_table <- function(.table, .column_name, .query_type, .query_args, .match)
     if (.match == "exact") {
       .table %<>% subset(!get(.column_name) %in% .query_args)
     } else if (.match == "startswith") {
-      .table <- .table[-startswith_rows(.table, .column_name, .query_args), ]
+      matching_rows <- startswith_rows(.table, .column_name, .query_args)
+      if (length(matching_rows) > 0) {
+        .table <- .table[-matching_rows, ]
+      }
     } else if (.match == "substring") {
-      .table <- .table[-substring_rows(.table, .column_name, .query_args), ]
+      matching_rows <- substring_rows(.table, .column_name, .query_args)
+      if (length(matching_rows) > 0) {
+        .table <- .table[-matching_rows, ]
+      }
     }
   } else if (.query_type == "lessthan") {
     .table %<>% subset(get(.column_name) < as_numeric_or_fail(.query_args))