diff --git a/DESCRIPTION b/DESCRIPTION index 31424ca..8b8cd5a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: phylotax Type: Package Title: Refine taxonomic assignment of environmental sequences using a taxonomic tree -Version: 0.0.2.1 +Version: 0.0.3 Authors@R: person(given = "Brendan", family = "Furneaux", email = "brendan.furneaux@gmail.com", role = c("aut", "cre")) diff --git a/NEWS.md b/NEWS.md index c16494a..2dbf068 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,20 @@ +# phlotax 0.0.3 + +* **BREAKING CHANGE** `phylotax` returns taxonomic tables in four categories; + * "`tip_taxa`" is the assignments which PHYLOTAX has made itself. + * "`rejected`" are primary assignments which PHYLOTAX has rejected. + * "`retained`" are primary assignments which PHYLOTAX has not rejected; + however some of them may still be ambiguous. + * "`missing`" are primary assignments whose labels are not present in the + tree, so PHYLOTAX has not done anything with them. (But note that this will + be empty if no tree was given). +* `phylotax()` now returns an S3 object of class "`phylotax`". This should not + break anything, and it allows the possibility of nice improvements in the + future. +* `phylotax()$node_taxa` now includes a "`label`" column, and populates it with + node labels if they exist, or just the numbers if they don't. Node labels are + also used in trace output. + # phylotax 0.0.2.1 * Two quick bugfix, applying to errors in `taxonomy_sintax()` and diff --git a/R/taxonomy.R b/R/taxonomy.R index 3b267ba..4535057 100644 --- a/R/taxonomy.R +++ b/R/taxonomy.R @@ -541,10 +541,14 @@ new_phylotax_env <- function(tree, taxa, parent = parent.frame()) { .parent = parent, node_taxa = tibble::tibble( node = integer(), + label = NULL, rank = taxa$rank[FALSE], taxon = character() ), - tip_taxa = dplyr::filter(taxa, .data$label %in% tree$tip.label), + tip_taxa = dplyr::filter(taxa, FALSE), + retained = dplyr::filter(taxa, .data$label %in% tree$tip.label), + rejected = dplyr::filter(taxa, FALSE), + missing = dplyr::filter(taxa, !.data$label %in% tree$tip.label), tree = tree ) } @@ -552,35 +556,39 @@ new_phylotax_env <- function(tree, taxa, parent = parent.frame()) { phylotax_ <- function(tree, taxa, node, ranks, method, e) { if (length(ranks) == 0) return() - + nodelabel <- if (!is.null(tree$node.label)) { + tree$node.label[node - ape::Ntip(tree)] + } else { + as.character(node) + } parents <- phangorn::Ancestors(tree, node, type = "all") for (r in ranks) { if (is.ordered(ranks)) r <- ordered(r, levels = levels(ranks)) if (any(e$node_taxa$node %in% parents & e$node_taxa$rank == r)) next - taxon <- clade_taxon(tree, e$tip_taxa, node, r) + taxon <- clade_taxon(tree, e$retained, node, r) if (is.na(taxon)) { - futile.logger::flog.debug("Could not assign a %s to node %d.", r, node) + futile.logger::flog.debug("Could not assign a %s to node %s.", r, nodelabel) for (n in phangorn::Children(tree, node)) { - phylotax_(tree, e$tip_taxa, n, ranks, method, e) + phylotax_(tree, e$retained, n, ranks, method, e) } break } else { children <- phangorn::Children(tree, node) if (length(children) > 0) { futile.logger::flog.info( - "Assigned node %d and its %d children to %s %s.", - node, length(children), as.character(r), taxon) + "Assigned node %s and its %d children to %s %s.", + nodelabel, length(children), as.character(r), taxon) } else { - futile.logger::flog.info("Assigned node %d to %s %s.", node, + futile.logger::flog.info("Assigned node %s to %s %s.", nodelabel, as.character(r), taxon) } ranks <- ranks[-1] e$node_taxa <- dplyr::bind_rows( e$node_taxa, - tibble::tibble(node = node, rank = r, taxon = taxon) + tibble::tibble(node = node, label = nodelabel, rank = r, taxon = taxon) ) tips <- tree$tip.label[phangorn::Descendants(tree, node, type = "tips")[[1]]] - wrongTaxa <- e$tip_taxa %>% + wrongTaxa <- e$retained %>% dplyr::filter( .data$label %in% tips, .data$rank == r, @@ -596,11 +604,16 @@ phylotax_ <- function(tree, taxa, node, ranks, method, e) { newAssign[[n]] <- unname(method[n]) } # remove assignments which are not consistent with the one we just chose - e$tip_taxa <- dplyr::bind_rows( - dplyr::filter(e$tip_taxa, .data$rank < r), - dplyr::filter(e$tip_taxa, .data$rank >= r) %>% - dplyr::anti_join(wrongTaxa, by = names(wrongTaxa)), - newAssign + e$tip_taxa <- dplyr::bind_rows(e$tip_taxa, newAssign) + e$rejected <- dplyr::bind_rows( + e$rejected, + dplyr::filter(e$retained, .data$rank >= r) %>% + dplyr::semi_join(wrongTaxa, by = names(wrongTaxa)) + ) + e$retained <- dplyr::bind_rows( + dplyr::filter(e$retained, .data$rank < r), + dplyr::filter(e$retained, .data$rank >= r) %>% + dplyr::anti_join(wrongTaxa, by = names(wrongTaxa)) ) } } @@ -647,14 +660,20 @@ phylotax_ <- function(tree, taxa, node, ranks, method, e) { #' treat each unique combination of values in these columns as a distinct #' method. #' -#' @return a list with two elements, "tip_taxa" and "node_taxa". "tip_taxa" is -#' a `tibble::tibble()` with the same format as `taxa`, in -#' which assignments which are inconsistent with the phylogeny have been -#' removed, and new assignments deduced or confirmed from the phylogeny. -#' These are identified by the value "phylotax" in the "method" column, -#' which is created if it does not already exist. "node_taxa" has columns -#' "node", "rank" and "taxon", giving taxonomic assignments for the nodes of -#' the tree. +#' @return an S3 object with class "`phylotax`", with five elements: +#' * "`tip_taxa` a `tibble::tibble()` with the same format as `taxa`, containing +#' taxonomy assignments made by PHYLOTAX to tips. +#' * "`node_taxa`" a `tibble::tibble()` with columns "`node`", "`label`", +#' "`rank`" and "`taxon`" giving taxonomy assignments made by PHYLOTAX to +#' internal nodes. +#' * "`rejected`" a `tibble::tibble()` with the same format as `taxa` giving +#' primary assignments which have been rejected by PHYLOTAX. +#' * "`retained`" a `tibble::tibble()` with the same format as `taxa` giving +#' primary assignments which have not been rehected by PHYLOTAX. These may +#' contain inconsistencies that PHYLOTAX was unable to resolve. +#' * "`missing`" a `tibble::tibble()` with the same format as `taxa`, giving the +#' primary assignments which have not been assessed by PHULOTAX because they +#' have labels which are not present on the tree. #' #' @export phylotax <- function( @@ -670,9 +689,13 @@ phylotax <- function( e <- new_phylotax_env(tree, count_assignments(taxa), ranks) ranks <- sort(unique(taxa$rank)) phylotax_(tree, taxa, phangorn::getRoot(tree), ranks, method, e) - e$tip_taxa$n_tot <- NULL - e$tip_taxa$n_diff <- NULL - as.list(e) + for (member in c("missing", "retained", "rejected", "tip_taxa")) + for (n in c("n_tot", "n_diff")) + e[[member]][[n]] <- NULL + structure( + as.list(e), + class = "phylotax" + ) } #' Simple phylogenetic tree for use in examples diff --git a/README.Rmd b/README.Rmd index bb1c490..8aa7e7c 100644 --- a/README.Rmd +++ b/README.Rmd @@ -76,13 +76,24 @@ supports it. phylotax_out <- phylotax(tree = example_tree(), taxa = example_taxa()) ``` -PHYLOTAX returns a list containing the tree, taxa assigned to tips, -and taxa assigned to nodes. Let's look at the tip taxa assignments. +PHYLOTAX returns a list of class "`phylotax`" containing the tree, +taxa assignments for tips and internal nodes, as well as tables dividing the +primary assignments into those which were rejected, those which were retained, +and those which were missing from the input tree. ```{r} phylotax_out$tip_taxa ``` +```{r} +phylotax_out$retained +``` + +```{r} +phylotax_out$rejected +``` + + Phylotax has used the following logic: 1. It's not possible to decide what the root (node 1) is, because one of its diff --git a/README.md b/README.md index 24fefaf..c7309e6 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ -# phylotax +phylotax +======== @@ -11,7 +12,8 @@ status](https://travis-ci.com/brendanf/phylotax.svg?branch=master)](https://trav coverage](https://codecov.io/gh/brendanf/phylotax/branch/master/graph/badge.svg)](https://codecov.io/gh/brendanf/phylotax?branch=master) -## Installation +Installation +------------ Install the development version from [GitHub](https://github.com/) with: @@ -20,7 +22,8 @@ Install the development version from [GitHub](https://github.com/) with: devtools::install_github("brendanf/phylotax") ``` -## Usage +Usage +----- The PHYLOTAX algorithm takes as input taxonomic annotations from one or more primary taxonomic assignment algoirthms, and refines them using a @@ -40,15 +43,14 @@ plot(example_tree(), show.node.label = TRUE) And here is a set of taxonomic assignments for the tips of the tree, based on two hypothetical primary assignment algorithms “XTAX” and -“YTAX”. Some of the tips have been assigned to two genera: “Tax1” -and “Tax2”. The `phylotax` package includes the function `taxtable()` -which can generate a table of this type based on the output of various -primary assignment algorithms, but all that’s important is that it -contains the columns “label”, “rank”, and “taxon”. The ranks need to be -one of “rootrank”, “domain”, “kingdom”, “phylum”, “class”, “order”, -“family”, “genus”, and “species”. Our example also has a “method” -column, which PHYLOTAX uses to identify which assignments come from the -same source. +“YTAX”. Some of the tips have been assigned to two genera: “Tax1” and +“Tax2”. The `phylotax` package includes the function `taxtable()` which +can generate a table of this type based on the output of various primary +assignment algorithms, but all that’s important is that it contains the +columns “label”, “rank”, and “taxon”. The ranks need to be one of +“rootrank”, “domain”, “kingdom”, “phylum”, “class”, “order”, “family”, +“genus”, and “species”. Our example also has a “method” column, which +PHYLOTAX uses to identify which assignments come from the same source. ``` r example_taxa() @@ -87,28 +89,46 @@ tree supports it. ``` r phylotax_out <- phylotax(tree = example_tree(), taxa = example_taxa()) -#> INFO [2020-10-14 17:35:34] Assigned node 9 and its 2 children to genus Tax2. -#> INFO [2020-10-14 17:35:34] Assigned node 10 and its 2 children to genus Tax1. +#> INFO [2020-10-15 15:02:50] Assigned node 3 and its 2 children to genus Tax2. +#> INFO [2020-10-15 15:02:50] Assigned node 4 and its 2 children to genus Tax1. ``` -PHYLOTAX returns a list containing the tree, taxa assigned to tips, and -taxa assigned to nodes. Let’s look at the tip taxa assignments. +PHYLOTAX returns a list of class “`phylotax`” containing the tree, taxa +assignments for tips and internal nodes, as well as tables dividing the +primary assignments into those which were rejected, those which were +retained, and those which were missing from the input tree. ``` r phylotax_out$tip_taxa -#> # A tibble: 10 x 4 -#> label method rank taxon -#> -#> 1 C XTAX genus Tax2 -#> 2 B YTAX genus Tax2 -#> 3 C YTAX genus Tax2 -#> 4 D YTAX genus Tax1 -#> 5 F YTAX genus Tax1 -#> 6 B PHYLOTAX genus Tax2 -#> 7 C PHYLOTAX genus Tax2 -#> 8 E PHYLOTAX genus Tax1 -#> 9 F PHYLOTAX genus Tax1 -#> 10 D PHYLOTAX genus Tax1 +#> # A tibble: 5 x 4 +#> label method rank taxon +#> +#> 1 B PHYLOTAX genus Tax2 +#> 2 C PHYLOTAX genus Tax2 +#> 3 E PHYLOTAX genus Tax1 +#> 4 F PHYLOTAX genus Tax1 +#> 5 D PHYLOTAX genus Tax1 +``` + +``` r +phylotax_out$retained +#> # A tibble: 5 x 4 +#> label method rank taxon +#> +#> 1 C XTAX genus Tax2 +#> 2 B YTAX genus Tax2 +#> 3 C YTAX genus Tax2 +#> 4 D YTAX genus Tax1 +#> 5 F YTAX genus Tax1 +``` + +``` r +phylotax_out$rejected +#> # A tibble: 2 x 4 +#> label method rank taxon +#> +#> 1 B XTAX genus Tax1 +#> 2 D XTAX genus Tax2 ``` Phylotax has used the following logic: diff --git a/man/figures/README-unnamed-chunk-2-1.png b/man/figures/README-unnamed-chunk-2-1.png index cd8d860..aa6631e 100644 Binary files a/man/figures/README-unnamed-chunk-2-1.png and b/man/figures/README-unnamed-chunk-2-1.png differ diff --git a/man/phylotax.Rd b/man/phylotax.Rd index 8a82ecd..11dee32 100644 --- a/man/phylotax.Rd +++ b/man/phylotax.Rd @@ -36,14 +36,22 @@ with values/levels from the set rootrank, domain, kingdom, phylum, class, order, how to identify different methods. See details.} } \value{ -a list with two elements, "tip_taxa" and "node_taxa". "tip_taxa" is -a \code{tibble::tibble()} with the same format as \code{taxa}, in -which assignments which are inconsistent with the phylogeny have been -removed, and new assignments deduced or confirmed from the phylogeny. -These are identified by the value "phylotax" in the "method" column, -which is created if it does not already exist. "node_taxa" has columns -"node", "rank" and "taxon", giving taxonomic assignments for the nodes of -the tree. +an S3 object with class "\code{phylotax}", with five elements: +\itemize{ +\item "\code{tip_taxa} a \code{tibble::tibble()} with the same format as \code{taxa}, containing +taxonomy assignments made by PHYLOTAX to tips. +\item "\code{node_taxa}" a \code{tibble::tibble()} with columns "\code{node}", "\code{label}", +"\code{rank}" and "\code{taxon}" giving taxonomy assignments made by PHYLOTAX to +internal nodes. +\item "\code{rejected}" a \code{tibble::tibble()} with the same format as \code{taxa} giving +primary assignments which have been rejected by PHYLOTAX. +\item "\code{retained}" a \code{tibble::tibble()} with the same format as \code{taxa} giving +primary assignments which have not been rehected by PHYLOTAX. These may +contain inconsistencies that PHYLOTAX was unable to resolve. +\item "\code{missing}" a \code{tibble::tibble()} with the same format as \code{taxa}, giving the +primary assignments which have not been assessed by PHULOTAX because they +have labels which are not present on the tree. +} } \description{ Assign taxon labels to nodes in a tree when there is a consensus of IDs on descendent tips. diff --git a/tests/testthat/node_taxa.rds b/tests/testthat/node_taxa.rds new file mode 100644 index 0000000..0649f0b Binary files /dev/null and b/tests/testthat/node_taxa.rds differ diff --git a/tests/testthat/phylotax.rds b/tests/testthat/phylotax.rds deleted file mode 100644 index ae68a10..0000000 Binary files a/tests/testthat/phylotax.rds and /dev/null differ diff --git a/tests/testthat/rejected.rds b/tests/testthat/rejected.rds new file mode 100644 index 0000000..c72805a Binary files /dev/null and b/tests/testthat/rejected.rds differ diff --git a/tests/testthat/retained.rds b/tests/testthat/retained.rds new file mode 100644 index 0000000..2046b12 Binary files /dev/null and b/tests/testthat/retained.rds differ diff --git a/tests/testthat/test-consensus.R b/tests/testthat/test-consensus.R new file mode 100644 index 0000000..516ab71 --- /dev/null +++ b/tests/testthat/test-consensus.R @@ -0,0 +1,9 @@ +consensus_test <- tibble::tribble( + ~label, ~method, ~rank, ~taxon, + "A", "XTAX", "genus", "G1", + "A", "XTAX", "species", "G1 s1", + "A", "YTAX", "genus", "G2") + +test_that("consensus is not assumed for lower taxa when higher taxa are inconsistent", { + expect_equal(length(phylotax(taxa = consensus_test)$tip_taxa$method), 0) +}) diff --git a/tests/testthat/test-phylotax.R b/tests/testthat/test-phylotax.R index 3d56f56..fe5a45c 100644 --- a/tests/testthat/test-phylotax.R +++ b/tests/testthat/test-phylotax.R @@ -1,3 +1,8 @@ +phylotax_out <- phylotax(tree = example_tree(), taxa = example_taxa()) + test_that("phylotax does not revert", { - expect_known_value(phylotax(tree = example_tree(), taxa = example_taxa()), "phylotax.rds", update = FALSE) + expect_known_value(phylotax_out$rejected, "rejected.rds", update = FALSE) + expect_known_value(phylotax_out$retained, "retained.rds", update = FALSE) + expect_known_value(phylotax_out$tip_taxa, "tip_taxa.rds", update = FALSE) + expect_known_value(phylotax_out$node_taxa, "node_taxa.rds", update = FALSE) }) diff --git a/tests/testthat/tip_taxa.rds b/tests/testthat/tip_taxa.rds new file mode 100644 index 0000000..53d9075 Binary files /dev/null and b/tests/testthat/tip_taxa.rds differ