Merge pull request #34 from monarch-initiative/checks_pass

Passing more checks
monarch-initiative · Aug 16, 2024 · 3656ffe · 3656ffe
2 parents 6650a30 + f4c4e4f
commit 3656ffe
Show file tree

Hide file tree

Showing 59 changed files with 523 additions and 3,872 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -3,7 +3,7 @@ Type: Package
 Title: Monarch Knowledge Graph Queries
 Description: R package for easy access, manipulation, and analysis of
 		Monarch KG data Resources.
-Version: 1.1.0
+Version: 1.2.1
 URL: https://github.com/monarch-initiative/monarchr
 BugReports: https://github.com/monarch-initiative/monarchr/issues
 Authors@R:
@@ -45,6 +45,5 @@ Imports:
     ggraph,
     ggplot2
 Suggests: 
-		ggraph,
     testthat (>= 3.0.0),
     rworkflows
diff --git a/NAMESPACE b/NAMESPACE
@@ -24,13 +24,15 @@ export(file_engine)
 export(file_engine_check)
 export(get_engine)
 export(kg_join)
+export(load_kgx)
 export(monarch_engine)
 export(monarch_engine_check)
 export(monarch_search)
 export(monarch_semsim)
 export(neo4j_engine)
 export(neo4j_engine_check)
 export(nodes)
+export(save_kgx)
 export(summarize_neighborhood)
 export(tbl_kgx)
 import(dplyr)
@@ -40,6 +42,7 @@ import(knitr)
 import(stringr)
 import(tidygraph)
 importFrom(archive,archive_read)
+importFrom(archive,archive_write_files)
 importFrom(assertthat,assert_that)
 importFrom(dplyr,slice_head)
 importFrom(httr,GET)
@@ -55,6 +58,7 @@ importFrom(purrr,map_chr)
 importFrom(readr,col_character)
 importFrom(readr,cols)
 importFrom(readr,read_tsv)
+importFrom(readr,write_tsv)
 importFrom(rlang,enquos)
 importFrom(stringr,str_detect)
 importFrom(stringr,str_replace_all)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,18 @@
 
 
+
+# monarchr 1.2.1
+
+## New features
+
+* New `save_kgx()` and `load_kgx()` for saving/loading graphs.
+
+## Bugfixes
+
+* Reduced size of package with smaller example data
+* Fixed a bug in `expand()` where the engine was not carried through for file-engine backed graphs
+* Unit tests dependent on live-hosted Monarch KG now allow for minor changes in result counts
+
 # monarchr 1.1.0
 
 ## New features

diff --git a/R/cypher_query.R b/R/cypher_query.R
@@ -18,7 +18,7 @@
 #' result <- cypher_query(engine, query, parameters)
 #' print(result)
 #' @importFrom neo2R cypher
-cypher_query <- function(engine, ...) {
+cypher_query <- function(engine, query, parameters = NULL, ...) {
     UseMethod("cypher_query")
 }
 
diff --git a/R/cypher_query_df.R b/R/cypher_query_df.R
@@ -17,6 +17,6 @@
 #' result <- cypher_query_df(engine, query, parameters)
 #' print(result)
 #' @importFrom neo2R cypher
-cypher_query_df <- function(engine, ...) {
+cypher_query_df <- function(engine, query, parameters = list(), ...) {
     UseMethod("cypher_query_df")
 }
diff --git a/R/expand.R b/R/expand.R
@@ -29,9 +29,9 @@
 #'
 #'
 #' @examples
-#' ## Using local MONDO KGX file (packaged with monarchr)
-#' phenos <- file_engine(system.file("extdata", "mondo_kgx_tsv.tar.gz",
-#'                       package = "monarchr")) |>
+#' ## Using example KGX file packaged with monarchr
+#' filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")
+#' phenos <- file_engine(filename) |>
 #'           fetch_nodes(query_ids = "MONDO:0007525") |>
 #'           expand(predicates = "biolink:has_phenotype",
 #'                  categories = "biolink:PhenotypicFeature")
@@ -60,6 +60,7 @@ expand <- function(graph,
 									 categories = NULL,
 									 transitive = FALSE,
 									 drop_unused_query_nodes = FALSE,
-									 ...) {
+
+									 					 ...) {
 	UseMethod("expand")
 }
diff --git a/R/expand_file_engine.R b/R/expand_file_engine.R
@@ -8,19 +8,19 @@ transitive_query_internal <- function(engine,
                                       categories = NULL,
                                       drop_unused_query_nodes = FALSE) {
 
-    # TODO: this block will never trigger; the check is done in the main function, remove
-    if(length(predicates) > 1) {
-        # we call recusively on each predicate
-        for(predicate in predicates) {
-            g2 <- transitive_query_internal(engine,
-                                            g,
-                                            direction = direction,
-                                            predicates = predicate,
-                                            categories = categories,
-                                            drop_unused_query_nodes = TRUE)
-            suppressMessages(g <- tidygraph::graph_join(g, g2), classes = "message") # suppress joining info
-        }
-    }
+    # # TODO: this block will never trigger; the check is done in the main function, remove
+    # if(length(predicates) > 1) {
+    #     # we call recusively on each predicate
+    #     for(predicate in predicates) {
+    #         g2 <- transitive_query_internal(engine,
+    #                                         g,
+    #                                         direction = direction,
+    #                                         predicates = predicate,
+    #                                         categories = categories,
+    #                                         drop_unused_query_nodes = TRUE)
+    #         suppressMessages(g <- tidygraph::graph_join(g, g2), classes = "message") # suppress joining info
+    #     }
+    # }
 
     # assert that direction is "out" or "in"
     assert_that(direction == "out" | direction == "in", msg = "Direction must be 'out' or 'in' when using transitive closure.")
@@ -74,6 +74,7 @@ transitive_query_internal <- function(engine,
             filter(id %in% bfs_nodes)
     }
 
+    attr(bfs_result, "last_engine") <- engine
     return(bfs_result)
 }
 

diff --git a/R/fetch_nodes.R b/R/fetch_nodes.R
@@ -38,15 +38,15 @@
 #'
 #' @examples
 #' # file_engine supports the same features as neo4j_engine
-#' # (using the MONDO KGX file packaged with monarchr)
-#' filename <- system.file("extdata", "mondo_kgx_tsv.tar.gz", package = "monarchr")
+#' # (using the example KGX file packaged with monarchr)
+#' filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")
 #'
 #' file_engine(filename) |>
 #'   fetch_nodes(query_ids = c("MONDO:0007525", "MONDO:0007526"))
 #'
-#' # grab all rare diseases
+#' # grab all Homo sapiens genes
 #' file_engine(filename) |>
-#'   fetch_nodes("rare" %in_list% subsets & "biolink:Disease" %in_list% category)
+#'   fetch_nodes(in_taxon_label == "Homo sapiens" & "biolink:Gene" %in_list% category)
 #'
 #' @export
 fetch_nodes <- function(engine, ..., query_ids = NULL, limit = NULL) {

diff --git a/R/file_engine.R b/R/file_engine.R
@@ -18,8 +18,8 @@
 #' library(tidygraph)
 #' library(dplyr)
 #'
-#' # Using a local MONDO KGX file (packaged with monarchr)
-#' filename <- system.file("extdata", "mondo_kgx_tsv.tar.gz", package = "monarchr")
+#' # Using example KGX file packaged with monarchr
+#' filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")
 #' engine <- file_engine(filename)
 #'
 #' res <- engine |> fetch_nodes(query_ids = c("MONDO:0007522", "MONDO:0007947"))

diff --git a/R/get_engine.R b/R/get_engine.R
@@ -6,8 +6,8 @@
 #' @param fail_if_missing If TRUE, fail if there is no engine associated with the graph.
 #' @return A graph engine object.
 #' @examples
-#' # (using the MONDO KGX file packaged with monarchr)
-#' filename <- system.file("extdata", "mondo_kgx_tsv.tar.gz", package = "monarchr")
+#' # Using example KGX file packaged with monarchr
+#' filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")
 #'
 #' g <- file_engine(filename) |>
 #'   fetch_nodes(query_ids = c("MONDO:0007525", "MONDO:0007526"))

diff --git a/R/kg_join.R b/R/kg_join.R
@@ -20,21 +20,21 @@
 #' @return A `tbl_kgx()` graph
 #' @export
 #' @examples
-#' ## Using local MONDO KGX file (packaged with monarchr)
-#' monarch <- file_engine(system.file("extdata", "mondo_kgx_tsv.tar.gz",
-#'                               package = "monarchr"))
+#' ## Using example KGX file packaged with monarchr
+#' filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")
+#' engine <- file_engine(filename)
 #'
-#' eds_and_phenos <- monarch |>
+#' eds_and_phenos <- engine |>
 #'                   fetch_nodes(query_ids = "MONDO:0007525") |>
 #'                   expand(predicates = "biolink:has_phenotype",
 #'                          categories = "biolink:PhenotypicFeature")
 #'
-#' marfan_and_phenos <- monarch |>
+#' marfan_and_phenos <- engine |>
 #'                      fetch_nodes(query_ids = "MONDO:0007947") |>
 #'                      expand(predicates = "biolink:has_phenotype",
 #'                             categories = "biolink:PhenotypicFeature")
 #'
-#' combined <- graph_join(eds_and_phenos, marfan_and_phenos)
+#' combined <- kg_join(eds_and_phenos, marfan_and_phenos)
 #' print(combined)
 kg_join <- function(graph1, graph2, ...) {
   UseMethod("kg_join")

diff --git a/R/kg_join.tbl_kgx.R b/R/kg_join.tbl_kgx.R
@@ -1,14 +1,14 @@
 #' @import tidygraph
 #' @import dplyr
 #' @export
-kg_join.tbl_kgx <- function(g1, g2) {
-    nodes_g1 <- nodes(g1)
-    nodes_g2 <- nodes(g2)
+kg_join.tbl_kgx <- function(graph1, graph2, ...) {
+    nodes_g1 <- nodes(graph1)
+    nodes_g2 <- nodes(graph2)
 
     # we don't want to keep the to and from columns in the edges, since we'll be rebuilding edges from scratch,
     # and be running them through unique()
-    edges_g1 <- edges(g1) |> select(-to, -from)
-    edges_g2 <- edges(g2) |> select(-to, -from)
+    edges_g1 <- edges(graph1) |> select(-to, -from)
+    edges_g2 <- edges(graph2) |> select(-to, -from)
 
     all_nodes <- unique(nodes_g1 |>
                           full_join(nodes_g2)) |>
@@ -53,6 +53,6 @@ kg_join.tbl_kgx <- function(g1, g2) {
     all_nodes <- all_nodes |>
       select(-idx)  # remove the idx column
 
-    res <- tbl_kgx(nodes = all_nodes, edges = filled_edges, attach_engine = get_engine(g1, fail_if_missing = FALSE))
+    res <- tbl_kgx(nodes = all_nodes, edges = filled_edges, attach_engine = get_engine(graph1, fail_if_missing = FALSE))
     return(res)
 }
diff --git a/R/knit_print.tbl_kgx.R b/R/knit_print.tbl_kgx.R
@@ -20,14 +20,18 @@ clean_df <- function(df) {
 
 
 #' Specialized print function for KGX graphs in knitted documents
-#'
+#' @param x A `tbl_kgx` graph to display.
+#' @param ... Other arguments (unused).
+#' @param show The maximum number of nodes and edges to display.
 #' @export
 #' @import knitr
 #' @importFrom kableExtra kable
 #' @importFrom kableExtra kable_styling
 #' @importFrom kableExtra column_spec
 #' @importFrom dplyr slice_head
-knit_print.tbl_kgx <- function(graph, show = 100, ...) {
+knit_print.tbl_kgx <- function(x, ..., show = 100) {
+	graph <- x
+
 	g <- order_cols(graph)
 
 	nodes_colnames <- colnames(nodes(g))

diff --git a/R/load_kgx.R b/R/load_kgx.R
@@ -0,0 +1,35 @@
+
+#' Load a graph from a KGX-formatted .tar.gz file.
+#'
+#' Given a KGX-formatted tabular KG
+#' (see https://github.com/biolink/kgx/blob/master/specification/kgx-format.md)
+#' loads it as a graph.
+#'
+#' @param filename File to the graph from. Must end in .tar.gz and conform to KGX specification (see description).
+#' @param attach_engine An engine to attach to the graph (optional).
+#' @param ... Other parameters (unused)
+#' @return A `tbl_kgx` graph.
+#' @export
+#' @examplesIf monarch_engine_check()
+#' phenos <- monarch_engine() |>
+#'           fetch_nodes(query_ids = "MONDO:0007525") |>
+#'           expand(predicates = "biolink:has_phenotype",
+#'                  categories = "biolink:PhenotypicFeature")
+#'
+#' save_kgx(phenos, "phenos.tar.gz")
+#'
+#' # when loading the graph, we can optionally attach an engine
+#' loaded_phenos <- load_kgx("phenos.tar.gz", attach_engine = monarch_engine())
+#'
+#' loaded_phenos
+#'
+#' # cleanup saved file
+#' file.remove("phenos.tar.gz")
+load_kgx <- function(filename, attach_engine = NULL, ...) {
+	e <- file_engine(filename)
+	g <- e$graph
+
+	attr(g, "last_engine") <- attach_engine
+	g <- order_cols(g)
+	return(g)
+}
diff --git a/R/monarch_semsim.R b/R/monarch_semsim.R
@@ -1,28 +1,28 @@
 #' Semantic similarity mapping between two graphs
-#' 
+#'
 #' This function calls the Monarch-hosted semantic similarity API to compare two
 #' graphs, via the same endpoints as the Monarch Phenotype Explorer:
 #' https://monarchinitiative.org/explore#phenotype-explorer.
-#' 
+#'
 #' The API returns the best matches between the nodes of the two graphs, based on
-#' a specified knowledge-graph-boased metric: the default is `"ancestor_information_content"`, 
+#' a specified knowledge-graph-boased metric: the default is `"ancestor_information_content"`,
 #' also available are `"jaccard_similarity"` and `"phenodigm_score"`. The result is
 #' returned as a graph, with `"computed:best_matches"` edges between the nodes of the two input graphs.
-#' 
+#'
 #' By default, the function only returns the best matches from the first graph to the second graph, and
 #' removes any nodes that do not have a match. If `include_reverse = TRUE`, the function also returns
 #' the best matches from the second graph to the first graph.
-#' 
+#'
 #' The engine attached to the return graph is that of the query.
-#' 
+#'
 #' @param query_graph A tbl_kgx graph.
 #' @param target_graph A tbl_kgx graph.
 #' @param metric The semantic similarity metric to use. Default is `"ancestor_information_content"`. Also available are `"jaccard_similarity"` and `"phenodigm_score"`.
 #' @param include_reverse Whether to include the best matches from the target graph to the query graph. Default is `FALSE`.
-#' @param keep_unmatched_targets Whether to keep nodes in the target graph that do not have a match. Default is `FALSE`.
+#' @param keep_unmatched Whether to keep nodes in the target graph that do not have a match. Default is `FALSE`.
 #' @return A tbl_kgx graph with `"computed:best_matches"` edges between the nodes of the two input graphs.
 #' @export
-#' 
+#'
 #' @importFrom httr POST content http_status
 #' @import tidygraph
 #' @import dplyr
@@ -42,23 +42,23 @@
 #' # also inclue the unmatched targets
 #' sim <- monarch_semsim(g1, g2, keep_unmatched = TRUE)
 #' print(sim)
-#' 
+#'
 #' # inclue reverse matches
 #' sim <- monarch_semsim(g1, g2, include_reverse = TRUE)
 #' print(sim)
-#' 
-monarch_semsim <- function(query_graph, 
-                           target_graph, 
-                           metric = "ancestor_information_content", 
-                           include_reverse = FALSE, 
+#'
+monarch_semsim <- function(query_graph,
+                           target_graph,
+                           metric = "ancestor_information_content",
+                           include_reverse = FALSE,
                            keep_unmatched = FALSE) {
     # check that the metric is valid
     assert_that(metric %in% c("ancestor_information_content", "jaccard_similarity", "phenodigm_score"), msg = "metric must be one of 'ancestor_information_content', 'jaccard_similarity', or 'phenodigm_score'")
 
     engine <- monarch_engine()
     api_url <- paste0(engine$preferences$monarch_api_url, "/semsim/compare")
 
-    # these are called subject_ids and object_ids in the API, but 
+    # these are called subject_ids and object_ids in the API, but
     # these don't relate to "subject" and "object" of a predicate
     # we use the APIs terminology here
     subject_ids <- nodes(query_graph)$id
@@ -140,4 +140,4 @@ monarch_semsim <- function(query_graph,
     g <- tbl_kgx(nodes = nodes_df, edges = keep_edges_df, attach_engine = get_engine(query_graph, fail_if_missing = FALSE))
 
     return(g)
-}
+}
diff --git a/R/order_cols.R b/R/order_cols.R
@@ -1,11 +1,16 @@
 #' Set edge/row data column order according to most recent engine preferences
-#'
+#' @param g A `tbl_kgx` graph.
 #' @import dplyr
 #' @import tidygraph
 order_cols <- function(g) {
 	e <- attr(g, "last_engine")
-	node_prefs <- e$preferences$node_property_priority
-	edge_prefs <- e$preferences$edge_property_priority
+	node_prefs <- c("id", "pcategory", "name")
+	edge_prefs <- c("subject", "predicate", "object")
+
+	if(!is.null(e)) {
+		node_prefs <- e$preferences$node_property_priority
+		edge_prefs <- e$preferences$edge_property_priority
+	}
 
 	current_node_names <- colnames(nodes(g))
 	used_prefs_node_names <- node_prefs[node_prefs %in% current_node_names]