From b45ce513a3f31832ea1420856ab6bd6ff413993a Mon Sep 17 00:00:00 2001
From: ytakemon <yuka.takemon@gmail.com>
Date: Thu, 3 Mar 2022 09:20:43 -0800
Subject: [PATCH] related to #24 #25 remove data documents

---
 GINIR_data_document.tar.gz     |  3 +++
 R/CCLE_exp.R                   | 10 --------
 R/CCLE_exp_annot.R             | 14 -----------
 R/copy_num.R                   | 10 --------
 R/copy_num_annot.R             | 16 ------------
 R/dep.R                        |  9 -------
 R/dep_annot.R                  | 14 -----------
 R/essential_genes.R            | 13 ----------
 R/gene_effect.R                |  9 -------
 R/list_available_cancer_type.R | 12 +++++----
 R/mut_calls.R                  | 46 ----------------------------------
 R/nonessential_genes.R         | 13 ----------
 R/protein.R                    |  9 -------
 R/protein_annot.R              | 12 ---------
 R/protein_nodup.R              |  9 -------
 R/sample_ARID1A_KO_screen.R    |  9 -------
 R/sample_annot.R               | 34 -------------------------
 17 files changed, 10 insertions(+), 232 deletions(-)
 create mode 100644 GINIR_data_document.tar.gz
 delete mode 100644 R/CCLE_exp.R
 delete mode 100644 R/CCLE_exp_annot.R
 delete mode 100644 R/copy_num.R
 delete mode 100644 R/copy_num_annot.R
 delete mode 100644 R/dep.R
 delete mode 100644 R/dep_annot.R
 delete mode 100644 R/essential_genes.R
 delete mode 100644 R/gene_effect.R
 delete mode 100644 R/mut_calls.R
 delete mode 100644 R/nonessential_genes.R
 delete mode 100644 R/protein.R
 delete mode 100644 R/protein_annot.R
 delete mode 100644 R/protein_nodup.R
 delete mode 100644 R/sample_ARID1A_KO_screen.R
 delete mode 100644 R/sample_annot.R

diff --git a/GINIR_data_document.tar.gz b/GINIR_data_document.tar.gz
new file mode 100644
index 0000000..c6ea7fe
--- /dev/null
+++ b/GINIR_data_document.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:69939720fd02b1a2f4ac48c1bf63adc55556774e4e396fa8836d918e7c7fbd5f
+size 2667
diff --git a/R/CCLE_exp.R b/R/CCLE_exp.R
deleted file mode 100644
index 63f82eb..0000000
--- a/R/CCLE_exp.R
+++ /dev/null
@@ -1,10 +0,0 @@
-#' @title DepMap 20Q1: Cancer cell line RNA-seq expression
-#'
-#' @description A data set containing the RNA-seq expression in transcripts per million (TPM) of 1279 cancer cell lines. 
-#' Data was generated by the Cancer Cell Line Encyclopedia (CCLE) and distributed by DepMap. 
-#' Details on data generation can be found in Ghandi, M., et al (2019) (https://www.nature.com/articles/s41586-019-1186-3).
-#' The data was obtained from 'CCLE_expression.csv' (see source url).
-#'
-#' @format A data frame with 1279 rows and 19145 variables: 
-#' @source \url{https://figshare.com/articles/dataset/DepMap_20Q1_Public/11791698}
-"CCLE_exp"
diff --git a/R/CCLE_exp_annot.R b/R/CCLE_exp_annot.R
deleted file mode 100644
index cc4e4e3..0000000
--- a/R/CCLE_exp_annot.R
+++ /dev/null
@@ -1,14 +0,0 @@
-#' @title DepMap 20Q1: Cancer cell line RNA-seq expression annotation
-#'
-#' @description A data set containing gene name column headers from the "CCLE_exp" data frame and
-#' its various forms that exist in the DepMap data set platform.
-#'
-#' @format A data frame with 19145 rows and 4 variables:
-#' \describe{
-#'   \item{\code{names}}{character "CCLE_exp" column names}
-#'   \item{\code{GeneNames}}{character Hugo symbols}
-#'   \item{\code{GeneID}}{character NCBI gene IDs}
-#'   \item{\code{GeneNameID}}{character Hugo symbol separated by NCBI gene ID} 
-#'}
-#' @source \url{https://figshare.com/articles/dataset/DepMap_20Q1_Public/11791698}
-"CCLE_exp_annot"
diff --git a/R/copy_num.R b/R/copy_num.R
deleted file mode 100644
index 51888cb..0000000
--- a/R/copy_num.R
+++ /dev/null
@@ -1,10 +0,0 @@
-#' @title DepMap 20Q1: Cancer cell line copy number
-#'
-#' @description A data set containing the copy number of 27640 genes in 1713 cancer cell lines. 
-#' Data was generated by the Cancer Cell Line Encyclopedia (CCLE) and distributed by DepMap. 
-#' Details on data generation can be found in Ghandi, M., et al (2019) (https://www.nature.com/articles/s41586-019-1186-3).
-#' The data was obtained from 'CCLE_expression.csv' (see source url).
-#'
-#' @format A data frame with 1713 rows and 27640 variables:
-#' @source \url{https://figshare.com/articles/dataset/DepMap_20Q1_Public/11791698}
-"copy_num"
diff --git a/R/copy_num_annot.R b/R/copy_num_annot.R
deleted file mode 100644
index aedabdc..0000000
--- a/R/copy_num_annot.R
+++ /dev/null
@@ -1,16 +0,0 @@
-#' @title DepMap 20Q1: Cancer cell line copy number annotations
-#'
-#' @description A data set containing the copy number of 27640 genes in 1713 cancer cell lines. 
-#' Data was generated by the Cancer Cell Line Encyclopedia (CCLE) and distributed by DepMap. 
-#' Details on data generation can be found in Ghandi, M., et al (2019) (https://www.nature.com/articles/s41586-019-1186-3).
-#' The data was obtained from 'CCLE_expression.csv' (see source url).
-#'
-#' @format A data frame with 27640 rows and 4 variables:
-#' \describe{
-#'   \item{\code{names}}{character "copy_num"}
-#'   \item{\code{GeneNames}}{character Hugo symbols}
-#'   \item{\code{GeneID}}{character NCBI gene IDs}
-#'   \item{\code{GeneNameID}}{character Hugo symbol separated by NCBI gene ID} 
-#'}
-#' @source \url{https://figshare.com/articles/dataset/DepMap_20Q1_Public/11791698}
-"copy_num_annot"
diff --git a/R/dep.R b/R/dep.R
deleted file mode 100644
index a6d310b..0000000
--- a/R/dep.R
+++ /dev/null
@@ -1,9 +0,0 @@
-#' @title DepMap 20Q1: Cancer cell line gene dependency probabilities
-#'
-#' @description A data set containing the gene dependency probabilities of 18334 genes in 739 cancer cell lines. 
-#' Details on data generation can be found in Meyers, RM., et al (2017).
-#' The data was obtained from 'Achilles_gene_dependency.csv' (see source url).
-#'
-#' @format A data frame with 19145 rows and 4 variables:
-#' @source \url{https://figshare.com/articles/dataset/DepMap_20Q1_Public/11791698}
-"dep"
diff --git a/R/dep_annot.R b/R/dep_annot.R
deleted file mode 100644
index 7b648d1..0000000
--- a/R/dep_annot.R
+++ /dev/null
@@ -1,14 +0,0 @@
-#' @title DepMap 20Q1: Cancer cell line gene dependency probabilities annotation
-#'
-#' @description A data set containing gene name column headers from the "dep" data frame and
-#' its various forms that exist in the DepMap data set platform.
-#'
-#' @format A data frame with 18334 rows and 4 variables:
-#' \describe{
-#'   \item{\code{names}}{character "dep" column names}
-#'   \item{\code{GeneNames}}{character Hugo symbols}
-#'   \item{\code{GeneID}}{character NCBI gene IDs}
-#'   \item{\code{GeneNameID}}{character Hugo symbol separated by NCBI gene ID} 
-#'}
-#' @source \url{https://figshare.com/articles/dataset/DepMap_20Q1_Public/11791698}
-"dep_annot"
diff --git a/R/essential_genes.R b/R/essential_genes.R
deleted file mode 100644
index 052eb8a..0000000
--- a/R/essential_genes.R
+++ /dev/null
@@ -1,13 +0,0 @@
-#' @title DepMap 20Q1: Gold standard essential genes list
-#' 
-#' @description A data set containing gold standard essential genes as described in Meyers, RM., et al (2017).
-#' The data was obtained from 'common_essentials.csv' (see source url).
-#' 
-#' @format A data frame with 2290 rows and 3 variables:
-#' \describe{
-#'   \item{\code{GeneNameID}}{character character Hugo symbol separated by NCBI gene ID}
-#'   \item{\code{GeneNames}}{character Hugo symbols}
-#'   \item{\code{GeneID}}{character NCBI gene IDs} 
-#'}
-#' @source \url{https://figshare.com/articles/dataset/DepMap_20Q1_Public/11791698}
-"essential_genes"
\ No newline at end of file
diff --git a/R/gene_effect.R b/R/gene_effect.R
deleted file mode 100644
index a89a85f..0000000
--- a/R/gene_effect.R
+++ /dev/null
@@ -1,9 +0,0 @@
-#' @title DepMap 20Q1: Cancer cell line gene KO effects
-#'
-#' @description A data set containing the gene KO effect of 18334 genes in 739 cancer cell lines. 
-#' Details on data generation can be found in Meyers, RM., et al (2017).
-#' The data was obtained from 'Achilles_gene_effect.csv' (see source url).
-#'
-#' @format A data frame with 739 rows and 18334 variables:
-#' @source \url{https://figshare.com/articles/dataset/DepMap_20Q1_Public/11791698}
-"gene_effect"
diff --git a/R/list_available_cancer_type.R b/R/list_available_cancer_type.R
index 1b08e93..2392b5d 100644
--- a/R/list_available_cancer_type.R
+++ b/R/list_available_cancer_type.R
@@ -4,7 +4,8 @@
 #' `list_available_cancer_types()` and `list_available_cancer_subtypes()` provide tools for identifying cancer (sub)types that are available in DepMap.
 #' 
 #' @return string A vector containing unique cancer types available
-#' 
+#'
+#' @param data_dir string Path to the directory containing the GINIR data files (must include `sample_annot.rda`)
 #' @import rlang
 #' @import dplyr
 #' @import utils
@@ -12,10 +13,10 @@
 #' @export
 #' @examples
 #' list_available_cancer_types()
-list_available_cancer_types <- function(){
+list_available_cancer_types <- function(data_dir){
   # Load necessary data
   sample_annot <- NULL # see: https://support.bioconductor.org/p/24756/
-  load(paste0(system.file(package = "GINIR"), "/data/sample_annot.rda"), envir = environment())
+  load(paste0(data_dir, "/sample_annot.rda"), envir = environment())
   
   # Main
   sample_annot %>% 
@@ -25,14 +26,15 @@ list_available_cancer_types <- function(){
 #' @describeIn list_available_cancer_types List cancer subtypes that are available
 #' 
 #' @param input_disease string A vector of unique with one or more cancer types listed in `list_available_cancer_types()`
+#' @param data_dir string Path to the directory containing the GINIR data files (must include `sample_annot.rda`)
 #' @importFrom rlang .data
 #' @export
 #' @examples
 #' list_available_cancer_subtypes("Lung Cancer")
-list_available_cancer_subtypes <- function(input_disease){
+list_available_cancer_subtypes <- function(input_disease, data_dir){
   # Load necessary data
   sample_annot <- NULL # see: https://support.bioconductor.org/p/24756/
-  load(paste0(system.file(package = "GINIR"), "/data/sample_annot.rda"), envir = environment())
+  load(paste0(data_dir, "/sample_annot.rda"), envir = environment())
   
   # Main
   sample_annot %>% 
diff --git a/R/mut_calls.R b/R/mut_calls.R
deleted file mode 100644
index bc842df..0000000
--- a/R/mut_calls.R
+++ /dev/null
@@ -1,46 +0,0 @@
-#' @title DepMap 20Q1: Cancer cell line RNA-seq expression
-#'
-#' @description A data set single nucleotide variations (SNVs), insertions and deletions (Indels) found in of 1697 cancer cell lines. 
-#' Data was generated by the Cancer Cell Line Encyclopedia (CCLE) and distributed by DepMap. 
-#' Details on data generation can be found in Ghandi, M., et al (2019) (https://www.nature.com/articles/s41586-019-1186-3).
-#' The data was obtained from 'CCLE_expression.csv' (see source url).
-#' 
-#' @format A data frame with 1279923 rows and 34 variables:
-#' \describe{
-#'   \item{\code{Hugo_Symbol}}{character}
-#'   \item{\code{Entrez_Gene_Id}}{double}
-#'   \item{\code{NCBI_Build}}{double}
-#'   \item{\code{Chromosome}}{character}
-#'   \item{\code{Start_position}}{double}
-#'   \item{\code{End_position}}{double}
-#'   \item{\code{Strand}}{character}
-#'   \item{\code{Variant_Classification}}{character}
-#'   \item{\code{Variant_Type}}{character}
-#'   \item{\code{Reference_Allele}}{character}
-#'   \item{\code{Tumor_Seq_Allele1}}{character}
-#'   \item{\code{dbSNP_RS}}{character}
-#'   \item{\code{dbSNP_Val_Status}}{character}
-#'   \item{\code{Genome_Change}}{character}
-#'   \item{\code{Annotation_Transcript}}{character}
-#'   \item{\code{Tumor_Sample_Barcode}}{character}
-#'   \item{\code{cDNA_Change}}{character}
-#'   \item{\code{Codon_Change}}{character}
-#'   \item{\code{Protein_Change}}{character}
-#'   \item{\code{isDeleterious}}{logical}
-#'   \item{\code{isTCGAhotspot}}{logical}
-#'   \item{\code{TCGAhsCnt}}{double}
-#'   \item{\code{isCOSMIChotspot}}{logical}
-#'   \item{\code{COSMIChsCnt}}{double}
-#'   \item{\code{ExAC_AF}}{character}
-#'   \item{\code{CGA_WES_AC}}{character}
-#'   \item{\code{SangerWES_AC}}{character}
-#'   \item{\code{SangerRecalibWES_AC}}{character}
-#'   \item{\code{RNAseq_AC}}{character}
-#'   \item{\code{HC_AC}}{character}
-#'   \item{\code{RD_AC}}{character}
-#'   \item{\code{WGS_AC}}{character}
-#'   \item{\code{Variant_annotation}}{character}
-#'   \item{\code{DepMap_ID}}{character} 
-#'}
-#' @details DETAILS
-"mut_calls"
\ No newline at end of file
diff --git a/R/nonessential_genes.R b/R/nonessential_genes.R
deleted file mode 100644
index 9b6ecf2..0000000
--- a/R/nonessential_genes.R
+++ /dev/null
@@ -1,13 +0,0 @@
-#' @title DepMap 20Q1: Common non-essential genes list
-#' 
-#' @description A data set containing common non-essential genes as described in Meyers, RM., et al (2017).
-#' The data was obtained from 'nonessentials.csv' (see source url).
-#' 
-#' @format A data frame with 2290 rows and 3 variables:
-#' \describe{
-#'   \item{\code{GeneNameID}}{character character Hugo symbol separated by NCBI gene ID}
-#'   \item{\code{GeneNames}}{character Hugo symbols}
-#'   \item{\code{GeneID}}{character NCBI gene IDs} 
-#'}
-#' @source \url{https://figshare.com/articles/dataset/DepMap_20Q1_Public/11791698}
-"nonessential_genes"
\ No newline at end of file
diff --git a/R/protein.R b/R/protein.R
deleted file mode 100644
index ecab1de..0000000
--- a/R/protein.R
+++ /dev/null
@@ -1,9 +0,0 @@
-#' @title DepMap 20Q1: Cancer cell line protein expression
-#'
-#' @description A data set containing the protein expression of 426 cancer cell lines. 
-#' Details on data generation can be found in Nusinow, DP., et al (2020) (https://www.sciencedirect.com/science/article/pii/S0092867419313856).
-#' The data was obtained from 'protein_quant_current_normalized.csv' (see source url).
-#'
-#' @format A data frame with 1279 rows and 19145 variables: 
-#' @source \url{https://figshare.com/articles/dataset/DepMap_20Q1_Public/11791698}
-"protein"
diff --git a/R/protein_annot.R b/R/protein_annot.R
deleted file mode 100644
index aa0597a..0000000
--- a/R/protein_annot.R
+++ /dev/null
@@ -1,12 +0,0 @@
-#' @title DepMap 20Q1: Cancer cell line protein expression annotation
-#'
-#' @description A data set containing cell line column headers from the "protein" data frame and
-#' its various forms that exist in the DepMap data set platform.
-#' 
-#'   @format A data frame with 426 rows and 2 variables:
-#' \describe{
-#'   \item{\code{GygiNames}}{character "protein" data frame cell line column names}
-#'   \item{\code{DepMap_ID}}{character Corresponding DepMap_ID} 
-#'}
-#' @source \url{https://figshare.com/articles/dataset/DepMap_20Q1_Public/11791698}
-"protein_annot"
diff --git a/R/protein_nodup.R b/R/protein_nodup.R
deleted file mode 100644
index d119937..0000000
--- a/R/protein_nodup.R
+++ /dev/null
@@ -1,9 +0,0 @@
-#' @title DepMap 20Q1: Cancer cell line protein expression with only the major isoform
-#'
-#' @description A data set containing the major protein isoform expression of 426 cancer cell lines. 
-#' Details on data generation can be found in Nusinow, DP., et al (2020) (https://www.sciencedirect.com/science/article/pii/S0092867419313856).
-#' The data was obtained from 'protein_quant_current_normalized.csv' (see source url).
-#'
-#' @format A data frame with 4893 rows and 426 variables: 
-#' @source \url{https://figshare.com/articles/dataset/DepMap_20Q1_Public/11791698}
-"protein_nodup"
diff --git a/R/sample_ARID1A_KO_screen.R b/R/sample_ARID1A_KO_screen.R
deleted file mode 100644
index 250a31f..0000000
--- a/R/sample_ARID1A_KO_screen.R
+++ /dev/null
@@ -1,9 +0,0 @@
-#' @title Sample GINI screen result for ARID1A KO pan cancer cell lines
-#'
-#' @description A sample data set containing the results of a screen conducted in cancer cell lines with
-#' loss-of-function alterations in ARID1A (HomDel) generated using `GINI_screen()`. As reported in previous literature
-#' ARID1A and ARID1B have synthetic lethal interaction (Helming et al.; doi: 10.1038/nm.3480; PMID:24562383), 
-#' thus gene with the highest interaction score is ARID1B. 
-#'
-#' @format A data frame with 18,333 rows and 17 variables:
-"sample_ARID1A_KO_screen"
diff --git a/R/sample_annot.R b/R/sample_annot.R
deleted file mode 100644
index 184279d..0000000
--- a/R/sample_annot.R
+++ /dev/null
@@ -1,34 +0,0 @@
-#' @title DepMap 20Q1: Cancer cell line sample annotations
-#' 
-#' @description A data set containing the cancer cell lines information generated by the Cancer Cell Line Encyclopedia (CCLE) and distributed by DepMap.
-#' The data was obtained from 'sample_info.csv' (see source url) and details on data generation can be found in Ghandi, M., et al (2019) (https://www.nature.com/articles/s41586-019-1186-3).
-#' 
-#' @format A data frame with 1775 rows and 24 variables: 
-#' \describe{
-#'   \item{\code{DepMap_ID}}{DepMap designated IDs used as a use key}
-#'   \item{\code{stripped_cell_line_name}}{character}
-#'   \item{\code{CCLE_Name}}{character}
-#'   \item{\code{alias}}{character}
-#'   \item{\code{COSMIC_ID}}{double}
-#'   \item{\code{lineage}}{character}
-#'   \item{\code{lineage_subtype}}{character}
-#'   \item{\code{lineage_sub_subtype}}{character}
-#'   \item{\code{lineage_molecular_subtype}}{character}
-#'   \item{\code{sex}}{character}
-#'   \item{\code{source}}{character}
-#'   \item{\code{Achilles_n_replicates}}{double}
-#'   \item{\code{cell_line_NNMD}}{double}
-#'   \item{\code{culture_type}}{character}
-#'   \item{\code{culture_medium}}{character}
-#'   \item{\code{cas9_activity}}{character}
-#'   \item{\code{RRID}}{character}
-#'   \item{\code{sample_collection_site}}{character}
-#'   \item{\code{primary_or_metastasis}}{character}
-#'   \item{\code{disease}}{character}
-#'   \item{\code{disease_subtype}}{character}
-#'   \item{\code{age}}{double}
-#'   \item{\code{Sanger_model_ID}}{character}
-#'   \item{\code{additional_info}}{character} 
-#' }
-#' @source \url{https://figshare.com/articles/dataset/DepMap_20Q1_Public/11791698}
-"sample_annot"