d3b-center · sangeetashukla · Jun 22, 2021 · Jun 22, 2021 · Jun 24, 2021 · Jul 15, 2021
diff --git a/analyses/DESeq_analysis/Uberon-map.tsv b/analyses/DESeq_analysis/Uberon-map.tsv
@@ -0,0 +1,55 @@
+Tissue_Subtype	Uberon	Tissue_Group	Uberon2
+Adipose - Subcutaneous	UBERON_0002190	Adipose	UBERON_0001013
+Adipose - Visceral (Omentum)	UBERON_0010414	Adipose	UBERON_0001013
+Adrenal Gland	UBERON_0002369	Adrenal Gland	UBERON_0002369
+Artery - Aorta	UBERON_0001496	Artery	UBERON_0001637
+Artery - Coronary	UBERON_0001621	Artery	UBERON_0001637
+Artery - Tibial	UBERON_0007610	Artery	UBERON_0001637
+Bladder	UBERON_0001255	Bladder	UBERON_0018707
+Brain - Amygdala	UBERON_0001876	Brain	UBERON_0000955
+Brain - Anterior cingulate cortex (BA24)	UBERON_0009835	Brain	UBERON_0000955
+Brain - Caudate (basal ganglia)	UBERON_0001873	Brain	UBERON_0000955
+Brain - Cerebellar Hemisphere	UBERON_0002037	Brain	UBERON_0000955
+Brain - Cerebellum	UBERON_0002037	Brain	UBERON_0000955
+Brain - Cortex	UBERON_0001870	Brain	UBERON_0000955
+Brain - Frontal Cortex (BA9)	UBERON_0009834	Brain	UBERON_0000955
+Brain - Hippocampus	UBERON_0001954	Brain	UBERON_0000955
+Brain - Hypothalamus	UBERON_0001898	Brain	UBERON_0000955
+Brain - Nucleus accumbens (basal ganglia)	UBERON_0001882	Brain	UBERON_0000955
+Brain - Putamen (basal ganglia)	UBERON_0001874	Brain	UBERON_0000955
+Brain - Spinal cord (cervical c-1)	UBERON_0006469	Brain	UBERON_0000955
+Brain - Substantia nigra	UBERON_0002038	Brain	UBERON_0000955
+Breast - Mammary Tissue	UBERON_0008367	Breast	UBERON_0000310
+Cells - Cultured fibroblasts	EFO_0002009	Cells	
+Cells - EBV-transformed lymphocytes	EFO_0000572	Cells	
+Cervix - Ectocervix	UBERON_0012249	Cervix	UBERON_0000002
+Cervix - Endocervix	UBERON_0000458	Cervix	UBERON_0000002
+Colon - Sigmoid	UBERON_0001159	Colon	UBERON_0001155
+Colon - Transverse	UBERON_0001157	Colon	UBERON_0001155
+Esophagus - Gastroesophageal Junction	UBERON_0004550	Esophagus	UBERON_0001043
+Esophagus - Mucosa	UBERON_0006920	Esophagus	UBERON_0001043
+Esophagus - Muscularis	UBERON_0004648	Esophagus	UBERON_0001043
+Fallopian Tube	UBERON_0003889	Fallopian Tube	UBERON_0003889
+Heart - Atrial Appendage	UBERON_0006631	Heart	UBERON_0000948
+Heart - Left Ventricle	UBERON_0006566	Heart	UBERON_0000948
+Kidney - Cortex	UBERON_0001225	Kidney	UBERON_0002113
+Kidney - Medulla	UBERON_0001293	Kidney	UBERON_0002113
+Liver	UBERON_0001114	Liver	UBERON_0002107
+Lung	UBERON_0008952	Lung	UBERON_0002048
+Minor Salivary Gland	UBERON_0006330	Minor Salivary Gland	UBERON_0001830
+Muscle - Skeletal	UBERON_0011907	Muscle	
+Nerve - Tibial	UBERON_0001323	Nerve	UBERON_0001021
+Ovary	UBERON_0000992	Ovary	UBERON_0000992
+Pancreas	UBERON_0001150	Pancreas	UBERON_0001264
+Pituitary	UBERON_0000007	Pituitary	UBERON_0000007
+Prostate	UBERON_0002367	Prostate	UBERON_0002367
+Skin - Not Sun Exposed (Suprapubic)	UBERON_0036149	Skin	UBERON_0001003
+Skin - Sun Exposed (Lower leg)	UBERON_0004264	Skin	UBERON_0001003
+Small Intestine - Terminal Ileum	UBERON_0001211	Small Intestine	UBERON_0002108
+Spleen	UBERON_0002106	Spleen	UBERON_0002106
+Stomach	UBERON_0000945	Stomach	UBERON_0000945
+Testis	UBERON_0000473	Testis	UBERON_0000473
+Thyroid	UBERON_0002046	Thyroid	UBERON_0002046
+Uterus	UBERON_0000995	Uterus	UBERON_0000995
+Vagina	UBERON_0000996	Vagina	UBERON_0000996
+Whole Blood	UBERON_0013756	Whole Blood	UBERON_0000178
@@ -0,0 +1,200 @@
+# Author: Sangeeta Shukla and Alvin Farrel
+# Date: June 2021
+# Function: 
+# 1. summarize Differential expression from RNASeq data
+# 2. tabulate corresponding P-value
+
+# Example run: DESeq
+# Rscript analyses/DESeq/run-DESeq-analysis.R 1 1
+# This was compares 1 cancer group (Combined or Cohort specific) with one GTEx tissue type. The input arguments are the indices of the comparison of the cancer groups and gtex sub tissues to be compared. In v6, there are 107 cancer groups vs 54 GTEx tissues
+
+# Read arguments from terminal
+args <- commandArgs(trailingOnly = TRUE)
+
+# Load required libraries
+suppressPackageStartupMessages({
+  library(ggplot2)
+  library(DESeq2)
+})
+
+
+HIST_index <- args[1]   #assign first argument to Histology index
+GTEX_index <- args[2]   #assign second argument to GTEx index
+
+
+
+
+
+#Load histology file
+hist <- read.delim("../data/v6/histologies.tsv", header=TRUE, sep = '\t')
+
+#Load expression counts data
+countData <- readRDS("../data/v6/gene-counts-rsem-expected_count-collapsed.rds")
+
+#Load expression TPM data
+TPMData <- readRDS("../data/v6/gene-expression-rsem-tpm-collapsed.rds")
+
+#Load EFO-MONDO map file
+EFO_MONDO <- read.delim("../data/v6/efo-mondo-map.tsv", header =T, stringsAsFactors = FALSE)
+
+#Load gene symbol-gene ID RMTL file
+ENSG_Hugo <- read.delim("../data/v6/ensg-hugo-rmtl-v1-mapping.tsv", header =T)
+
+# Subset Histology file for samples only found in the current the countData file (To ensure no discepancies cause errors later in the code)
+hist.filtered <- unique(hist[which(hist$Kids_First_Biospecimen_ID %in%  colnames(countData)),])
+
+#Subset countdata for data that are present in the hitstoly files (To ensure no discepancies cause errors later in the code)
+countData_filtered <- countData[,which(colnames(countData) %in% hist$Kids_First_Biospecimen_ID)]
+
+#Subset countadata for data that are present in the hitstoly files (To ensure no discepancies cause errors later in the code)
+TPMData_filtered <- TPMData[,which(colnames(TPMData) %in% hist$Kids_First_Biospecimen_ID)]
+
+#Save all the unique cancer histologies in a variable. These cancer histologies represent the patient data in the countsdata
+Cancer_Histology <- unique(hist.filtered$cancer_group)
+
+#Save all the GTEx tissue subgroups in a variable. These cancer histologies represent the GTEx RNDA data available in the countsdata
+Gtex_Tissue_subgroup <- sort(unique(hist.filtered$gtex_subgroup))
+
+#Save all the cohorts represented in the countsdata into a variable. Remove all 'NA's from the list. 
+#And paste cohort to cancer groep (eg GMKF_Neuroblastoma)
+Cancer_Histology_COHORT <- unique(
+  paste(hist.filtered$cohort[which(!is.na(hist.filtered$cancer_group))],
+        hist.filtered$cancer_group[which(!is.na(hist.filtered$cancer_group))],
+        sep="_")
+  )
+
+#Save all the histologies represented in the countsdata into a variable. 
+#Remove all 'NA's from the list. 
+#This will be the basis of all the data from each histology combined regardless of cohort (eg all_cohorts_Neuroblastoma)
+Cancer_Histology <- paste("all_cohorts",Cancer_Histology[which(!is.na(Cancer_Histology))],sep="_")
+
+
+#Save all the GTEx subgroups represented in the countsdata into a variable. Remove all 'NA's 
+Gtex_Tissue_subgroup <- Gtex_Tissue_subgroup[!is.na(Gtex_Tissue_subgroup)]
+
+#Create an empty df to populate with rbind of all normal Kids_First_Biospecimen_ID and gtex_subgroup
+#Create DF that list all Kids_First_Biospecimen_IDs by GTEX subgroup
+sample_type_df_normal <- data.frame()
+for(I in 1:length(Gtex_Tissue_subgroup))
+{
+  sample_type_df_normal <- rbind(sample_type_df_normal,
+                                 data.frame(Case_ID = hist.filtered$Kids_First_Biospecimen_ID[which(hist.filtered$gtex_subgroup == Gtex_Tissue_subgroup[I])],Type = Gtex_Tissue_subgroup[I]), stringsAsFactors = FALSE)
+}
+
+#Create an empty df to populate with rbind of all tumor Kids_First_Biospecimen_ID and cancer_group
+#Create DF that list all Kids_First_Biospecimen_IDs by cancer group subgroup
+sample_type_df_tumor <- data.frame()
+for(I in 1:length(Cancer_Histology))
+{
+  sample_type_df_tumor <- rbind(sample_type_df_tumor,data.frame(Case_ID = hist.filtered$Kids_First_Biospecimen_ID[which(hist.filtered$cancer_group == gsub("all_cohorts_","",Cancer_Histology[I]))],Type=Cancer_Histology[I], stringsAsFactors = FALSE))
+}
+
+#Create an empty df to populate with rbind of all tumor Kids_First_Biospecimen_ID and cancer_group by cohort
+#Create DF that list all Kids_First_Biospecimen_IDs by Cohort - Cancer groups
+sample_type_df_tumor_cohort <- data.frame()
+for(I in 1:length(Cancer_Histology_COHORT))
+{
+  Cancer_Histology_COHORT_cohort <- strsplit(Cancer_Histology_COHORT[I],split="_")[[1]][1]
+  Cancer_Histology_COHORT_cancer_group <- strsplit(Cancer_Histology_COHORT[I],split="_")[[1]][2]
+  sample_type_df_tumor_cohort <- rbind(sample_type_df_tumor_cohort,
+                                       data.frame(Case_ID = hist.filtered$Kids_First_Biospecimen_ID[which(hist.filtered$cancer_group == Cancer_Histology_COHORT_cancer_group & hist.filtered$cohort == Cancer_Histology_COHORT_cohort)],Type=Cancer_Histology_COHORT[I], stringsAsFactors = FALSE))
+}
+
+#Combine the rows from the normal and tumor sample df
+sample_type_df <- rbind(sample_type_df_tumor,sample_type_df_tumor_cohort,sample_type_df_normal)
+
+#Filter one more to ensure the rownames in the countsdata file match the sample dataframe for DEG just created
+countData_filtered_DEG <- countData_filtered[,which(colnames(countData_filtered) %in% sample_type_df$Case_ID)]
+
+#Filter one more to ensure the rownames in the countsdata file match the sample dataframe for DEG just created
+sample_type_df_filtered <- unique(sample_type_df[which(sample_type_df$Case_ID %in% colnames(countData_filtered_DEG)),])
+
+#Define All cancer groups (Combined and cohort-specific) in the histology list
+histology_filtered <- unique(sample_type_df_filtered$Type[-grep("^GTEX",sample_type_df_filtered$Case_ID)])
+
+#Define All GTEx groups as normal in the GTEX_filtered list
+GTEX_filtered <- unique(sample_type_df_filtered$Type[grep("^GTEX",sample_type_df_filtered$Case_ID)])
+
+#Assign cmparison
+ I <- as.numeric(HIST_index)   #assign first argument to Histology index
+ J <- as.numeric(GTEX_index)   #assign second argument to GTEx index
+
+
+
+#Subset countData_filtered_DEG dataframe for only the histology group and GTEx group being compared
+    countData_filtered_DEG.hist <- data.matrix(countData_filtered_DEG[,which(colnames(countData_filtered_DEG) %in% sample_type_df_filtered$Case_ID[which(sample_type_df_filtered$Type %in% c(histology_filtered[I],GTEX_filtered[J]))])])
+
+#Subset sample type dataframe for only the histology group and GTEx group being compared
+    sample_type_df_filtered.hist <- sample_type_df_filtered[which(sample_type_df_filtered$Type %in% c(histology_filtered[I],GTEX_filtered[J])),]
+
+#Run DESeq2  
+    sub.deseqdataset <- DESeqDataSetFromMatrix(countData <- round(countData_filtered_DEG.hist),
+                                               colData <- sample_type_df_filtered.hist,
+                                               design <- ~ Type)
+
+
+    sub.deseqdataset$Type <- factor(sub.deseqdataset$Type, levels=c(GTEX_filtered[J], histology_filtered[I]))
+
+    dds <- DESeq(sub.deseqdataset)
+    res <- results(dds)
+    resOrdered <- res[order(rownames(res)),]
+
+    Result <- resOrdered
+
+
+#Save subset of table with samples representing the histology in the DEG comparison to a variable.
+HIST_sample_type_df_filtered <- sample_type_df_filtered[which(sample_type_df_filtered$Type %in% c(histology_filtered[I])),]
+
+#Define study ID as all cohorts represented by the pateints involved in DEG comparison
+STUDY_ID <- paste(unique(hist$cohort[which(hist$Kids_First_Biospecimen_ID %in% HIST_sample_type_df_filtered$Case_ID)]),collapse=";",sep=";")
+
+#Record number of samples represnt the GTEX tissue used in the comparison
+GTEX_Hits <- length(sample_type_df_filtered[which(sample_type_df_filtered$Type %in% c(GTEX_filtered[J])),1])
+
+#Determine the mean TPM of the tissue. If there are multiple samples use the mean TPM
+if(GTEX_Hits > 1) GTEX_MEAN_TPMs = round(apply(TPMData_filtered[match(rownames(Result),rownames(TPMData_filtered)),which(colnames(TPMData_filtered) %in%  sample_type_df_filtered[which(sample_type_df_filtered$Type %in% c(GTEX_filtered[J])),1])]  ,MARGIN=1, mean ),2)
+
+#Determine the mean TPM of the tissue. If there is one sample just use the single TPM value of the sample
+if(GTEX_Hits <= 1) GTEX_MEAN_TPMs = TPMData_filtered[match(rownames(Result),rownames(TPMData_filtered)),which(colnames(TPMData_filtered) %in%  sample_type_df_filtered[which(sample_type_df_filtered$Type %in% c(GTEX_filtered[J])),1])]
+
+#Record number of samples represnt the cancer patient data used in the comparison
+Cancer.Hist_Hits <- length(sample_type_df_filtered[which(sample_type_df_filtered$Type %in% c(histology_filtered[I])),1])
+
+#Determine the mean TPM of the tissue. If there are multiple samples use the mean TPM
+if(Cancer.Hist_Hits > 1) Histology_MEAN_TPMs <- round(apply(TPMData_filtered[match(rownames(Result),rownames(TPMData_filtered)),which(colnames(TPMData_filtered) %in%  sample_type_df_filtered[which(sample_type_df_filtered$Type %in% c(histology_filtered[I])),1])]  ,MARGIN=1, mean ),2)
+
+#Determine the mean TPM of the tissue. If there is one sample just use the single TPM value of the sample
+if(Cancer.Hist_Hits <= 1) Histology_MEAN_TPMs <- TPMData_filtered[match(rownames(Result),rownames(TPMData_filtered)),which(colnames(TPMData_filtered) %in%  sample_type_df_filtered[which(sample_type_df_filtered$Type %in% c(histology_filtered[I])),1])]
+
+#Round the mean TPMs to the 2 decimal places
+Histology_MEAN_TPMs <- round(Histology_MEAN_TPMs,2)
+GTEX_MEAN_TPMs <- round(GTEX_MEAN_TPMs,2)
+
+
+#Create Final Dataframe with all the info calculated and extracted from histology file Including EFO/MONDO codes where available and RMTL status
+Final_Data_Table <- data.frame(
-Final_Data_Table <- data.frame(
+Final_Data_Table <- data.frame(
-Final_Data_Table <- data.frame(
+Final_Data_Table <- data.frame(
+  datasourceId <- paste(strsplit(histology_filtered[I],split="_")[[1]][1],"vs_GTex",sep="_"),
+  datatypeId <- "rna_expression",
+  cohort <- paste(unique(hist$cohort[which(hist$Kids_First_Biospecimen_ID %in% HIST_sample_type_df_filtered$Case_ID)]),collapse=";",sep=";"),
+  gene_symbol <- rownames(Result),
+  gene_id <- ENSG_Hugo$ensg_id[match(rownames(Result),ENSG_Hugo$gene_symbol)],
+  RMTL <- ENSG_Hugo$rmtl[match(rownames(Result),ENSG_Hugo$gene_symbol)],
+  EFO <- ifelse(length(which(EFO_MONDO$cancer_group == unique(hist$cancer_group[which(hist$Kids_First_Biospecimen_ID %in% HIST_sample_type_df_filtered$Case_ID)]))) >= 1, EFO_MONDO$efo_code[which(EFO_MONDO$cancer_group == unique(hist$cancer_group[which(hist$Kids_First_Biospecimen_ID %in% HIST_sample_type_df_filtered$Case_ID)]))], "" ),
+  MONDO <- ifelse(length(which(EFO_MONDO$cancer_group == unique(hist$cancer_group[which(hist$Kids_First_Biospecimen_ID %in% HIST_sample_type_df_filtered$Case_ID)]))) >= 1,EFO_MONDO$mondo_code[which(EFO_MONDO$cancer_group == unique(hist$cancer_group[which(hist$Kids_First_Biospecimen_ID %in% HIST_sample_type_df_filtered$Case_ID)]))],""),
+  comparisonId <- gsub(" |/|;|:|\\(|)","_",paste(histology_filtered[I],GTEX_filtered[J],sep="_v_")),
+  cancer_group <- paste(unlist(strsplit(histology_filtered[I],split="_"))[-1],collapse=" "),
+  cancer_group_Count <- Cancer.Hist_Hits,
+  GTEx <- GTEX_filtered[J],
+  GTEx_Count <- GTEX_Hits,
+  cancer_group_MeanTpm <- Histology_MEAN_TPMs,
+  GTEx_MeanTpm <- GTEX_MEAN_TPMs,
+  Result, stringsAsFactors = FALSE
+)#Final_Data_Table = data.frame(
+
+
+#Save files
+system("mkdir DESeq_analysis/Results/")
+
+#Define file name as Histoloy_v_Gtex.tsv and replacing all 'special symbols' with '_' for the filename
+FILENAME <- gsub(" |/|;|:|\\(|)","_",paste(histology_filtered[I],GTEX_filtered[J],sep="_v_"))
+write.table(Final_Data_Table,file=paste("DESeq_analysis/Results/Results_",FILENAME,".tsv",sep=""),sep="\t",col.names = T, row.names = F,quote = F)