diff --git a/workflow/rules/de_analysis.smk b/workflow/rules/de_analysis.smk index 2bc2c4e..f5cd90c 100644 --- a/workflow/rules/de_analysis.smk +++ b/workflow/rules/de_analysis.smk @@ -1,8 +1,8 @@ rule de_analysis: input: - f"{OUTPUT}/Quantification/dea_input.tsv" + f"{OUTPUT}/Quantification/dea_input.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_normalized.tsv" output: - f"{OUTPUT}/DE_analysis/condition_treated_results.tsv", + f"{OUTPUT}/DE_analysis/condition_treated_results.tsv" threads: 1 params: diff --git a/workflow/scripts/de_analysis.R b/workflow/scripts/de_analysis.R index 792c465..87e4312 100644 --- a/workflow/scripts/de_analysis.R +++ b/workflow/scripts/de_analysis.R @@ -79,7 +79,6 @@ if(snakemake@params$datatype == "rna_seq") { library("ROTS") library("tibble") # Reproducibility-Optimized Test Statistics - total <- log2(total[!(rowSums(total == 0) > 0), ]) de <- ROTS(data=total, groups=conditions, progress=TRUE) de_res <- cbind(de$logfc, de$pvalue, de$FDR) colnames(de_res) <- c("log2FoldChange", "pvalue", "FDR") @@ -95,19 +94,33 @@ if(snakemake@params$datatype == "rna_seq") { file=paste0(snakemake@params$output, '/report.txt'), append=TRUE) } + sorted_data <- head(as.matrix(total[order(de$pvalue), ]), 30) #summary(de, fdr = 0.05) - for (ptype in c('volcano', 'heatmap', 'ma', 'reproducibility', 'pvalue', 'pca')){ + for (ptype in c('volcano', 'heatmap', 'ma', 'reproducibility', 'pvalue', 'pca')) { print(paste0("Plotting ", ptype, " plot.")) - jpeg(paste0(snakemake@params$output, "/", ptype, ".jpeg"), res=300) + jpeg_file <- paste0(ptype, ".jpeg") - if (ptype == 'heatmap') { - # heatmap with legend requires this call, also arranges the data better - sorted_data <- total[order(de$pvalue), ] # total is always equal to de$data - heatmap(sorted_data, Rowv = NA, Colv = NA, col = colorRampPalette(c("blue", "white", "red"))(256), legend = TRUE) - } else { - plot(de, type = ptype, fdr = snakemake@params$fdr) - } - dev.off() + tryCatch({ + if (ptype == 'heatmap') { + pheatmap(sorted_data, Rowv = NA, Colv = NA, col = colorRampPalette(c("blue", "white", "red"))(256), legend = TRUE) + } else { + jpeg(jpeg_file, res = 300) + plot(de, type = ptype, fdr = 0.05) + dev.off() + } + }, error = function(e) { + tryCatch({ + if (ptype == 'heatmap') { + pheatmap(sorted_data, Rowv = NA, Colv = NA, col = colorRampPalette(c("blue", "white", "red"))(256), legend = TRUE) + } else { + jpeg(jpeg_file) # If plot generation fails due to margin size, try without specifying res + plot(de, type = ptype, fdr = 0.05) + dev.off() + } + }, error = function(inner_e) { + cat("Error in plotting", ptype, "plot:", conditionMessage(inner_e), "\n") + }) + }) } } diff --git a/workflow/scripts/general_report.py b/workflow/scripts/general_report.py index 6afa31e..cb0080e 100644 --- a/workflow/scripts/general_report.py +++ b/workflow/scripts/general_report.py @@ -52,7 +52,7 @@ def make_general_report(out, exps, sample, mg_preport, mt_preport, mp_preport, d de_input, pd.read_csv(f'{out}/Quantification/{sample}_mt.readcounts', sep='\t').rename( columns={'Gene': 'Entry'}), on='Entry', how='outer') if len(mp_names) > 0: - spectracounts = pd.read_csv(f'{out}/Metaproteomics/{sample}/spectracounts.tsv', sep='\t') + spectracounts = pd.read_csv(f'{out}/Metaproteomics/{sample}_mp.spectracounts', sep='\t') spectracounts.rename(columns={'Main Accession': 'qseqid'}, inplace=True) report = pd.merge(report, spectracounts, on='qseqid', how='left') mp_preport = pd.merge(mp_preport, report[['Entry'] + mp_names], on='Entry', how='outer') @@ -96,9 +96,9 @@ def make_general_reports(out, exps, max_lines=1000000): mp_report = mp_report.groupby('Entry')[mp_report.columns.tolist()[1:]].sum().reset_index() mp_report[mp_report[mp_report.columns.tolist()[1:]].isnull().sum( axis=1) < len(mp_report.columns.tolist()[1:])].drop_duplicates().to_csv( - f'{out}/Metaproteomics/mp_entry_quant', sep='\t', index=False) + f'{out}/Metaproteomics/mp_entry_quant.tsv', sep='\t', index=False) # in the case of MP, there is no normalization by protein size. So it can be done this way - shutil.copyfile(f'{out}/Metaproteomics/mp_entry_quant', f'{out}/Quantification/dea_input.tsv') + shutil.copyfile(f'{out}/Metaproteomics/mp_entry_quant.tsv', f'{out}/Quantification/dea_input.tsv') def run(): diff --git a/workflow/scripts/metaproteomics.py b/workflow/scripts/metaproteomics.py index 862d8c3..6acdeff 100644 --- a/workflow/scripts/metaproteomics.py +++ b/workflow/scripts/metaproteomics.py @@ -300,7 +300,6 @@ def select_proteins_for_second_search(self, original_db, output, results_files, output=f'{output}/2nd_search_database.fasta') def run(self): - Path(snakemake.params.output).mkdir(parents=True, exist_ok=True) # 1st database construction self.database_generation( @@ -355,14 +354,14 @@ def run(self): for name in snakemake.params.names: out = f'{snakemake.params.output}/{name}' self.compomics_run( - f'{snakemake.params.output}/2nd_search_database_target_decoy.fasta', f'{out}/2nd_search', - f'{out}/spectra', name, f'{snakemake.params.output}/2nd_params.par', + f'{snakemake.params.output}/2nd_search_database_concatenated_target_decoy.fasta', + f'{out}/2nd_search', f'{out}/spectra', name, f'{snakemake.params.output}/2nd_params.par', threads=snakemake.threads, max_memory=snakemake.params.max_memory, reports=['9', '10']) spectracounts = pd.merge(spectracounts, pd.read_csv( f'{out}/2nd_search/reports/{name}_Default_Protein_Report.txt', sep='\t', index_col=0 )[['Main Accession', '#PSMs']].rename(columns={'#PSMs': name}), how='outer', on='Main Accession') spectracounts.groupby('Main Accession')[snakemake.params.names].sum().reset_index().fillna(value=0.0).to_csv( - f'{snakemake.params.output}/{snakemake.wildcards.sample}_mp_spectracounts.tsv', sep='\t', index=False) + f'{snakemake.params.output}_mp.spectracounts', sep='\t', index=False) if __name__ == '__main__': diff --git a/workflow/scripts/normalization.R b/workflow/scripts/normalization.R index babc147..89292fb 100644 --- a/workflow/scripts/normalization.R +++ b/workflow/scripts/normalization.R @@ -17,6 +17,7 @@ normalize_gene_expression <- function(df, output_file, norm_method, imput_method library("vsn") library("pcaMethods") df[df == 0] <- NA + df <- df[rowSums(is.na(df)) < ncol(df), ] norm <- exprs(justvsn(ExpressionSet(df))) if (imput_method == "LLS") { print("Performing Local Least Squares Imputation.")