diff --git a/analyses/cnv-frequencies/01-cnv-frequencies.py b/analyses/cnv-frequencies/01-cnv-frequencies.py new file mode 100644 index 0000000000..c447a466b0 --- /dev/null +++ b/analyses/cnv-frequencies/01-cnv-frequencies.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python + + +""" +01-cnv-frequencies.py +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Functions to create copy number variation (CNV) cancer type and study gene-level frequencies for OPenPedCan analyses modules +""" + + +__author__ = ('Eric Wafula (wafulae@chop.edu)') +__version__ = '1.0' +__date__ = '12 July 2021' + + +import os +import sys +import csv +import json +import argparse +import subprocess +import numpy as np +import pandas as pd +from functools import reduce +from collections import OrderedDict + + +def read_parameters(): + p = argparse.ArgumentParser(description=("The 01-snv-frequencies.py scripts creates copy number variation (CNV) cancer type and study gene-level alterations frequencies table for the OPenPedCan analyses modules."), formatter_class=argparse.RawTextHelpFormatter) + p.add_argument('HISTOLOGY_FILE', type=str, default=None, help="OPenPedCan histology file (histologies.tsv)\n\n") + p.add_argument('CNV_FILE', type=str, default=None, help="OPenPedCan CNV consensus file (consensus_wgs_plus_cnvkit_wxs_autosomes.tsv.gz and consensus_wgs_plus_cnvkit_wxs_x_and_y.tsv.gz)\n\n") + p.add_argument('PRIMARY_TUMORS', type=str, default=None, help="OPenPedCan independent primary tumor samples file (independent-specimens.wgs.primary.eachcohort.tsv)\n\n") + p.add_argument('RELAPSE_TUMORS', type=str, default=None, help="OPenPedCan independent relapse tumor samples file (independent-specimens.wgs.relapse.eachcohort.tsv)\n\n") + p.add_argument('-v', '--version', action='version', version="classify_reads.py version {} ({})".format(__version__, __date__), help="Print the current 01-cnv-frequencies.py version and exit\n\n") + return p.parse_args() + + +def merge_histology_and_cnv_data(histology_file, cnv_consensus_file): + # load histology file + histology_df = pd.read_csv(histology_file, sep="\t", na_filter=False, dtype=str) + + # subset histology dataframe for relevant columns + histology_df = histology_df[["Kids_First_Biospecimen_ID","Kids_First_Participant_ID", "cohort", "cancer_group", "sample_type"]] + + # load CNV consensus file + cnv_df = pd.read_csv(cnv_consensus_file, sep="\t", dtype=str) + + # merge subset of histology dataframe to CNV dataframe keeping only sample present in the CNV table (left outer join) + merged_df = pd.merge(cnv_df, histology_df, how="left", left_on="biospecimen_id", right_on="Kids_First_Biospecimen_ID") + + # check if non tumor sample are present in the merged dataframe + if not np.array_equal(np.array(["Tumor"]), merged_df.sample_type.unique()): + raise Exception("Merged hsitology-CNV dataframe contains non tumor samples") + + # check and drop unknown cancer types (cancer_group == NA) + row_indices = merged_df[(merged_df["cancer_group"] == "NA")].index + merged_df.drop(row_indices, inplace=True) + + # select and reorder relevant columns + all_tumors_df = merged_df[["gene_symbol", "ensembl", "status", "copy_number", "ploidy", "cohort", "cancer_group", "Kids_First_Biospecimen_ID", "Kids_First_Participant_ID"]].reset_index() + return(all_tumors_df) + + +def get_cancer_groups_and_cohorts(all_tumors_df): + # group samples by cancer_group and cohort + cancer_group_cohort_df = all_tumors_df.groupby(["cancer_group", "cohort"])["Kids_First_Biospecimen_ID"].nunique().reset_index() + cancer_group_cohort_df.columns = ["cancer_group", "cohort", "num_samples"] + + # group samples by cancer_group only + def func(x): + d = {} + cohort_list = x["cohort"].unique() + if len(cohort_list) > 1: + d["cohort"] = "all_cohorts" + else: + d["cohort"] = cohort_list[0] + d["num_samples"] = x["Kids_First_Biospecimen_ID"].nunique() + return pd.Series(d, index=["cohort", "num_samples"]) + cancer_group_df = all_tumors_df.groupby(["cancer_group"]).apply(func).reset_index() + + # concat cancer_group_cohort_df and cancer_group_df and rremove duplicates + cancer_group_cohort_df = pd.concat([cancer_group_df, cancer_group_cohort_df], sort=False, ignore_index=True) + cancer_group_cohort_df.drop_duplicates(inplace=True) + # these subset dataframe is for testing and need to comment out + #cancer_group_cohort_df = cancer_group_cohort_df[cancer_group_cohort_df.cancer_group.isin(["CNS Embryonal tumor", "Atypical Teratoid Rhabdoid Tumor"])] + return(cancer_group_cohort_df) + + +def compute_variant_frequencies(all_tumors_df, primary_tumors_file, relapase_tumors_file, cancer_group_cohort_df): + # get independent primary tumor samples and subset from all tumor sample dataframe + tumor_dfs = {"all_tumors": all_tumors_df} + primary_tumors_samples_list = list(pd.read_csv(primary_tumors_file, sep="\t", dtype=str)["Kids_First_Biospecimen_ID"].unique()) + primary_tumors_df = all_tumors_df[all_tumors_df.Kids_First_Biospecimen_ID.isin(primary_tumors_samples_list)].reset_index() + tumor_dfs["primary_tumors"] = primary_tumors_df + + # get independent relapse tumor sample and subset from all tumor samples dataframe + relapse_tumors_samples_list = list(pd.read_csv(relapase_tumors_file, sep="\t", dtype=str)["Kids_First_Biospecimen_ID"].unique()) + relapse_tumors_df = all_tumors_df[all_tumors_df.Kids_First_Biospecimen_ID.isin(relapse_tumors_samples_list)].reset_index() + tumor_dfs["relapse_tumors"] = relapse_tumors_df + + # compute variant frequencies for each cancer group per cohort and cancer group in cohorts + # for the overal dataset (all tumor samples) and independent primary/replase tumor samples + def func(x): + d = {} + d["Gene_symbol"] = ",".join(x["gene_symbol"].unique()) + d["total_sample_alterations"] = x["Kids_First_Biospecimen_ID"].nunique() + d["total_patient_alterations"] = x["Kids_First_Participant_ID"].nunique() + return(pd.Series(d, index=["Gene_symbol", "total_sample_alterations", "total_patient_alterations"])) + all_tumors_frequecy_dfs = [] + primary_tumors_frequecy_dfs = [] + relapse_tumors_frequecy_dfs = [] + for row in cancer_group_cohort_df.itertuples(index=False): + if row.num_samples > 5: + for df_name, tumor_df in tumor_dfs.items(): + if row.cohort == "all_cohorts": + df = tumor_df[(tumor_df["cancer_group"] == row.cancer_group)] + else: + df = tumor_df[(tumor_df["cancer_group"] == row.cancer_group) & (tumor_df["cohort"] == row.cohort)] + if df.empty: + continue + num_samples = df["Kids_First_Biospecimen_ID"].nunique() + num_patients = df["Kids_First_Participant_ID"].nunique() + df = df.groupby(["ensembl", "status"]).apply(func) + df = df.rename_axis(["Gene_Ensembl_ID", "Variant_type"]).reset_index() + df["num_patients"] = num_patients + for i in df.itertuples(): + if df_name == "primary_tumors" or df_name == "relapse_tumors": + df.at[i.Index, "Total_alterations/Patients_in_dataset"] = "{}/{}".format(i.total_sample_alterations, num_samples) + df.at[i.Index, "Frequency_in_overall_dataset"] = "{:.2f}%".format((i.total_sample_alterations/num_samples)*100) + if df_name == "all_tumors": + df.at[i.Index, "Total_alterations/Patients_in_dataset"] = "{}/{}".format(i.total_patient_alterations, num_patients) + df.at[i.Index, "Frequency_in_overall_dataset"] = "{:.2f}%".format((i.total_patient_alterations/num_patients)*100) + df.at[i.Index, "Dataset"] = row.cohort + df.at[i.Index, "Disease"] = row.cancer_group + df = df [["Gene_symbol", "Gene_Ensembl_ID", "Variant_type", "Dataset", "Disease", "Total_alterations/Patients_in_dataset", "Frequency_in_overall_dataset"]] + if df_name == "primary_tumors": + primary_tumors_frequecy_dfs.append(df) + elif df_name == "relapse_tumors": + relapse_tumors_frequecy_dfs.append(df) + else: + all_tumors_frequecy_dfs.append(df) + + # merge overal dataset (all tumor samples) and independent primary/replase tumor samples + # frequencies for cancer groups per cohorts into a single dataframe + merging_list = [] + #frequencies in overall dataset + all_tumors_frequecy_df = pd.concat(all_tumors_frequecy_dfs, sort=False, ignore_index=True) + merging_list.append(all_tumors_frequecy_df) + # frequencies in independent primary tumors + primary_tumors_frequecy_df = pd.concat(primary_tumors_frequecy_dfs, sort=False, ignore_index=True) + primary_tumors_frequecy_df.rename(columns={"Total_alterations/Patients_in_dataset": "Total_primary_tumors_altered/Primary_tumors_in_dataset", "Frequency_in_overall_dataset": "Frequency_in_primary_tumors"}, inplace=True) + merging_list.append(primary_tumors_frequecy_df) + # frequencies in independent relapse tumors + relapse_tumors_frequecy_df = pd.concat(relapse_tumors_frequecy_dfs, sort=False, ignore_index=True) + relapse_tumors_frequecy_df.rename(columns={"Total_alterations/Patients_in_dataset": "Total_relapse_tumors_altered/Relapse_tumors_in_dataset", "Frequency_in_overall_dataset": "Frequency_in_relapse_tumors"}, inplace=True) + merging_list.append(relapse_tumors_frequecy_df) + cnv_frequency_df = reduce(lambda x, y: pd.merge(x, y, how="outer", on=["Gene_symbol", "Gene_Ensembl_ID", "Variant_type", "Dataset", "Disease"]), merging_list).fillna("") + cnv_frequency_df = cnv_frequency_df.replace({"Total_primary_tumors_altered/Primary_tumors_in_dataset": "", "Total_relapse_tumors_altered/Relapse_tumors_in_dataset": ""}, "0/0") + return(cnv_frequency_df) + + +def get_annotations(cnv_frequency_df, CNV_FILE): + # insert variant category annotation column in the CNV frequency dataframe + cnv_frequency_df["Variant_category"] = cnv_frequency_df.insert(3, "Variant_category", "") + cnv_frequency_df.fillna("", inplace=True) + + # create module results directory + results_dir = "{}/results".format(os.path.dirname(__file__)) + if not os.path.exists(results_dir): + os.mkdir(results_dir) + + # get CNV input parameters + args = read_parameters() + + # write annotated CNV frequencies results to TSV file + cnv_freq_tsv = "{}/{}_freq.tsv".format(results_dir, os.path.splitext(os.path.splitext(os.path.basename(args.CNV_FILE))[0])[0]) + cnv_frequency_df.to_csv(cnv_freq_tsv, sep="\t", index=False, encoding="utf-8") + + # annotate full gene names, OncoKB categories, EFO and MONDO disease accessions, + # and Relevant Molecular Target (RMTL) from the long-format-table-utils analysis module + log_file = "{}/annotator.log".format(results_dir) + cnv_annot_freq_tsv = "{}/{}_annot_freq.tsv".format(results_dir, os.path.splitext(os.path.splitext(os.path.basename(args.CNV_FILE))[0])[0]) + with open(log_file, "w") as log: + subprocess.run(["Rscript", "--vanilla", "analyses/long-format-table-utils/annotator/annotator-cli.R", "-r", "-c", "Gene_full_name,RMTL,OncoKB_cancer_gene,OncoKB_oncogene_TSG,EFO,MONDO", "-i", cnv_freq_tsv, "-o", cnv_annot_freq_tsv, "-v"], stdout=log, check=True) + + # transform annotated CNV frequencies results from TSV to JSONL file + cnv_annot_freq_jsonl = "{}/{}_annot_freq.jsonl".format(results_dir, os.path.splitext(os.path.splitext(os.path.basename(args.CNV_FILE))[0])[0]) + tsv_file = open(cnv_annot_freq_tsv, "r") + jsonl_file = open(cnv_annot_freq_jsonl, "w") + reader = csv.DictReader(tsv_file, delimiter="\t") + headers = reader.fieldnames + for row in reader: + row_dict = OrderedDict() + for header in headers: + row_dict[header] = row[header] + json.dump(row_dict, jsonl_file) + jsonl_file.write("\n") + tsv_file.close() + jsonl_file.close() + + +def main(): + # get input parameters + args = read_parameters() + + # call functions to compute CNV gene-level and add functional annotations + all_tumors_df = merge_histology_and_cnv_data(args.HISTOLOGY_FILE, args.CNV_FILE) + cancer_group_cohort_df = get_cancer_groups_and_cohorts(all_tumors_df) + cnv_frequency_df = compute_variant_frequencies(all_tumors_df, args.PRIMARY_TUMORS, args.RELAPSE_TUMORS, cancer_group_cohort_df) + cnv_frequency_df = get_annotations(cnv_frequency_df, args.CNV_FILE) + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/analyses/cnv-frequencies/README.md b/analyses/cnv-frequencies/README.md new file mode 100644 index 0000000000..9d5d00f7ab --- /dev/null +++ b/analyses/cnv-frequencies/README.md @@ -0,0 +1,99 @@ +## Annotate CNV table with mutation frequencies + +Adapted from [snv-frequencies](https://github.com/logstar/OpenPedCan-analysis/tree/snv-freq/analyses/snv-frequencies) +**Module author:** Yuanchao Zhang ([@logstar](https://github.com/logstar)) + +Adapted from [fusion-frequencies](https://github.com/PediatricOpenTargets/OpenPedCan-analysis/tree/kgaonkar6/fusion_freq/analyses/fusion-frequencies) +**Module author:** Krutika Gaonkar ([@kgaonkar6](https://github.com/kgaonkar6)) + +Adapted by Eric Wafula ([@ewafula](https://github.com/ewafula)) + +### Purpose +Uses `consensus_wgs_plus_cnvkit_wxs_autosomes.tsv.gz` and `consensus_wgs_plus_cnvkit_wxs_x_and_y.tsv.gz` consensus CNV calls and variant types (`amplification`, `deep deletion`, `gain`, `loss`, and `neutral`) to determine `Ensembl` gene-level mutation frequencies for each cancer type in an overall cohort dateset and in the independent primary/relapse cohort subsets of the data. + +#### Additional annotation +Additional disease and gene annotations include `gene full names` `RMTL designations`, `OncoKB categories`, and `EFO and MONDO identifiers` integrated to the CNV frequencies table using the [long-format-table-utils analysis module)](https://github.com/PediatricOpenTargets/OpenPedCan-analysis/tree/dev/analyses/long-format-table-utils). + +Each `cancer_group` and `cohort`(s) combination is considered a `cancer_group_cohort`. `cancer_group_cohort` with `n_samples` >= 5, compute `Frequency_in_overall_dataset`, `Frequency_in_primary_tumors`, and `Frequency_in_relapse_tumors` as following: + +- `Frequency_in_overall_dataset`: + - For each unique variant, count the number of patients (identified by `Kids_First_Participant_ID`) that have the variant, and call this number `Total_alterations`. + - Count the total number of patients in the `cancer_group_cohort`, and call this number `Patients_in_dataset`. + - `Frequency_in_overall_dataset = Total_alterations / Patients_in_dataset`. + +- `Frequency_in_primary_tumors`: + - For each unique variant, count the number of samples (identified by `Kids_First_Biospecimen_ID`) that are in the `independent-specimens.wgs.primary.eachcohort.tsv`, and call this number `Total_primary_tumors_alterated`. + - Count the total number of samples in the `cancer_group_cohort` that are also in the `independent-specimens.wgswxspanel.primary.eachcohort.tsv`, and call this number `Primary_tumors_in_dataset`. + - `Frequency_in_primary_tumors = Total_primary_tumors_alterated / Primary_tumors_in_dataset`. + +- `Frequency_in_relapse_tumors`: + - For each unique variant, count the number of samples (identified by `Kids_First_Biospecimen_ID`) that are in the `independent-specimens.wgs.relapse.eachcohort.tsv`, and call this number `Total_relapse_tumors_alterated`. + - Count the total number of samples in the `cancer_group_cohort` that are also in the `independent-specimens.wgswxspanel.relapse.eachcohort.tsv`, and call this number `Relapse_tumors_in_dataset`. + - `Frequency_in_relapse_tumors = Total_relapse_tumors_alterated / Relapse_tumors_in_dataset`. + +Format the CNV mutation frequency table according to the latest spreadsheet that is attached in . + +Merge the CNV frequency tables of all `cancer_group_cohort`s. + +### Results + +Results are generated using PediatricOpenTargets/OpenPedCan-analysis data release v7. + +The merged CNV frequency table of all `cancer_group_cohort`s is output in TSV and JSONL formats. + +- `gene-level-cnv-consensus-annotated-mut-freq.tsv.gz` +- `gene-level-cnv-consensus-annotated-mut-freq.jsonl.gz` + +### Analysis scripts + +### `run-cnv-frequencies-analysis.sh` +This is a bash script wrapper for setting input file paths for the main anlysis script, `01-cnv-frequencies.py`. All file paths in set in this script are based on root directory of this Git repository . Therefore, the script should always be run from the root directory of OPenPedCan-analysis + +Adapted from [snv-callers analysis module](https://github.com/PediatricOpenTargets/OpenPedCan-analysis/blob/dev/analyses/snv-callers/run_caller_consensus_analysis.sh) +**Module author:** Candace Savonen ([@cansavvy](https://github.com/cansavvy)) + +Usage: +```bash +bash run-cnv-frequencies-analysis.sh + +``` + +### `01-cnv-frequencies.py` +Python functions to create copy number variation (CNV) cancer type and study gene-level frequencies for OPenPedCan analyses modules + +Usage: +```bash +python3 analysis/cnv-frequencies/01-cnv-frequencies.py HISTOLOGY_FILE CNV_FILE PRIMARY_TUMORS RELAPSE_TUMORS +``` + +Parameter Options: +``` +positional arguments: + HISTOLOGY_FILE OPenPedCan histology file (histologies.tsv) + + CNV_FILE OPenPedCan CNV consensus file + (consensus_wgs_plus_cnvkit_wxs_autosomes.tsv.gz + and consensus_wgs_plus_cnvkit_wxs_x_and_y.tsv.gz) + + PRIMARY_TUMORS OPenPedCan independent primary tumor samples file + (independent-specimens.wgswxspanel.primary.eachcohort.tsv) + + RELAPSE_TUMORS OPenPedCan independent relapse tumor samples file + (independent-specimens.wgswxspanel.relapse.eachcohort.tsv) + +optional arguments: + -h, --help show this help message and exit + -v, --version Print the current 01-cnv-frequencies.py version and exit +``` + +Input: +- `data/histologies.tsv` +- `data/consensus_wgs_plus_cnvkit_wxs_autosomes.tsv.gz` +- `data/consensus_wgs_plus_cnvkit_wxs_x_and_y.tsv.gz` +- `analyses/independent-samples/results/independent-specimens.wgswxspanel.primary.eachcohort.tsv` +- `analyses/independent-samples/results/independent-specimens.wgswxspanel.relapse.eachcohort.tsv` + +Output: +- `analysis/cnv-frequencies/results/gene-level-cnv-consensus-annotated-mut-freq.tsv.gz` +- `analysis/cnv-frequencies/results/gene-level-cnv-consensus-annotated-mut-freq.jsonl.gz` + diff --git a/analyses/cnv-frequencies/results/gene-level-cnv-consensus-annotated-mut-freq.jsonl.gz b/analyses/cnv-frequencies/results/gene-level-cnv-consensus-annotated-mut-freq.jsonl.gz new file mode 100644 index 0000000000..1dd265254e Binary files /dev/null and b/analyses/cnv-frequencies/results/gene-level-cnv-consensus-annotated-mut-freq.jsonl.gz differ diff --git a/analyses/cnv-frequencies/results/gene-level-cnv-consensus-annotated-mut-freq.tsv.gz b/analyses/cnv-frequencies/results/gene-level-cnv-consensus-annotated-mut-freq.tsv.gz new file mode 100644 index 0000000000..6fa08dab5d Binary files /dev/null and b/analyses/cnv-frequencies/results/gene-level-cnv-consensus-annotated-mut-freq.tsv.gz differ diff --git a/analyses/cnv-frequencies/run-cnv-frequencies-analysis.sh b/analyses/cnv-frequencies/run-cnv-frequencies-analysis.sh new file mode 100644 index 0000000000..810373ce3b --- /dev/null +++ b/analyses/cnv-frequencies/run-cnv-frequencies-analysis.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Eric Wafula +# Pediatric OpenTargets 2021 + +# Purpose: Run a CNV consensus gene-level frequecies anaysis for PediatricOpenTargets + +# Set this so the whole loop stops if there is an error +set -e +set -o pipefail + +# All file paths in this bash script are based on root directory of this Git repository, +# therefore the script should always be run from the root directory of OPenPedCan-analysis +# Adapted from OpenPBTA analysis modules + +# Histology file path +histology_file=data/histologies.tsv + +# CNV consensus file path +autosomes_cnv_file=data/consensus_wgs_plus_cnvkit_wxs_autosomes.tsv.gz +allosomes_cnv_file=data/consensus_wgs_plus_cnvkit_wxs_x_and_y.tsv.gz + +# Independent primary tumor samples file path +primary_tumors=analyses/independent-samples/results/independent-specimens.wgswxspanel.primary.eachcohort.tsv + +# Independent relapse tumor samples file path +relapse_tumors=analyses/independent-samples/results/independent-specimens.wgswxspanel.relapse.eachcohort.tsv + +####### compute autosomes CNV frequencies ############# +python3 analyses/cnv-frequencies/01-cnv-frequencies.py $histology_file $autosomes_cnv_file $primary_tumors $relapse_tumors + +####### compute allosomes CNV frequencies ############# +python3 analyses/cnv-frequencies/01-cnv-frequencies.py $histology_file $allosomes_cnv_file $primary_tumors $relapse_tumors + +####### merge result files ###################### +# intricate bash command concatenates the autosomes and allosomes annotated CNV frequencies TSV files, and excludes the header line for the second +# file to avoid duplicate header in the merged file +# bash command "tail -n +2" outputs content of file starting from the second line +# bash Process substitution operator "<(...)" treats the sequence of enclosed commands as file +cat analyses/cnv-frequencies/results/consensus_wgs_plus_cnvkit_wxs_autosomes_annot_freq.tsv <(tail -n +2 analyses/cnv-frequencies/results/consensus_wgs_plus_cnvkit_wxs_x_and_y_annot_freq.tsv)> analyses/cnv-frequencies/results/gene-level-cnv-consensus-annotated-mut-freq.tsv +# simple bash command concatenates the autosomes and allosomes annotated CNV frequencies JSONL files +cat analyses/cnv-frequencies/results/consensus_wgs_plus_cnvkit_wxs_autosomes_annot_freq.jsonl analyses/cnv-frequencies/results/consensus_wgs_plus_cnvkit_wxs_x_and_y_annot_freq.jsonl > analyses/cnv-frequencies/results/gene-level-cnv-consensus-annotated-mut-freq.jsonl + +####### compress the final merged result files ###################### +gzip analyses/cnv-frequencies/results/gene-level-cnv-consensus-annotated-mut-freq* + +####### remove intermediate temporary and log files ###################### +rm -f analyses/cnv-frequencies/results/consensus_wgs_plus_cnvkit_wxs_* +rm -f analyses/cnv-frequencies/results/annotator.log +