Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update canonical transcript overrides with explanation #75

Merged
merged 6 commits into from
Mar 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions data/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,12 @@ GENOME_BUILD=$(firstword $(subst _, ,$(VERSION)))

ifeq ($(GENOME_BUILD), grch38)
MSKCC_ISOFORM_OVERRIDES_FILE_NAME=isoform_overrides_at_mskcc_grch38.txt
ONCOKB_ISOFORM_OVERRIDES_FILE_NAME=isoform_overrides_oncokb_grch38.txt
GENOME_NEXUS_ISOFORM_OVERRIDES_FILE_NAME=isoform_overrides_genome_nexus_grch38.txt
else
MSKCC_ISOFORM_OVERRIDES_FILE_NAME=isoform_overrides_at_mskcc_grch37.txt
ONCOKB_ISOFORM_OVERRIDES_FILE_NAME=isoform_overrides_oncokb_grch37.txt
GENOME_NEXUS_ISOFORM_OVERRIDES_FILE_NAME=isoform_overrides_genome_nexus_grch37.txt
endif

# Generic rule to unzip prerequisite files
Expand Down Expand Up @@ -162,9 +166,9 @@ $(TMP_DIR)/ensembl_biomart_transcripts_mouse.json.gz: $(TMP_DIR)/ensembl_biomart
# about 50m to run (TODO: this can be easily optimized)
# isoform_overrides_genome_nexus.txt is made for genome nexus, others files are generated for vcf2maf
# Please note: we should keep hgnc_complete_set_20221001 in sync with https://github.com/cBioPortal/datahub-study-curation-tools/blob/master/gene-table-update/build-input-for-importer/hgnc_complete_set.txt
# isoform_overrides_oncokb.txt is a list of OncoKB transcripts and genes that differ from msk_override, original file could be downloaded here: https://docs.google.com/spreadsheets/d/1ZZt8x0vvhrwL6VLQRzx3YE7XUOvEM-noQl7v6Tg7lCQ/edit#gid=0
$(TMP_DIR)/ensembl_biomart_canonical_transcripts_per_hgnc.txt: $(TMP_DIR)/ensembl_canonical_data.txt common_input/hgnc_complete_set_20221001.txt common_input/isoform_overrides_uniprot.txt common_input/$(MSKCC_ISOFORM_OVERRIDES_FILE_NAME) common_input/isoform_overrides_genome_nexus.txt common_input/isoform_overrides_oncokb.txt common_input/ignored_genes.txt
python3 ../scripts/make_one_canonical_transcript_per_gene.py $^ $@
# isoform_overrides_oncokb_grch3*.txt is a list of OncoKB transcripts and genes, it's generated by download_oncokb_isoform_overrides.py
$(TMP_DIR)/ensembl_biomart_canonical_transcripts_per_hgnc.txt: $(TMP_DIR)/ensembl_canonical_data.txt common_input/hgnc_complete_set_20221001.txt common_input/isoform_overrides_uniprot.txt common_input/$(MSKCC_ISOFORM_OVERRIDES_FILE_NAME) common_input/$(GENOME_NEXUS_ISOFORM_OVERRIDES_FILE_NAME) common_input/$(ONCOKB_ISOFORM_OVERRIDES_FILE_NAME) common_input/ignored_genes.txt
python ../scripts/make_one_canonical_transcript_per_gene.py $^ $@

# mouse version. A different script is called that set the canonicals based on Ensembl lookup.
$(TMP_DIR)/ensembl_biomart_canonical_transcripts_per_mgi.txt: $(TMP_DIR)/ensembl_canonical_data.txt common_input/mouse/MRK_ENSEMBL.rpt common_input/mouse/MGI_Gene_Model_Coord.rpt
Expand Down
2 changes: 0 additions & 2 deletions data/common_input/isoform_overrides_genome_nexus.txt

This file was deleted.

3 changes: 3 additions & 0 deletions data/common_input/isoform_overrides_genome_nexus_grch37.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
enst_id gene_name protein_stable_id gene_stable_id comment
ENST00000573679 PTPRC ENSP00000458322 ENSG00000262418
Copy link
Member Author

@leexgh leexgh Feb 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comes from: 88f0d68. @inodb Do you want to add some comments?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm having a hard time reconstructing why i picked that one, maybe ok to leave as is for now

ENST00000276681 MAL2 ENSP00000475434 ENSG00000147676 uniprot isoform overrides is ENST00000614891, but ENST00000614891 is only available for grch38
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
enst_id gene_name protein_stable_id gene_stable_id comment
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@inodb Do you have more grch38 transcript overrides to add?

86,444 changes: 43,222 additions & 43,222 deletions data/grch37_ensembl92/export/ensembl_biomart_canonical_transcripts_per_hgnc.txt

Large diffs are not rendered by default.

86,444 changes: 43,222 additions & 43,222 deletions data/grch38_ensembl92/export/ensembl_biomart_canonical_transcripts_per_hgnc.txt

Large diffs are not rendered by default.

86,444 changes: 43,222 additions & 43,222 deletions data/grch38_ensembl95/export/ensembl_biomart_canonical_transcripts_per_hgnc.txt

Large diffs are not rendered by default.

122 changes: 75 additions & 47 deletions scripts/make_one_canonical_transcript_per_gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,64 +5,104 @@
import argparse


def get_overrides_transcript(overrides_tables, ensembl_table, hgnc_symbol, hgnc_canonical_genes):
def get_overrides_transcript(overrides_tables, ensembl_table, ensembl_table_indexed_by_gene_stable_id, hgnc_symbol, hgnc_canonical_genes, overrides_table_names):
"""Find canonical transcript id for given hugo symbol. Overrides_tables is
a list of different override tables"""
for overrides in overrides_tables:
for index in range(len(overrides_tables)):
overrides = overrides_tables[index]
try:
# corner case when there are multiple overrides for a given gene symbol
if overrides.loc[hgnc_symbol].ndim > 1:
transcript = overrides.loc[hgnc_symbol].isoform_override.values[0]
isoform_override = overrides_table_names[index]
else:
transcript = overrides.loc[hgnc_symbol].isoform_override
return transcript
isoform_override = overrides_table_names[index]
# return transcript and isoform_override_explanation
return transcript, isoform_override
except KeyError:
pass
else:
# get ensembl canonical version otherwise
return get_ensembl_canonical_transcript_id_from_hgnc_then_ensembl(ensembl_table, hgnc_symbol, hgnc_canonical_genes, 'transcript_stable_id')

# get ensembl canonical version otherwise
return get_ensembl_canonical_transcript_id_from_hgnc_then_ensembl(ensembl_table, ensembl_table_indexed_by_gene_stable_id, hgnc_symbol, hgnc_canonical_genes, 'transcript_stable_id')

def pick_canonical_longest_transcript_from_ensembl_table(ensembl_rows, field):
"""Get canonical transcript id with largest protein length or if there is
no such thing, pick biggest gene id"""
return ensembl_rows.sort_values('is_canonical protein_length gene_stable_id'.split(), ascending=False)[field].values[0]

return ensembl_rows.sort_values('is_canonical protein_length gene_stable_id'.split(), ascending=False)[field].values[0], "ensembl longest"

def get_ensembl_canonical(ensembl_rows, field):

if (ensembl_rows.ndim == 1 and len(ensembl_rows) == 0) or (ensembl_rows.ndim == 2 and len(ensembl_rows) == 0):
return np.nan
return np.nan, np.nan
elif ensembl_rows.ndim == 1:
return ensembl_rows[field]
# this gene onlyhas one transcript from ensembl_table
return ensembl_rows[field], "ensembl only one transcript"
elif ensembl_rows.ndim == 2:
if len(ensembl_rows) == 1:
return ensembl_rows[field].values[0]
else:
return pick_canonical_longest_transcript_from_ensembl_table(ensembl_rows, field)
# there are multiple transcripts found by hugo_symbol or ensembl_gene_id
# we need to pick the longest transcript in this case
return pick_canonical_longest_transcript_from_ensembl_table(ensembl_rows, field)


def get_ensembl_canonical_transcript_id_from_hgnc_then_ensembl(ensembl_table, hgnc_symbol, hgnc_canonical_genes, field):
def get_ensembl_canonical_transcript_id_from_hgnc_then_ensembl(ensembl_table, ensembl_table_indexed_by_gene_stable_id, hgnc_symbol, hgnc_canonical_genes, field):
"""Determine canonical transcript based on hgnc mappings to ensembl id.
If not possible use ensembl's data."""
If not possible use ensembl's data.
ensembl_table is the same as ensembl_table_indexed_by_gene_stable_id
But ensembl_table has hugo_symbol as index
ensembl_table_indexed_by_gene_stable_id has gene_stable_id as index
Adding ensembl_table_indexed_by_gene_stable_id to improve performance since this is the most time consuming part
"""
try:
# try to find hgnc_symbol from hgnc_canonical_genes
# it should only have one row returned, otherwise raise an exception
hgnc_gene_rows = hgnc_canonical_genes.loc[hgnc_symbol]
except KeyError:
raise(Exception("Unknown hugo symbol"))

if hgnc_gene_rows.ndim == 1:
result = get_ensembl_canonical(ensembl_table[ensembl_table.gene_stable_id == hgnc_gene_rows.ensembl_gene_id], field)
if pd.isnull(result):
# use ensembl data to find canonical transcript if nan is found
try:
# from the only one record found in hgnc_canonical_genes, we could have the ensembl_gene_id from the record
# find transcripts from ensembl_table (same as ensembl_table_indexed_by_gene_stable_id) by searching ensembl_gene_id
# here we use ensembl_table_indexed_by_gene_stable_id instead to increase performance
# get_ensembl_canonical returns one transcript from all transcripts found by ensembl_gene_id
return get_ensembl_canonical(ensembl_table_indexed_by_gene_stable_id.loc[hgnc_gene_rows.ensembl_gene_id], field)
except KeyError:
# if couldn't find any transcripts by ensembl_gene_id, switch to searching by hgnc_symbol
# there's actually 222 of these (see notebook)
try:
return get_ensembl_canonical(ensembl_table.loc[hgnc_symbol], field)
except KeyError:
return np.nan
else:
return result
return np.nan, np.nan
else:
raise(Exception("One hugo symbol expected in hgnc_canonical_genes"))

def get_transcript_id_and_explanation(transcript_info_df, transcript_info_indexed_by_gene_stable_id, hugo_symbol, hgnc_df, oncokb, mskcc, uniprot, custom):
ensembl_canonical_gene, ensembl_canonical_gene_explanation = get_ensembl_canonical_transcript_id_from_hgnc_then_ensembl(transcript_info_df, transcript_info_indexed_by_gene_stable_id, hugo_symbol, hgnc_df, 'gene_stable_id')
ensembl_canonical_transcript, ensembl_canonical_transcript_explanation = get_ensembl_canonical_transcript_id_from_hgnc_then_ensembl(transcript_info_df, transcript_info_indexed_by_gene_stable_id, hugo_symbol, hgnc_df, 'transcript_stable_id')
genome_nexus_overrides_transcript, genome_nexus_overrides_transcript_explanation = get_overrides_transcript([custom], transcript_info_df, transcript_info_indexed_by_gene_stable_id, hugo_symbol, hgnc_df, ["genome nexus isoform override"])
uniprot_overrides_transcript, uniprot_overrides_transcript_explanation = get_overrides_transcript([custom, uniprot], transcript_info_df, transcript_info_indexed_by_gene_stable_id, hugo_symbol, hgnc_df, [ "manually override", "uniprot isoform override"])
mskcc_overrides_transcript, mskcc_overrides_transcript_explanation = get_overrides_transcript([oncokb, mskcc, custom, uniprot], transcript_info_df, transcript_info_indexed_by_gene_stable_id, hugo_symbol, hgnc_df, ["oncokb isoform override", "mskcc isoform override", "manually override", "uniprot isoform override"])
return pd.Series([
ensembl_canonical_gene,
ensembl_canonical_transcript,
ensembl_canonical_transcript_explanation,
genome_nexus_overrides_transcript,
genome_nexus_overrides_transcript_explanation,
uniprot_overrides_transcript,
uniprot_overrides_transcript_explanation,
mskcc_overrides_transcript,
mskcc_overrides_transcript_explanation
],
index="""
ensembl_canonical_gene
ensembl_canonical_transcript
ensembl_canonical_transcript_explanation
genome_nexus_canonical_transcript
genome_nexus_canonical_transcript_explanation
uniprot_canonical_transcript
uniprot_canonical_transcript_explanation
mskcc_canonical_transcript
mskcc_canonical_transcript_explanation
"""
.split()
)

def lowercase_set(x):
return set({i.lower() for i in x})
Expand Down Expand Up @@ -100,7 +140,7 @@ def main(ensembl_biomart_geneids_transcript_info,
.set_index('gene_name'.split())
oncokb = pd.read_csv(isoform_overrides_at_oncokb, sep='\t')\
.rename(columns={'enst_id':'isoform_override'})\
.set_index('gene_name'.split())
.set_index('hugo_symbol'.split())
hgnc_df = pd.read_csv(hgnc_complete_set, sep='\t', dtype=object)

# Convert new column names to old stable column names. If this is not done properly, Genome Nexus and any other
Expand Down Expand Up @@ -156,32 +196,20 @@ def main(ensembl_biomart_geneids_transcript_info,
'------ End of new genes list ------\n')
assert(len(new_genes) == 0)

transcript_info_df = transcript_info_df.set_index('hgnc_symbol')
transcript_info_df = transcript_info_df.set_index('hgnc_symbol').sort_index()
transcript_info_indexed_by_gene_stable_id = transcript_info_df
# duplicate a temp column to use as index
transcript_info_indexed_by_gene_stable_id["gene_stable_id_temp"] = transcript_info_df['gene_stable_id']
transcript_info_indexed_by_gene_stable_id = transcript_info_indexed_by_gene_stable_id.set_index('gene_stable_id_temp').sort_index()

# for testing use
# NSD3 replaces WHSC1L1
# AATF has uniprot canonical transcript, not hgnc ensembl gene id, but
# there are multiple in ensembl data dump
# hugos = ['KRT18P53', 'NSD3', 'AATF']

# TODO Optimize this part, as this part takes most time
one_transcript_per_hugo_symbol = pd.Series(hugos).apply(lambda x:
pd.Series(
[
get_ensembl_canonical_transcript_id_from_hgnc_then_ensembl(transcript_info_df, x, hgnc_df, 'gene_stable_id'),
get_ensembl_canonical_transcript_id_from_hgnc_then_ensembl(transcript_info_df, x, hgnc_df, 'transcript_stable_id'),
get_overrides_transcript([custom], transcript_info_df, x, hgnc_df),
get_overrides_transcript([uniprot, custom], transcript_info_df, x, hgnc_df),
get_overrides_transcript([oncokb, mskcc, uniprot, custom], transcript_info_df, x, hgnc_df),
],
index="""
ensembl_canonical_gene
ensembl_canonical_transcript
genome_nexus_canonical_transcript
uniprot_canonical_transcript
mskcc_canonical_transcript
""".split()
)
)
one_transcript_per_hugo_symbol = pd.Series(hugos).apply(lambda x: get_transcript_id_and_explanation(transcript_info_df, transcript_info_indexed_by_gene_stable_id, x, hgnc_df, oncokb, mskcc, uniprot, custom ))
one_transcript_per_hugo_symbol.index = hugos
one_transcript_per_hugo_symbol.index.name = 'hgnc_symbol'

Expand Down Expand Up @@ -209,9 +237,9 @@ def main(ensembl_biomart_geneids_transcript_info,
parser.add_argument("isoform_overrides_at_mskcc",
help="common_input/isoform_overrides_at_mskcc_grch37.txt or common_input/isoform_overrides_at_mskcc_grch38.txt")
parser.add_argument("isoform_overrides_genome_nexus",
help="common_input/isoform_overrides_genome_nexus.txt")
help="common_input/isoform_overrides_genome_nexus_grch37.txt or common_input/isoform_overrides_genome_nexus_grch38.txt")
parser.add_argument("isoform_overrides_at_oncokb",
help="common_input/isoform_overrides_at_oncokb.txt")
help="common_input/isoform_overrides_oncokb_grch37.txt or common_input/isoform_overrides_oncokb_grch38.txt")
parser.add_argument("ignored_genes_file_name",
help="common_input/ignored_genes.txt")
parser.add_argument("ensembl_biomart_canonical_transcripts_per_hgnc",
Expand Down