From c942e0061f85372fe5b3ecd0f07fe383b08dd989 Mon Sep 17 00:00:00 2001 From: "Jeroen F.J. Laros" Date: Sun, 27 Sep 2015 21:38:37 +0200 Subject: [PATCH 1/2] Add back translator interface --- mutalyzer/backtranslator.py | 159 ++++++++++++++++++ .../website/templates/back-translator.html | 82 +++++++++ mutalyzer/website/templates/base.html | 1 + mutalyzer/website/views.py | 31 +++- requirements.txt | 3 +- 5 files changed, 273 insertions(+), 3 deletions(-) create mode 100644 mutalyzer/backtranslator.py create mode 100644 mutalyzer/website/templates/back-translator.html diff --git a/mutalyzer/backtranslator.py b/mutalyzer/backtranslator.py new file mode 100644 index 00000000..1b84a9cd --- /dev/null +++ b/mutalyzer/backtranslator.py @@ -0,0 +1,159 @@ +""" +Mutalyzer interface for variant back translation (p. to c.). +""" + + +from __future__ import unicode_literals + +from backtranslate.backtranslate import BackTranslate +from backtranslate.util import protein_letters_3to1 +from Bio.Data import CodonTable +from extractor.variant import Allele, DNAVar, ISeq, ISeqList + +from . import ncbi, Retriever +from .grammar import Grammar +from .output import Output + + +def backtranslate(output, description, table_id=1): + """ + Back translation of an amino acid substitution to all possible + causal one-nucleotide substitutions. + + :arg object output: Output object for logging. + :arg unicode description: Amino acid substitution in HGVS format. + :arg int table_id: Translation table id. + + :returns: List of DNA substitutions in HGVS format. + :rtype: list(unicode) + """ + # TODO: We currently only support NM and NP references, but in principle + # we could also support other references. + # TODO: For NP references where we don't find a link to the corresponding + # NM, we don't check if the specified reference amino acid corresponds + # to the NP reference sequence. + parse_tree = Grammar(output).parse(description) + + if not parse_tree: + return [] + if parse_tree.RefType != 'p': + output.addMessage( + __file__, 3, 'ENOPROT', 'Reference type is not p. (protein).') + return [] + if parse_tree.RawVar.MutationType != 'subst': + output.addMessage( + __file__, 3, 'ENOSUBST', 'Variant is not a substitution.') + return [] + + if parse_tree.Version: + accession_number = '{}.{}'.format(parse_tree.RefSeqAcc, parse_tree.Version) + else: + accession_number = parse_tree.RefSeqAcc + position = int(parse_tree.RawVar.Main) + reference_amino_acid, amino_acid = parse_tree.RawVar.Args + + if len(reference_amino_acid) == 3: + reference_amino_acid = protein_letters_3to1[reference_amino_acid] + if len(amino_acid) == 3: + amino_acid = protein_letters_3to1[amino_acid] + + bt = BackTranslate(table_id) + + # FIXME: Rancid workaround to silence fatal error raised by `loadrecord`. + output_ = Output('') + retriever = Retriever.GenBankRetriever(output_) + + # The genbank retriever does not (yet) support protein references, but we + # cannot reliably distinguish between different reference types from the + # variant description before downloading the reference. + # Therefore, we just try to download the reference. This will succeed for + # transcript references, but fail for protein references. + # As a quick and dirty optimization, we shortcut this for accessions + # starting with 'NP_', of which we know that they are protein references. + # In the future we hope to support protein references directly. + if accession_number.startswith('NP_'): + genbank_record = None + else: + genbank_record = retriever.loadrecord(accession_number) + + if not genbank_record: + # Assuming RefSeqAcc is an NP, try to get the corresponding NM. + version = int(parse_tree.Version) if parse_tree.Version else None + + try: + transcript = ncbi.protein_to_transcript(parse_tree.RefSeqAcc, + version) + except ncbi.NoLinkError: + pass + else: + if transcript[1] is not None: + accession_number = '{}.{}'.format(*transcript) + else: + output.addMessage( + __file__, 2, 'WNOVERSION', + 'Found corresponding nucleotide sequence, but note that ' + 'the version number is missing.') + accession_number = transcript[0] + genbank_record = retriever.loadrecord(accession_number) + + offset = (position - 1) * 3 + if genbank_record and genbank_record.molType == 'n': + # Only NM for now. + cds_loc = genbank_record.geneList[0].transcriptList[0].CDS.location + codon = genbank_record \ + .seq[cds_loc[0] - 1:cds_loc[1]][offset:offset + 3] + + forward_table = CodonTable.unambiguous_dna_by_id[table_id] + found_ref = forward_table.forward_table[unicode(codon)] + if reference_amino_acid != found_ref: + output.addMessage( + __file__, 3, 'EREF', + '{} not found at position {}, found {} instead.'.format( + reference_amino_acid, position, found_ref)) + + substitutions = bt.with_dna(unicode(codon), amino_acid) + else: + # Assume NP. + output.addMessage( + __file__, 2, 'WNODNA', + 'Nucleotide reference sequence could not be found, using ' + 'protein fallback method.') + accession_number = 'UNKNOWN' + + substitutions = bt.without_dna(reference_amino_acid, amino_acid) + if (reference_amino_acid, amino_acid) in bt.improvable(): + output.addMessage( + __file__, 2, 'WIMPROVE', + 'The back translation for this variant can be improved by ' + 'supplying a nucleotide reference sequence.') + + return ['{}:c.{}'.format(accession_number, v) + for v in subst_to_hgvs(substitutions, offset)] + + +def subst_to_hgvs(substitutions, offset=0): + """ + Translate a set of substitutions to HGVS. + + :arg dict substitutions: Set of single nucleotide substitutions indexed by + position. + :arg int offset: Codon position in the CDS. + + :returns: Substitutions in HGVS format. + :rtype: set(Allele) + """ + variants = set() + + for position in substitutions: + for substitution in substitutions[position]: + variants.add(Allele([DNAVar( + start=position + offset + 1, + end=position + offset + 1, + sample_start=position + offset + 1, + sample_end=position + offset + 1, + type='subst', + deleted=ISeqList([ISeq(sequence=substitution[0])]), + inserted=ISeqList([ISeq(sequence=substitution[1])]) + )])) + + return variants diff --git a/mutalyzer/website/templates/back-translator.html b/mutalyzer/website/templates/back-translator.html new file mode 100644 index 00000000..d59aaf28 --- /dev/null +++ b/mutalyzer/website/templates/back-translator.html @@ -0,0 +1,82 @@ +{% extends "base.html" %} + +{% set active_page = "back-translator" %} +{% set page_title = "Back Translator" %} + +{% block content %} + +

+Please note that this is an experimental service. +

+ +

+Back translation from amino acid substitutions to nucleotide substitutions. +

+ +

+The underlying algorithm is also available as a command line utility and +programming library +(source, +package). +

+ +

+Please supply an amino acid substitution. +

+ +
+
+ + +

Examples: + NM_003002.3:p.Asp92Tyr, + NP_002993.1:p.Asp92Glu, + NP_000000.0:p.Asp92Tyr, + NP_000000.0:p.Leu92Phe +

+
+
+ + Help +
+
+ +{% if description %} +
+ {% for m in messages %} + {% if m.class == "error" %} +

{{ m.description }}

+ {% elif m.class == "warning" %} +

{{ m.description }}

+ {% elif m.class == "information" %} +

{{ m.description }}

+ {% elif m.class == "debug" %} +

{{ m.description }}

+ {% endif %} + {% endfor %} + + {% if summary == "0 Errors, 0 Warnings." %} +

{{ summary }}

+ {% else %} +

{{summary}}

+ {% endif %} + + {% if not errors %} +
+

Input

+

{{ description }}

+ +

Possible variants

+ {% for variant in variants %} + {% if variant.startswith('UNKNOWN:') %} +

{{ variant }}

+ {% else %} +

{{ variant }}

+ {% endif %} + {% endfor %} + + {% endif %} +{% endif %}{# variants #} + +{% endblock content %} diff --git a/mutalyzer/website/templates/base.html b/mutalyzer/website/templates/base.html index 6021bd17..ae6ade1a 100644 --- a/mutalyzer/website/templates/base.html +++ b/mutalyzer/website/templates/base.html @@ -38,6 +38,7 @@ ('website.snp_converter', {}, 'snp-converter', 'SNP Converter', False, False), ('website.name_generator', {}, 'name-generator', 'Name Generator', False, False), ('website.description_extractor', {}, 'description-extractor', 'Description Extractor', False, False), + ('website.back_translator', {}, 'back-translator', 'Back Translator', False, False), ('website.reference_loader', {}, 'reference-loader', 'Reference File Loader', False, True), ('website.batch_jobs', {}, 'batch-jobs', 'Batch Jobs', True, False), diff --git a/mutalyzer/website/views.py b/mutalyzer/website/views.py index 0b8ce1b1..df1fa716 100644 --- a/mutalyzer/website/views.py +++ b/mutalyzer/website/views.py @@ -23,8 +23,8 @@ import extractor import mutalyzer -from mutalyzer import (announce, File, Retriever, Scheduler, stats, - util, variantchecker) +from mutalyzer import (announce, backtranslator, File, Retriever, Scheduler, + stats, util, variantchecker) from mutalyzer.config import settings from mutalyzer.db.models import BATCH_JOB_TYPES from mutalyzer.db.models import Assembly, BatchJob @@ -682,6 +682,33 @@ def check_position(position, field): errors=errors) +@website.route('/back-translator') +def back_translator(): + """ + Back translator. + """ + output = Output(__file__) + output.addMessage( + __file__, -1, 'INFO', + 'Received Back Translate request from {}'.format(request.remote_addr)) + stats.increment_counter('back-translator/website') + + description = request.args.get('description') + + variants = [] + if description: + variants = backtranslator.backtranslate(output, description) + + errors, warnings, summary = output.Summary() + messages = map(util.message_info, output.getMessages()) + + output.addMessage(__file__, -1, 'INFO', 'Finished Back Translate request') + + return render_template( + 'back-translator.html', errors=errors, summary=summary, + description=description or '', messages=messages, variants=variants) + + @website.route('/description-extractor') def description_extractor(): """ diff --git a/requirements.txt b/requirements.txt index 32d7ba52..ecd4026e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ -e git+https://github.com/LUMC/magic-python.git#egg=Magic_file_extensions -description-extractor==2.2.1 Flask==0.10.1 Jinja2==2.7.3 MySQL-python==1.2.5 @@ -8,9 +7,11 @@ SQLAlchemy==0.9.8 Sphinx==1.2.3 Werkzeug==0.9.6 alembic==0.8.2 +backtranslate==0.0.5 biopython==1.64 chardet==2.3.0 cssselect==0.9.1 +description-extractor==2.2.1 lxml==3.4.0 mock==1.0.1 mockredispy==2.9.0.9 From 7feaf88936f49aa3316912b8505836a0a200eb1b Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Thu, 8 Oct 2015 16:06:07 +0200 Subject: [PATCH 2/2] Add tests for backtranslator module --- tests/test_backtranslator.py | 116 +++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 tests/test_backtranslator.py diff --git a/tests/test_backtranslator.py b/tests/test_backtranslator.py new file mode 100644 index 00000000..8799f018 --- /dev/null +++ b/tests/test_backtranslator.py @@ -0,0 +1,116 @@ +""" +Tests for the mutalyzer.backtranslator module. +""" + + +from __future__ import unicode_literals + +import pytest + +from mutalyzer.backtranslator import backtranslate +import mutalyzer.Retriever + +from fixtures import with_links, with_references + + +DIVERSIONS = { + 'NM_003002': 'NM_003002.2' +} + + +@pytest.fixture +def divert(monkeypatch, request): + """ + Ad-hoc fixture diverting a retriever call for NM_003002 to NM_003002.2. + + This is useful to mimic behaviour of the retriever for the accession + without version number, which we cannot do with a normal fixture (the + retriever does not support non-versioned references in the database). + """ + loadrecord = mutalyzer.Retriever.GenBankRetriever.loadrecord + def mock_loadrecord(self, identifier): + identifier = DIVERSIONS.get(identifier, identifier) + return loadrecord(self, identifier) + monkeypatch.setattr(mutalyzer.Retriever.GenBankRetriever, 'loadrecord', mock_loadrecord) + + +@with_references('NM_003002.2') +def test_nm(output): + """ + Back translate on NM. + """ + variants = backtranslate(output, 'NM_003002.2:p.Asp92Tyr') + assert sorted(variants) == ['NM_003002.2:c.274G>T'] + assert len(output.getMessagesWithErrorCode('WNOVERSION')) == 0 + assert len(output.getMessagesWithErrorCode('WNODNA')) == 0 + assert len(output.getMessagesWithErrorCode('WIMPROVE')) == 0 + + +@with_references('NM_003002.2') +@pytest.mark.usefixtures('divert') +def test_nm_no_version(output): + """ + Back translate on NM. + """ + variants = backtranslate(output, 'NM_003002:p.Asp92Tyr') + assert sorted(variants) == ['NM_003002:c.274G>T'] + assert len(output.getMessagesWithErrorCode('WNOVERSION')) == 0 + assert len(output.getMessagesWithErrorCode('WNODNA')) == 0 + assert len(output.getMessagesWithErrorCode('WIMPROVE')) == 0 + + +@with_links(('NM_003002.2', 'NP_002993.1')) +@with_references('NM_003002.2') +def test_np(output): + """ + Back translate on NP. + """ + variants = backtranslate(output, 'NP_002993.1:p.Asp92Glu') + assert sorted(variants) == ['NM_003002.2:c.276C>A', + 'NM_003002.2:c.276C>G'] + assert len(output.getMessagesWithErrorCode('WNOVERSION')) == 0 + assert len(output.getMessagesWithErrorCode('WNODNA')) == 0 + assert len(output.getMessagesWithErrorCode('WIMPROVE')) == 0 + + +@with_links(('NM_003002', 'NP_002993')) +@with_references('NM_003002.2') +@pytest.mark.usefixtures('divert') +def test_np_no_version(output): + """ + Back translate on NP. + """ + variants = backtranslate(output, 'NP_002993:p.Asp92Glu') + assert sorted(variants) == ['NM_003002:c.276C>A', + 'NM_003002:c.276C>G'] + assert len(output.getMessagesWithErrorCode('WNOVERSION')) == 1 + assert len(output.getMessagesWithErrorCode('WNODNA')) == 0 + assert len(output.getMessagesWithErrorCode('WIMPROVE')) == 0 + + +@with_links((None, 'NP_000000.0')) +def test_np_no_nm(output): + """ + Back translate on NP without NM. + """ + variants = backtranslate(output, 'NP_000000.0:p.Asp92Tyr') + assert sorted(variants) == ['UNKNOWN:c.274G>T'] + assert len(output.getMessagesWithErrorCode('WNOVERSION')) == 0 + assert len(output.getMessagesWithErrorCode('WNODNA')) == 1 + assert len(output.getMessagesWithErrorCode('WIMPROVE')) == 0 + + +@with_links((None, 'NP_000000.0')) +def test_np_no_nm_improvable(output): + """ + Back translate on NP without NM improvable. + """ + variants = backtranslate(output, 'NP_000000.0:p.Leu92Phe') + assert sorted(variants) == ['UNKNOWN:c.274C>T', + 'UNKNOWN:c.276A>C', + 'UNKNOWN:c.276A>T', + 'UNKNOWN:c.276G>C', + 'UNKNOWN:c.276G>T'] + assert len(output.getMessagesWithErrorCode('WNOVERSION')) == 0 + assert len(output.getMessagesWithErrorCode('WNODNA')) == 1 + assert len(output.getMessagesWithErrorCode('WIMPROVE')) == 1