From c942e0061f85372fe5b3ecd0f07fe383b08dd989 Mon Sep 17 00:00:00 2001
From: "Jeroen F.J. Laros" <jlaros@fixedpoint.nl>
Date: Sun, 27 Sep 2015 21:38:37 +0200
Subject: [PATCH 1/2] Add back translator interface

---
 mutalyzer/backtranslator.py                   | 159 ++++++++++++++++++
 .../website/templates/back-translator.html    |  82 +++++++++
 mutalyzer/website/templates/base.html         |   1 +
 mutalyzer/website/views.py                    |  31 +++-
 requirements.txt                              |   3 +-
 5 files changed, 273 insertions(+), 3 deletions(-)
 create mode 100644 mutalyzer/backtranslator.py
 create mode 100644 mutalyzer/website/templates/back-translator.html

diff --git a/mutalyzer/backtranslator.py b/mutalyzer/backtranslator.py
new file mode 100644
index 00000000..1b84a9cd
--- /dev/null
+++ b/mutalyzer/backtranslator.py
@@ -0,0 +1,159 @@
+"""
+Mutalyzer interface for variant back translation (p. to c.).
+"""
+
+
+from __future__ import unicode_literals
+
+from backtranslate.backtranslate import BackTranslate
+from backtranslate.util import protein_letters_3to1
+from Bio.Data import CodonTable
+from extractor.variant import Allele, DNAVar, ISeq, ISeqList
+
+from . import ncbi, Retriever
+from .grammar import Grammar
+from .output import Output
+
+
+def backtranslate(output, description, table_id=1):
+    """
+    Back translation of an amino acid substitution to all possible
+    causal one-nucleotide substitutions.
+
+    :arg object output: Output object for logging.
+    :arg unicode description: Amino acid substitution in HGVS format.
+    :arg int table_id: Translation table id.
+
+    :returns: List of DNA substitutions in HGVS format.
+    :rtype: list(unicode)
+    """
+    # TODO: We currently only support NM and NP references, but in principle
+    #   we could also support other references.
+    # TODO: For NP references where we don't find a link to the corresponding
+    #   NM, we don't check if the specified reference amino acid corresponds
+    #   to the NP reference sequence.
+    parse_tree = Grammar(output).parse(description)
+
+    if not parse_tree:
+        return []
+    if parse_tree.RefType != 'p':
+        output.addMessage(
+            __file__, 3, 'ENOPROT', 'Reference type is not p. (protein).')
+        return []
+    if parse_tree.RawVar.MutationType != 'subst':
+        output.addMessage(
+            __file__, 3, 'ENOSUBST', 'Variant is not a substitution.')
+        return []
+
+    if parse_tree.Version:
+        accession_number = '{}.{}'.format(parse_tree.RefSeqAcc, parse_tree.Version)
+    else:
+        accession_number = parse_tree.RefSeqAcc
+    position = int(parse_tree.RawVar.Main)
+    reference_amino_acid, amino_acid = parse_tree.RawVar.Args
+
+    if len(reference_amino_acid) == 3:
+        reference_amino_acid = protein_letters_3to1[reference_amino_acid]
+    if len(amino_acid) == 3:
+        amino_acid = protein_letters_3to1[amino_acid]
+
+    bt = BackTranslate(table_id)
+
+    # FIXME: Rancid workaround to silence fatal error raised by `loadrecord`.
+    output_ = Output('')
+    retriever = Retriever.GenBankRetriever(output_)
+
+    # The genbank retriever does not (yet) support protein references, but we
+    # cannot reliably distinguish between different reference types from the
+    # variant description before downloading the reference.
+    # Therefore, we just try to download the reference. This will succeed for
+    # transcript references, but fail for protein references.
+    # As a quick and dirty optimization, we shortcut this for accessions
+    # starting with 'NP_', of which we know that they are protein references.
+    # In the future we hope to support protein references directly.
+    if accession_number.startswith('NP_'):
+        genbank_record = None
+    else:
+        genbank_record = retriever.loadrecord(accession_number)
+
+    if not genbank_record:
+        # Assuming RefSeqAcc is an NP, try to get the corresponding NM.
+        version = int(parse_tree.Version) if parse_tree.Version else None
+
+        try:
+            transcript = ncbi.protein_to_transcript(parse_tree.RefSeqAcc,
+                                                    version)
+        except ncbi.NoLinkError:
+            pass
+        else:
+            if transcript[1] is not None:
+                accession_number = '{}.{}'.format(*transcript)
+            else:
+                output.addMessage(
+                    __file__, 2, 'WNOVERSION',
+                    'Found corresponding nucleotide sequence, but note that '
+                    'the version number is missing.')
+                accession_number = transcript[0]
+            genbank_record = retriever.loadrecord(accession_number)
+
+    offset = (position - 1) * 3
+    if genbank_record and genbank_record.molType == 'n':
+        # Only NM for now.
+        cds_loc = genbank_record.geneList[0].transcriptList[0].CDS.location
+        codon = genbank_record \
+            .seq[cds_loc[0] - 1:cds_loc[1]][offset:offset + 3]
+
+        forward_table = CodonTable.unambiguous_dna_by_id[table_id]
+        found_ref = forward_table.forward_table[unicode(codon)]
+        if reference_amino_acid != found_ref:
+            output.addMessage(
+                __file__, 3, 'EREF',
+                '{} not found at position {}, found {} instead.'.format(
+                    reference_amino_acid, position, found_ref))
+
+        substitutions = bt.with_dna(unicode(codon), amino_acid)
+    else:
+        # Assume NP.
+        output.addMessage(
+            __file__, 2, 'WNODNA',
+            'Nucleotide reference sequence could not be found, using '
+            'protein fallback method.')
+        accession_number = 'UNKNOWN'
+
+        substitutions = bt.without_dna(reference_amino_acid, amino_acid)
+        if (reference_amino_acid, amino_acid) in bt.improvable():
+            output.addMessage(
+                __file__, 2, 'WIMPROVE',
+                'The back translation for this variant can be improved by '
+                'supplying a nucleotide reference sequence.')
+
+    return ['{}:c.{}'.format(accession_number, v)
+            for v in subst_to_hgvs(substitutions, offset)]
+
+
+def subst_to_hgvs(substitutions, offset=0):
+    """
+    Translate a set of substitutions to HGVS.
+
+    :arg dict substitutions: Set of single nucleotide substitutions indexed by
+      position.
+    :arg int offset: Codon position in the CDS.
+
+    :returns: Substitutions in HGVS format.
+    :rtype: set(Allele)
+    """
+    variants = set()
+
+    for position in substitutions:
+        for substitution in substitutions[position]:
+            variants.add(Allele([DNAVar(
+                start=position + offset + 1,
+                end=position + offset + 1,
+                sample_start=position + offset + 1,
+                sample_end=position + offset + 1,
+                type='subst',
+                deleted=ISeqList([ISeq(sequence=substitution[0])]),
+                inserted=ISeqList([ISeq(sequence=substitution[1])])
+            )]))
+
+    return variants
diff --git a/mutalyzer/website/templates/back-translator.html b/mutalyzer/website/templates/back-translator.html
new file mode 100644
index 00000000..d59aaf28
--- /dev/null
+++ b/mutalyzer/website/templates/back-translator.html
@@ -0,0 +1,82 @@
+{% extends "base.html" %}
+
+{% set active_page = "back-translator" %}
+{% set page_title = "Back Translator" %}
+
+{% block content %}
+
+<p class="alert alert-warning">
+Please note that this is an experimental service.
+</p>
+
+<p>
+Back translation from amino acid substitutions to nucleotide substitutions.
+</p>
+
+<p>
+The underlying algorithm is also available as a command line utility and
+programming library
+(<a href="https://github.com/mutalyzer/backtranslate">source</a>,
+<a href="https://pypi.python.org/pypi/backtranslate">package</a>).
+</p>
+
+<p>
+Please supply an amino acid substitution.
+</p>
+
+<form class="form" action="{{ url_for('.back_translator') }}" method="get">
+  <div class="form-group">
+    <label for="description">Variant description</label>
+    <input class="form-control form-pre" type="text"
+           name="description" id="description" value="{{ description }}" placeholder="Variant description using HGVS format">
+    <p>Examples:
+      <code class="example-input" data-for="description">NM_003002.3:p.Asp92Tyr</code>,
+      <code class="example-input" data-for="description">NP_002993.1:p.Asp92Glu</code>,
+      <code class="example-input" data-for="description">NP_000000.0:p.Asp92Tyr</code>,
+      <code class="example-input" data-for="description">NP_000000.0:p.Leu92Phe</code>
+    </p>
+  </div>
+  <div class="form-group button-group">
+    <input type="submit" class="btn btn-primary" value="Back translate">
+    <a href="https://humgenprojects.lumc.nl/trac/mutalyzer/wiki/BackTranslator" target="new" class="btn btn-default pull-right">Help</a>
+  </div>
+</form>
+
+{% if description %}
+  <hr>
+  {% for m in messages %}
+    {% if m.class == "error" %}
+      <p class="alert alert-danger" title="{{ m.level }} (origin: {{ m.origin }})">{{ m.description }}</p>
+    {% elif m.class == "warning" %}
+      <p class="alert alert-warning" title="{{ m.level }} (origin: {{ m.origin }})">{{ m.description }}</p>
+    {% elif m.class == "information" %}
+      <p class="alert alert-info" title="{{ m.level }} (origin: {{ m.origin }})">{{ m.description }}</p>
+    {% elif m.class == "debug" %}
+      <p class="alert alert-info" title="{{ m.level }} (origin: {{ m.origin }})">{{ m.description }}</p>
+    {% endif %}
+  {% endfor %}
+
+  {% if summary == "0 Errors, 0 Warnings." %}
+    <p class="alert alert-success summary">{{ summary }}</p>
+  {% else %}
+    <p>{{summary}}</p>
+  {% endif %}
+
+  {% if not errors %}
+    <hr>
+    <h4>Input</h4>
+    <p><code>{{ description }}</code></p>
+
+    <h4>Possible variants</h4>
+    {% for variant in variants %}
+      {% if variant.startswith('UNKNOWN:') %}
+        <p><code>{{ variant }}</code></p>
+      {% else %}
+        <p><code><a href="{{ url_for('.name_checker', description=variant) }}">{{ variant }}</a></code></p>
+      {% endif %}
+    {% endfor %}
+
+  {% endif %}
+{% endif %}{# variants #}
+
+{% endblock content %}
diff --git a/mutalyzer/website/templates/base.html b/mutalyzer/website/templates/base.html
index 6021bd17..ae6ade1a 100644
--- a/mutalyzer/website/templates/base.html
+++ b/mutalyzer/website/templates/base.html
@@ -38,6 +38,7 @@
         ('website.snp_converter', {}, 'snp-converter', 'SNP Converter', False, False),
         ('website.name_generator', {}, 'name-generator', 'Name Generator', False, False),
         ('website.description_extractor', {}, 'description-extractor', 'Description Extractor', False, False),
+        ('website.back_translator', {}, 'back-translator', 'Back Translator', False, False),
         ('website.reference_loader', {}, 'reference-loader', 'Reference File Loader', False, True),
 
         ('website.batch_jobs', {}, 'batch-jobs', 'Batch Jobs', True, False),
diff --git a/mutalyzer/website/views.py b/mutalyzer/website/views.py
index 0b8ce1b1..df1fa716 100644
--- a/mutalyzer/website/views.py
+++ b/mutalyzer/website/views.py
@@ -23,8 +23,8 @@
 import extractor
 
 import mutalyzer
-from mutalyzer import (announce, File, Retriever, Scheduler, stats,
-                       util, variantchecker)
+from mutalyzer import (announce, backtranslator, File, Retriever, Scheduler,
+                       stats, util, variantchecker)
 from mutalyzer.config import settings
 from mutalyzer.db.models import BATCH_JOB_TYPES
 from mutalyzer.db.models import Assembly, BatchJob
@@ -682,6 +682,33 @@ def check_position(position, field):
                            errors=errors)
 
 
+@website.route('/back-translator')
+def back_translator():
+    """
+    Back translator.
+    """
+    output = Output(__file__)
+    output.addMessage(
+        __file__, -1, 'INFO',
+        'Received Back Translate request from {}'.format(request.remote_addr))
+    stats.increment_counter('back-translator/website')
+
+    description = request.args.get('description')
+
+    variants = []
+    if description:
+        variants = backtranslator.backtranslate(output, description)
+
+    errors, warnings, summary = output.Summary()
+    messages = map(util.message_info, output.getMessages())
+
+    output.addMessage(__file__, -1, 'INFO', 'Finished Back Translate request')
+
+    return render_template(
+        'back-translator.html', errors=errors, summary=summary,
+        description=description or '', messages=messages, variants=variants)
+
+
 @website.route('/description-extractor')
 def description_extractor():
     """
diff --git a/requirements.txt b/requirements.txt
index 32d7ba52..ecd4026e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,4 @@
 -e git+https://github.com/LUMC/magic-python.git#egg=Magic_file_extensions
-description-extractor==2.2.1
 Flask==0.10.1
 Jinja2==2.7.3
 MySQL-python==1.2.5
@@ -8,9 +7,11 @@ SQLAlchemy==0.9.8
 Sphinx==1.2.3
 Werkzeug==0.9.6
 alembic==0.8.2
+backtranslate==0.0.5
 biopython==1.64
 chardet==2.3.0
 cssselect==0.9.1
+description-extractor==2.2.1
 lxml==3.4.0
 mock==1.0.1
 mockredispy==2.9.0.9

From 7feaf88936f49aa3316912b8505836a0a200eb1b Mon Sep 17 00:00:00 2001
From: Martijn Vermaat <martijn@vermaat.name>
Date: Thu, 8 Oct 2015 16:06:07 +0200
Subject: [PATCH 2/2] Add tests for backtranslator module

---
 tests/test_backtranslator.py | 116 +++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 tests/test_backtranslator.py

diff --git a/tests/test_backtranslator.py b/tests/test_backtranslator.py
new file mode 100644
index 00000000..8799f018
--- /dev/null
+++ b/tests/test_backtranslator.py
@@ -0,0 +1,116 @@
+"""
+Tests for the mutalyzer.backtranslator module.
+"""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+from mutalyzer.backtranslator import backtranslate
+import mutalyzer.Retriever
+
+from fixtures import with_links, with_references
+
+
+DIVERSIONS = {
+    'NM_003002': 'NM_003002.2'
+}
+
+
+@pytest.fixture
+def divert(monkeypatch, request):
+    """
+    Ad-hoc fixture diverting a retriever call for NM_003002 to NM_003002.2.
+
+    This is useful to mimic behaviour of the retriever for the accession
+    without version number, which we cannot do with a normal fixture (the
+    retriever does not support non-versioned references in the database).
+    """
+    loadrecord = mutalyzer.Retriever.GenBankRetriever.loadrecord
+    def mock_loadrecord(self, identifier):
+        identifier = DIVERSIONS.get(identifier, identifier)
+        return loadrecord(self, identifier)
+    monkeypatch.setattr(mutalyzer.Retriever.GenBankRetriever, 'loadrecord', mock_loadrecord)
+
+
+@with_references('NM_003002.2')
+def test_nm(output):
+    """
+    Back translate on NM.
+    """
+    variants = backtranslate(output, 'NM_003002.2:p.Asp92Tyr')
+    assert sorted(variants) == ['NM_003002.2:c.274G>T']
+    assert len(output.getMessagesWithErrorCode('WNOVERSION')) == 0
+    assert len(output.getMessagesWithErrorCode('WNODNA')) == 0
+    assert len(output.getMessagesWithErrorCode('WIMPROVE')) == 0
+
+
+@with_references('NM_003002.2')
+@pytest.mark.usefixtures('divert')
+def test_nm_no_version(output):
+    """
+    Back translate on NM.
+    """
+    variants = backtranslate(output, 'NM_003002:p.Asp92Tyr')
+    assert sorted(variants) == ['NM_003002:c.274G>T']
+    assert len(output.getMessagesWithErrorCode('WNOVERSION')) == 0
+    assert len(output.getMessagesWithErrorCode('WNODNA')) == 0
+    assert len(output.getMessagesWithErrorCode('WIMPROVE')) == 0
+
+
+@with_links(('NM_003002.2', 'NP_002993.1'))
+@with_references('NM_003002.2')
+def test_np(output):
+    """
+    Back translate on NP.
+    """
+    variants = backtranslate(output, 'NP_002993.1:p.Asp92Glu')
+    assert sorted(variants) == ['NM_003002.2:c.276C>A',
+                                'NM_003002.2:c.276C>G']
+    assert len(output.getMessagesWithErrorCode('WNOVERSION')) == 0
+    assert len(output.getMessagesWithErrorCode('WNODNA')) == 0
+    assert len(output.getMessagesWithErrorCode('WIMPROVE')) == 0
+
+
+@with_links(('NM_003002', 'NP_002993'))
+@with_references('NM_003002.2')
+@pytest.mark.usefixtures('divert')
+def test_np_no_version(output):
+    """
+    Back translate on NP.
+    """
+    variants = backtranslate(output, 'NP_002993:p.Asp92Glu')
+    assert sorted(variants) == ['NM_003002:c.276C>A',
+                                'NM_003002:c.276C>G']
+    assert len(output.getMessagesWithErrorCode('WNOVERSION')) == 1
+    assert len(output.getMessagesWithErrorCode('WNODNA')) == 0
+    assert len(output.getMessagesWithErrorCode('WIMPROVE')) == 0
+
+
+@with_links((None, 'NP_000000.0'))
+def test_np_no_nm(output):
+    """
+    Back translate on NP without NM.
+    """
+    variants = backtranslate(output, 'NP_000000.0:p.Asp92Tyr')
+    assert sorted(variants) == ['UNKNOWN:c.274G>T']
+    assert len(output.getMessagesWithErrorCode('WNOVERSION')) == 0
+    assert len(output.getMessagesWithErrorCode('WNODNA')) == 1
+    assert len(output.getMessagesWithErrorCode('WIMPROVE')) == 0
+
+
+@with_links((None, 'NP_000000.0'))
+def test_np_no_nm_improvable(output):
+    """
+    Back translate on NP without NM improvable.
+    """
+    variants = backtranslate(output, 'NP_000000.0:p.Leu92Phe')
+    assert sorted(variants) == ['UNKNOWN:c.274C>T',
+                                'UNKNOWN:c.276A>C',
+                                'UNKNOWN:c.276A>T',
+                                'UNKNOWN:c.276G>C',
+                                'UNKNOWN:c.276G>T']
+    assert len(output.getMessagesWithErrorCode('WNOVERSION')) == 0
+    assert len(output.getMessagesWithErrorCode('WNODNA')) == 1
+    assert len(output.getMessagesWithErrorCode('WIMPROVE')) == 1