[translate] require nodes to have sequences

This is the implicit expectation, and is true for all of our pipelines. In theory it's possible to translate without having the full sequence attached to the node by reconstructing mutations on the root, but that is not the approach taken by the current code. As we explicitly check the tree nodes against the sequences in the node-data JSON we can skip the automatic tests optionally performed when reading the node-data JSON. Closes #1345
nextstrain · Dec 11, 2023 · 4d449ae · 4d449ae
1 parent 5218124
commit 4d449ae
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 3 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -8,7 +8,7 @@
 * ancestral: Improvements to command line arguments. [#1344][] (@jameshadfield)
      * Incompatible arguments are now checked, especially related to VCF vs FASTA inputs. 
      * `--vcf-reference` and `--root-sequence` are now mutually exclusive.
-
+* translate: Tree nodes are checked against the node-data JSON input to ensure sequences are present. [#1348][] (@jameshadfield)
 * translate: Improvements to command line arguments.  [#1348][] (@jameshadfield)
     * `--tree` and `--ancestral-sequences` are now required arguments.
     * separate VCF-only arguments into their own group

diff --git a/augur/translate.py b/augur/translate.py
@@ -20,6 +20,7 @@
 from .utils import read_node_data, load_features, write_json, get_json_name
 from treetime.vcf_utils import read_vcf
 from augur.errors import AugurError
+from textwrap import dedent
 
 class MissingNodeError(Exception):
     pass
@@ -339,14 +340,22 @@ def sequences_json(node_data_json, tree):
     Extract the full nuc sequence for each node in the provided node-data JSON.
     Returns a dict, keys are node names and values are a string of the genome sequence (nuc)
     """
-    node_data = read_node_data(node_data_json, tree)
+    node_data = read_node_data(node_data_json)
     if node_data is None:
         raise AugurError("could not read node data (incl sequences)")
     # extract sequences from node meta data
     sequences = {}
     for k,v in node_data['nodes'].items():
         if 'sequence' in v:
             sequences[k] = v['sequence']
+    tree_nodes = {c.name for c in tree.find_clades()}
+    tree_nodes_missing_sequences = tree_nodes - set(sequences.keys())
+    if len(tree_nodes_missing_sequences):
+        raise AugurError(dedent(f"""\
+            {len(tree_nodes_missing_sequences)} nodes on the tree are missing nucleotide sequences in the node-data JSON.
+            These must be present under 'nodes' → <node_name> → 'sequence'.
+            This error may originate from using 'augur ancestral' with VCF input; in this case try using VCF output from that command here.
+            """))
     return sequences
 
 def register_parser(parent_subparsers):
@@ -412,7 +421,7 @@ def run(args):
         if len(features_without_variation):
             print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation)))  
     else:
-        sequences = sequences_json(args.ancestral_sequences, args.tree)
+        sequences = sequences_json(args.ancestral_sequences, tree)
         translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if feat.type != 'source'}
 
     ## glob the annotations for later auspice export