diff --git a/CHANGES.md b/CHANGES.md index 1cfe73a27..54b14f8d2 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -9,6 +9,7 @@ * Incompatible arguments are now checked, especially related to VCF vs FASTA inputs. * `--vcf-reference` and `--root-sequence` are now mutually exclusive. * translate: Tree nodes are checked against the node-data JSON input to ensure sequences are present. [#1348][] (@jameshadfield) +* translate: The 'source' ID for GFF files is now ignored as a potential gene feature. [#1348][] (@jameshadfield) * translate: Improvements to command line arguments. [#1348][] (@jameshadfield) * `--tree` and `--ancestral-sequences` are now required arguments. * separate VCF-only arguments into their own group diff --git a/augur/translate.py b/augur/translate.py index 1fc73040f..8c7253c39 100644 --- a/augur/translate.py +++ b/augur/translate.py @@ -129,7 +129,7 @@ def translate_feature(aln, feature): return translations -def translate_vcf_feature(sequences, ref, feature): +def translate_vcf_feature(sequences, ref, feature, feature_name): '''Translates a subsequence of input nucleotide sequences. Parameters @@ -168,7 +168,7 @@ def str_reverse_comp(str_seq): # Need to get ref translation to store. check if multiple of 3 for sanity. # will be padded in safe_translate if not if len(refNuc)%3: - print("Gene length of {} is not a multiple of 3. will pad with N".format(feature.qualifiers['Name'][0]), file=sys.stderr) + print(f"Gene length of {feature_name!r} is not a multiple of 3. will pad with N", file=sys.stderr) ref_aa_seq = safe_translate(refNuc) prot['reference'] = ref_aa_seq @@ -409,13 +409,16 @@ def run(args): print("Read in {} features from reference sequence file".format(len(features))) ## Read in sequences & for each sequence translate each feature _except for_ the source (nuc) feature + ## Note that `load_features` _only_ extracts {'gene', 'source'} for GFF files, {'CDS', 'source'} for GenBank. translations = {} if is_vcf: (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences) features_without_variation = [] for fname, feat in features.items(): + if feat.type=='source': + continue try: - translations[fname] = translate_vcf_feature(sequences, ref, feat) + translations[fname] = translate_vcf_feature(sequences, ref, feat, fname) except NoVariationError: features_without_variation.append(fname) if len(features_without_variation): diff --git a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t index f0d25bee4..1de2856ee 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t +++ b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t @@ -15,7 +15,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store > --output-node-data aa_muts.json \ > --alignment-output translations.vcf \ > --vcf-reference-output translations_reference.fasta - Gene length of rrs_Rvnr01 is not a multiple of 3. will pad with N + Gene length of 'rrs' is not a multiple of 3. will pad with N Read in 187 specified genes to translate. Read in 187 features from reference sequence file 162 genes had no mutations and so have been be excluded.