From ac57d8a6857002fc3ceecfd38b6da409b7f63082 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Tue, 16 Feb 2021 09:57:53 +0100 Subject: [PATCH 1/9] mask sites without data in output of ancestral reconstruction --- augur/ancestral.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/augur/ancestral.py b/augur/ancestral.py index 805eb0682..4e3d835ad 100644 --- a/augur/ancestral.py +++ b/augur/ancestral.py @@ -60,7 +60,7 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True, return tt -def collect_mutations_and_sequences(tt, infer_tips=False, full_sequences=False, character_map=None): +def collect_mutations_and_sequences(tt, infer_tips=False, full_sequences=False, character_map=None, mask_ambiguous=True): """iterates of the tree and produces dictionaries with mutations and sequences for each node. @@ -91,10 +91,21 @@ def collect_mutations_and_sequences(tt, infer_tips=False, full_sequences=False, data[n.name]['muts'] = [a+str(int(pos)+inc)+cm(d) for a,pos,d in n.mutations] + if full_sequences: + if mask_ambiguous: + ambiguous_count = np.zeros(tt.sequence_length(), dtype=int) + for n in tt.tree.get_terminals(): + ambiguous_count += np.array(tt.sequence(n,reconstructed=False, as_string=False)==tt.gtr.ambiguous, dtype=int) + mask = ambiguous_count==len(tt.tree.get_terminals()) + else: + mask = np.zeros(tt.sequence_length(), dtype=bool) + for n in tt.tree.find_clades(): try: - data[n.name]['sequence'] = tt.sequence(n,reconstructed=infer_tips, as_string=True) + tmp = tt.sequence(n,reconstructed=infer_tips, as_string=False) + tmp[mask] = tt.gtr.ambiguous + data[n.name]['sequence'] = "".join(tmp) except: print("No sequence available for node ",n.name) From d99c1800ccdea157ee9991c155bf222aee39459b Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Tue, 16 Feb 2021 10:06:03 +0100 Subject: [PATCH 2/9] add numpy import --- augur/ancestral.py | 1 + 1 file changed, 1 insertion(+) diff --git a/augur/ancestral.py b/augur/ancestral.py index 4e3d835ad..907cc4c98 100644 --- a/augur/ancestral.py +++ b/augur/ancestral.py @@ -3,6 +3,7 @@ """ import os, shutil, time, json, sys +import numpy as np from Bio import Phylo, SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord From 96e5daa0680c22596945a8da05a00443444be4d3 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Tue, 16 Feb 2021 10:15:06 +0100 Subject: [PATCH 3/9] fix sequence_length attribute access --- augur/ancestral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/ancestral.py b/augur/ancestral.py index 907cc4c98..cd50ba165 100644 --- a/augur/ancestral.py +++ b/augur/ancestral.py @@ -95,12 +95,12 @@ def collect_mutations_and_sequences(tt, infer_tips=False, full_sequences=False, if full_sequences: if mask_ambiguous: - ambiguous_count = np.zeros(tt.sequence_length(), dtype=int) + ambiguous_count = np.zeros(tt.sequence_length, dtype=int) for n in tt.tree.get_terminals(): ambiguous_count += np.array(tt.sequence(n,reconstructed=False, as_string=False)==tt.gtr.ambiguous, dtype=int) mask = ambiguous_count==len(tt.tree.get_terminals()) else: - mask = np.zeros(tt.sequence_length(), dtype=bool) + mask = np.zeros(tt.sequence_length, dtype=bool) for n in tt.tree.find_clades(): try: From 8046e7b3389ac8173c43ae427ac03766750bdc80 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Tue, 16 Feb 2021 22:33:40 +0100 Subject: [PATCH 4/9] also mask exported root sequence --- augur/ancestral.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/augur/ancestral.py b/augur/ancestral.py index cd50ba165..aad8fdd64 100644 --- a/augur/ancestral.py +++ b/augur/ancestral.py @@ -92,7 +92,7 @@ def collect_mutations_and_sequences(tt, infer_tips=False, full_sequences=False, data[n.name]['muts'] = [a+str(int(pos)+inc)+cm(d) for a,pos,d in n.mutations] - + mask=None if full_sequences: if mask_ambiguous: ambiguous_count = np.zeros(tt.sequence_length, dtype=int) @@ -110,7 +110,7 @@ def collect_mutations_and_sequences(tt, infer_tips=False, full_sequences=False, except: print("No sequence available for node ",n.name) - return data + return {"nodes": data, "mask": mask} def register_arguments(parser): @@ -189,14 +189,17 @@ def run(args): else: character_map[x] = x - anc_seqs['nodes'] = collect_mutations_and_sequences(tt, full_sequences=not is_vcf, - infer_tips=infer_ambiguous, character_map=character_map) + anc_seqs.update(collect_mutations_and_sequences(tt, full_sequences=not is_vcf, + infer_tips=infer_ambiguous, character_map=character_map)) # add reference sequence to json structure. This is the sequence with # respect to which mutations on the tree are defined. if is_vcf: anc_seqs['reference'] = {"nuc":compress_seq['reference']} else: - anc_seqs['reference'] = {"nuc":"".join(T.root.sequence) if hasattr(T.root, 'sequence') else ''} + root_seq = tt.sequence(T.root) + if anc_seqs['mask']: + root_seq[anc_seqs['mask']] = tt.gtr.ambiguous + anc_seqs['reference'] = {"nuc": ''.join(root_seq)} out_name = get_json_name(args, '.'.join(args.alignment.split('.')[:-1]) + '_mutations.json') write_json(anc_seqs, out_name) From cbf17e6b62d52bd5235e86f97568fa021f175d64 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Tue, 16 Feb 2021 22:44:49 +0100 Subject: [PATCH 5/9] fix: check for mask as not None --- augur/ancestral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/ancestral.py b/augur/ancestral.py index aad8fdd64..4ee13db73 100644 --- a/augur/ancestral.py +++ b/augur/ancestral.py @@ -197,7 +197,7 @@ def run(args): anc_seqs['reference'] = {"nuc":compress_seq['reference']} else: root_seq = tt.sequence(T.root) - if anc_seqs['mask']: + if ('mask' in anc_seqs) and (anc_seqs['mask'] is not None): root_seq[anc_seqs['mask']] = tt.gtr.ambiguous anc_seqs['reference'] = {"nuc": ''.join(root_seq)} From 20da544d28bb160f5db631e1065ff810757e88fb Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Tue, 16 Feb 2021 22:53:31 +0100 Subject: [PATCH 6/9] fix indexing of root sequence --- augur/ancestral.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/augur/ancestral.py b/augur/ancestral.py index 4ee13db73..3b8951d87 100644 --- a/augur/ancestral.py +++ b/augur/ancestral.py @@ -196,11 +196,15 @@ def run(args): if is_vcf: anc_seqs['reference'] = {"nuc":compress_seq['reference']} else: - root_seq = tt.sequence(T.root) + root_seq = tt.sequence(T.root, as_string=False) if ('mask' in anc_seqs) and (anc_seqs['mask'] is not None): + print("HERE", root_seq, anc_seqs['mask']) root_seq[anc_seqs['mask']] = tt.gtr.ambiguous anc_seqs['reference'] = {"nuc": ''.join(root_seq)} + if ('mask' in anc_seqs) and (anc_seqs['mask'] is not None): + anc_seqs["mask"] = "".join(['1' if x else '0' for x in anc_seqs["mask"]]) + out_name = get_json_name(args, '.'.join(args.alignment.split('.')[:-1]) + '_mutations.json') write_json(anc_seqs, out_name) print("ancestral mutations written to", out_name, file=sys.stdout) From cc07e629b5c32658921ae314881532b883411417 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Tue, 16 Feb 2021 22:59:33 +0100 Subject: [PATCH 7/9] chore: remove logging statement --- augur/ancestral.py | 1 - 1 file changed, 1 deletion(-) diff --git a/augur/ancestral.py b/augur/ancestral.py index 3b8951d87..591b36b33 100644 --- a/augur/ancestral.py +++ b/augur/ancestral.py @@ -198,7 +198,6 @@ def run(args): else: root_seq = tt.sequence(T.root, as_string=False) if ('mask' in anc_seqs) and (anc_seqs['mask'] is not None): - print("HERE", root_seq, anc_seqs['mask']) root_seq[anc_seqs['mask']] = tt.gtr.ambiguous anc_seqs['reference'] = {"nuc": ''.join(root_seq)} From 0071cc38def4c825fed7f789a5ed65bdd73c4255 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Thu, 18 Feb 2021 15:58:13 -0800 Subject: [PATCH 8/9] Fix functional test by updating expected Zika nt muts output Adds the new "mask" attribute to the ancestral sequences JSON for the Zika build. --- tests/builds/zika/results/nt_muts.json | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/builds/zika/results/nt_muts.json b/tests/builds/zika/results/nt_muts.json index 5f29a5fd9..7627ae707 100644 --- a/tests/builds/zika/results/nt_muts.json +++ b/tests/builds/zika/results/nt_muts.json @@ -3,6 +3,7 @@ "program": "augur", "version": "7.0.2" }, + "mask": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", "nodes": { "BRA/2016/FC_6706": { "muts": [ From eb41026ed65f8242c42d97dac820c55113ff0b68 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Thu, 18 Feb 2021 15:58:48 -0800 Subject: [PATCH 9/9] Comment masking of ambiguous sites in ancestral inference Adds a comment to describe what's happening and why in the new masking feature. Also, makes minor stylistic changes for readability. --- augur/ancestral.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/augur/ancestral.py b/augur/ancestral.py index 591b36b33..03260e19f 100644 --- a/augur/ancestral.py +++ b/augur/ancestral.py @@ -95,6 +95,10 @@ def collect_mutations_and_sequences(tt, infer_tips=False, full_sequences=False, mask=None if full_sequences: if mask_ambiguous: + # Identify sites for which every terminal sequence is ambiguous. + # These sites will be masked to prevent rounding errors in the + # maximum likelihood inference from assigning an arbitrary + # nucleotide to sites at internal nodes. ambiguous_count = np.zeros(tt.sequence_length, dtype=int) for n in tt.tree.get_terminals(): ambiguous_count += np.array(tt.sequence(n,reconstructed=False, as_string=False)==tt.gtr.ambiguous, dtype=int) @@ -197,11 +201,11 @@ def run(args): anc_seqs['reference'] = {"nuc":compress_seq['reference']} else: root_seq = tt.sequence(T.root, as_string=False) - if ('mask' in anc_seqs) and (anc_seqs['mask'] is not None): + if anc_seqs.get("mask") is not None: root_seq[anc_seqs['mask']] = tt.gtr.ambiguous anc_seqs['reference'] = {"nuc": ''.join(root_seq)} - if ('mask' in anc_seqs) and (anc_seqs['mask'] is not None): + if anc_seqs.get("mask") is not None: anc_seqs["mask"] = "".join(['1' if x else '0' for x in anc_seqs["mask"]]) out_name = get_json_name(args, '.'.join(args.alignment.split('.')[:-1]) + '_mutations.json')