Skip to content

Commit

Permalink
[clade-I] hardcode clade assignment
Browse files Browse the repository at this point in the history
This approach is necessary because the mutations assigned to the top
branches in the tree are random. I'd consider this approach temporary
and we should revisit it in the near future.
  • Loading branch information
jameshadfield committed Sep 27, 2024
1 parent 412e4ab commit cd29467
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 0 deletions.
7 changes: 7 additions & 0 deletions phylogenetic/defaults/clade-i/include.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Ensure we include two samples from each of clades Ia and Ib so we can use them to check clade assignment
# Clade Ia
PP601197
KJ642618
# Clade Ib
PP601222
PP601209
15 changes: 15 additions & 0 deletions phylogenetic/rules/annotate_phylogeny.smk
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ rule rename_clades:
build_dir + "/{build_name}/clades_raw.json",
output:
node_data=build_dir + "/{build_name}/clades.json",
wildcard_constraints:
build_name="^(?!clade-i)$",
shell:
"""
python scripts/clades_renaming.py \
Expand All @@ -124,6 +126,19 @@ rule rename_clades:
"""


# Produce the clades node-data JSON for the "clade-i" build by running
# scripts/assign-clade-I-clades.py on that build's tree. Both paths are
# hard-coded to the "clade-i" directory rather than using a {build_name}
# wildcard, so this rule only ever produces the clade-i clades.json.
rule clades_for_clade_I:
input:
# Newick tree of the clade-i build; fed to the script on STDIN below.
tree=build_dir + "/clade-i/tree.nwk",
output:
# Node-data JSON; captured from the script's STDOUT below.
node_data=build_dir + "/clade-i/clades.json",
shell:
"""
python scripts/assign-clade-I-clades.py \
< {input.tree} \
> {output.node_data}
"""


rule mutation_context:
input:
tree=build_dir + "/{build_name}/tree.nwk",
Expand Down
45 changes: 45 additions & 0 deletions phylogenetic/scripts/assign-clade-I-clades.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""
Labels the two child nodes of the root as clade Ia and Ib
based on an expected tree structure. This approach is temporary and is
necessary because the distribution of mutations at these two nodes
(via augur ancestral) is random and thus we can't use our normal
`augur clades` approach.
This script expects certain tips to be present for each clade
which are force-included in the analysis.
Usage: provide the tree on STDIN, node-data JSON written to STDOUT
"""

import argparse
from sys import stdin,stdout
from Bio import Phylo
from collections import defaultdict
import json

# Tips that anchor each clade: a child of the root is labelled with a clade
# when it contains every tip listed for that clade. These accessions are
# force-included in the analysis, so they are expected to be in the tree.
TIPS = {
    "clade Ia": ["PP601197", "KJ642618"],
    "clade Ib": ["PP601222", "PP601209"],
}


def assign_clades(tree, defining_tips_by_clade=None):
    """Build node-data labelling the root's children that match a clade.

    Only the direct children of the root are considered. A child matches a
    clade when all of that clade's defining tips are among its terminals; the
    child's branch then gets the clade label and the child plus all of its
    descendants get the clade membership.

    Args:
        tree: Parsed tree whose ``clade`` attribute is the root. Iterating the
            root must yield its direct children, each providing ``name``,
            ``get_terminals()`` and ``find_clades()``.
        defining_tips_by_clade: Mapping of clade name to the tip names that
            must all descend from one child of the root. Defaults to ``TIPS``.

    Returns:
        Node-data dict with ``nodes`` (node name -> ``clade_membership``) and
        ``branches`` (node name -> ``labels``) entries.
    """
    if defining_tips_by_clade is None:
        defining_tips_by_clade = TIPS

    node_data = {  # node-data JSON
        "nodes": defaultdict(dict),
        "branches": defaultdict(dict),
    }

    for node in tree.clade:
        tip_names = {tip.name for tip in node.get_terminals()}
        for clade_name, defining_tips in defining_tips_by_clade.items():
            if not tip_names.issuperset(defining_tips):
                continue
            node_data["branches"][node.name]["labels"] = {"clade": clade_name}
            # Tag the labelled node itself, then everything found beneath it.
            node_data["nodes"][node.name]["clade_membership"] = clade_name
            for descendant in node.find_clades():
                node_data["nodes"][descendant.name]["clade_membership"] = clade_name

    return node_data


def find_unassigned_clades(node_data, defining_tips_by_clade=None):
    """Return the clade names that never became a branch label.

    Args:
        node_data: Dict as returned by ``assign_clades``.
        defining_tips_by_clade: Mapping whose keys are the expected clade
            names. Defaults to ``TIPS``.

    Returns:
        List of expected clade names absent from the branch labels, in the
        order they appear in the mapping.
    """
    if defining_tips_by_clade is None:
        defining_tips_by_clade = TIPS
    labelled = {
        branch["labels"]["clade"] for branch in node_data["branches"].values()
    }
    return [name for name in defining_tips_by_clade if name not in labelled]


if __name__ == "__main__":
    from sys import stderr  # only needed for the warnings below

    # No options are defined; the parser exists to provide --help and to
    # reject unexpected arguments.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.parse_args()

    node_data = assign_clades(Phylo.read(stdin, "newick"))

    # Warn rather than fail: the output is still written, but a clade that
    # could not be placed would otherwise go unnoticed.
    for clade_name in find_unassigned_clades(node_data):
        print(
            f"WARNING: no child of the root contains all defining tips for "
            f"{clade_name!r}; this clade was not assigned.",
            file=stderr,
        )

    json.dump(node_data, stdout)

0 comments on commit cd29467

Please sign in to comment.