nextstrain · jameshadfield · Dec 11, 2024 · Dec 11, 2024
diff --git a/Snakefile b/Snakefile
@@ -507,8 +507,11 @@ def export_node_data_files(wildcards):
         rules.cleavage_site.output.cleavage_site_sequences,
     ]
 
-    if wildcards.subtype=="h5n1-cattle-outbreak" and wildcards.segment!='genome':
-        nd.append(rules.prune_tree.output.node_data)
+    if wildcards.subtype=="h5n1-cattle-outbreak":
+        if wildcards.segment!='genome':
+            nd.append(rules.prune_tree.output.node_data)
+        else:
+            nd.append(rules.join_segments.output.node_data)
     return nd
 
 

diff --git a/config/h5n1-cattle-outbreak/auspice_config_h5n1-cattle-outbreak.json b/config/h5n1-cattle-outbreak/auspice_config_h5n1-cattle-outbreak.json
@@ -54,6 +54,16 @@
       "title": "Date",
       "type": "continuous"
     },
+    {
+      "key": "ATGC_perc",
+      "title": "genome ATGC%",
+      "type": "continuous"
+    },
+    {
+      "key": "num_segments",
+      "title": "Num segments sequenced",
+      "type": "ordinal"
+    },
     {
       "key": "region",
       "title": "Region",

diff --git a/rules/cattle-flu.smk b/rules/cattle-flu.smk
@@ -64,9 +64,11 @@ rule join_segments:
     # allow snakemake to choose the correct rule to run. Note that `wildcards.segment="genome"`
     # here, and for that we need alignments for 8 individual segments, which we refer to as `wildcards.genome_seg`
     input:
-        alignment = expand("results/{{subtype}}/{{segment}}/{{time}}/aligned_{genome_seg}.fasta", genome_seg=SEGMENTS) 
+        alignment = expand("results/{{subtype}}/{{segment}}/{{time}}/aligned_{genome_seg}.fasta", genome_seg=SEGMENTS),
+        metadata = metadata_by_wildcards,
     output:
-        alignment = "results/{subtype}/{segment}/{time}/aligned.fasta"
+        alignment = "results/{subtype}/{segment}/{time}/aligned.fasta",
+        node_data = "results/{subtype}/{segment}/{time}/aligned.json",
     wildcard_constraints:
         subtype = 'h5n1-cattle-outbreak',
         segment = 'genome',
@@ -75,7 +77,9 @@ rule join_segments:
         """
         python scripts/join-segments.py \
             --segments {input.alignment} \
-            --output {output.alignment}
+            --output {output.alignment} \
+            --output-node-data {output.node_data} \
+            --force-include $( cat {input.metadata} | csvtk filter2 -t -f '$host=="Human"' | csvtk cut -t -f strain | tail -n +2 | tr "\n" " " )
         """
 
 rule genome_metadata:

diff --git a/scripts/join-segments.py b/scripts/join-segments.py
@@ -2,25 +2,69 @@
 from Bio import SeqIO
 from collections import defaultdict
 import argparse
+import json
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--segments', type = str, required = True, nargs='+', help = "per-segment alignments")
+    parser.add_argument('--force-include', type=str, nargs="+", required=False,
+                        help="Force include these strains regardless of how many segments have been sequenced")
     parser.add_argument('--output', type = str, required = True, help = "output whole genome alignment")
+    parser.add_argument('--output-node-data', type = str, required = False, help = "output metadata in node-data JSON format")
     args = parser.parse_args()
 
     records = {}
-    strain_counts = defaultdict(int)
+    strain_counts: dict[str,int] = defaultdict(int)
+    segment_lengths = defaultdict(int)
+    atgc = set(['A','T','G','C'])
+
+    assert len(args.segments)==8, "Expected 8 segments!"
+
     for fname in args.segments:
-        records[fname] = {record.name:str(record.seq) for record in SeqIO.parse(fname, 'fasta')}
-        for key in records[fname]: strain_counts[key]+=1
-        print(f"{fname}: parsed {len(records[fname].keys())} sequences")
+        records[fname] = {record.name:str(record.seq).upper() for record in SeqIO.parse(fname, 'fasta')}
+        for key in records[fname]:
+            strain_counts[key]+=1
+        assert len({len(seq) for seq in records[fname].values()})==1, f"Different sequence lengths observed in {fname}"
+        segment_lengths[fname] = len(next(iter(records[fname].values())))
+        print(f"{fname}: parsed {len(records[fname].keys())} sequences, each {segment_lengths[fname]} nt")
+
+    ## how many strains are missing segments?
+    num_segments: dict[str, list[str]] = {str(i): [] for i in range(1,8+1)}
+    for strain,n in strain_counts.items():
+        num_segments[str(n)].append(strain)
+
+
+    def sequence(segment, name):
+        if seq:=records[segment].get(name, None):
+            return seq
+        # `augur ancestral` is run with --keep-ambiguous but without --keep-overhangs
+        # So use Ns to represent missing segments rather than gaps
+        # https://docs.nextstrain.org/en/latest/guides/bioinformatics/missing-sequence-data.html
+        return "N" * segment_lengths[segment]
+
+    def atgc_perc(seq):
+        atgc = set(['A','T','G','C'])
+        len([nt for nt in seq if nt in atgc])
+
+    node_data: dict[str,dict] = {'nodes': {}}
 
     with open(args.output, 'w') as fh:
         print("writing genome to ", args.output)
         for name,count in strain_counts.items():
-            if count!=len(args.segments):
-                print(f"Excluding {name} as it only appears in {count} segments")
-                continue
-            genome = "".join([records[seg][name] for seg in args.segments])
+            if count<7:
+                if name in args.force_include:
+                    print(f"Force including {name} which would otherwise be dropped as it only appears in {count} segments")
+                else:
+                    print(f"Excluding {name} as it only appears in {count} segments")
+                    continue
+            genome = "".join([sequence(seg, name) for seg in args.segments])
+            node_data['nodes'][name] = {
+                "ATGC_perc": int( len([nt for nt in genome if nt in atgc])/len(genome) * 100),
+                "num_segments": strain_counts[name]
+            }
             print(f">{name}\n{genome}", file=fh)
+
+    if args.output_node_data:
+        with open(args.output_node_data, 'w') as fh:
+            json.dump(node_data, fh)