diff --git a/Snakefile b/Snakefile index 0a9fa55..f7a46d5 100755 --- a/Snakefile +++ b/Snakefile @@ -507,8 +507,11 @@ def export_node_data_files(wildcards): rules.cleavage_site.output.cleavage_site_sequences, ] - if wildcards.subtype=="h5n1-cattle-outbreak" and wildcards.segment!='genome': - nd.append(rules.prune_tree.output.node_data) + if wildcards.subtype=="h5n1-cattle-outbreak": + if wildcards.segment!='genome': + nd.append(rules.prune_tree.output.node_data) + else: + nd.append(rules.join_segments.output.node_data) return nd diff --git a/config/h5n1-cattle-outbreak/auspice_config_h5n1-cattle-outbreak.json b/config/h5n1-cattle-outbreak/auspice_config_h5n1-cattle-outbreak.json index 09ac49e..1ec0b99 100755 --- a/config/h5n1-cattle-outbreak/auspice_config_h5n1-cattle-outbreak.json +++ b/config/h5n1-cattle-outbreak/auspice_config_h5n1-cattle-outbreak.json @@ -54,6 +54,16 @@ "title": "Date", "type": "continuous" }, + { + "key": "ATGC_perc", + "title": "genome ATGC%", + "type": "continuous" + }, + { + "key": "num_segments", + "title": "Num segments sequenced", + "type": "ordinal" + }, { "key": "region", "title": "Region", diff --git a/rules/cattle-flu.smk b/rules/cattle-flu.smk index 2c0c762..2ff3dfc 100644 --- a/rules/cattle-flu.smk +++ b/rules/cattle-flu.smk @@ -64,9 +64,11 @@ rule join_segments: # allow snakemake to choose the correct rule to run. Note that `wildcards.segment="genome"` # here, and for that we need alignments for 8 individual segments, which we refer to as `wildcards.genome_seg` input: - alignment = expand("results/{{subtype}}/{{segment}}/{{time}}/aligned_{genome_seg}.fasta", genome_seg=SEGMENTS) + alignment = expand("results/{{subtype}}/{{segment}}/{{time}}/aligned_{genome_seg}.fasta", genome_seg=SEGMENTS), + metadata = metadata_by_wildcards, output: - alignment = "results/{subtype}/{segment}/{time}/aligned.fasta" + alignment = "results/{subtype}/{segment}/{time}/aligned.fasta", + node_data = "results/{subtype}/{segment}/{time}/aligned.json", wildcard_constraints: subtype = 'h5n1-cattle-outbreak', segment = 'genome', @@ -75,7 +77,9 @@ rule join_segments: """ python scripts/join-segments.py \ --segments {input.alignment} \ - --output {output.alignment} + --output {output.alignment} \ + --output-node-data {output.node_data} \ + --force-include $( cat {input.metadata} | csvtk filter2 -t -f '$host=="Human"' | csvtk cut -t -f strain | tail -n +2 | tr "\n" " " ) """ rule genome_metadata: diff --git a/scripts/join-segments.py b/scripts/join-segments.py index ba0a7b4..9c54c27 100644 --- a/scripts/join-segments.py +++ b/scripts/join-segments.py @@ -2,25 +2,69 @@ from Bio import SeqIO from collections import defaultdict import argparse +import json + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--segments', type = str, required = True, nargs='+', help = "per-segment alignments") + parser.add_argument('--force-include', type=str, nargs="+", required=False, + help="Force include these strains regardless of how many segments have been sequenced") parser.add_argument('--output', type = str, required = True, help = "output whole genome alignment") + parser.add_argument('--output-node-data', type = str, required = False, help = "output metadata in node-data JSON format") args = parser.parse_args() records = {} - strain_counts = defaultdict(int) + strain_counts: dict[str,int] = defaultdict(int) + segment_lengths = defaultdict(int) + atgc = set(['A','T','G','C']) + + assert len(args.segments)==8, "Expected 8 segments!" + for fname in args.segments: - records[fname] = {record.name:str(record.seq) for record in SeqIO.parse(fname, 'fasta')} - for key in records[fname]: strain_counts[key]+=1 - print(f"{fname}: parsed {len(records[fname].keys())} sequences") + records[fname] = {record.name:str(record.seq).upper() for record in SeqIO.parse(fname, 'fasta')} + for key in records[fname]: + strain_counts[key]+=1 + assert len({len(seq) for seq in records[fname].values()})==1, f"Different sequence lengths observed in {fname}" + segment_lengths[fname] = len(next(iter(records[fname].values()))) + print(f"{fname}: parsed {len(records[fname].keys())} sequences, each {segment_lengths[fname]} nt") + + ## how many strains are missing segments? + num_segments: dict[str, list[str]] = {str(i): [] for i in range(1,8+1)} + for strain,n in strain_counts.items(): + num_segments[str(n)].append(strain) + + + def sequence(segment, name): + if seq:=records[segment].get(name, None): + return seq + # `augur ancestral` is run with --keep-ambiguous but without --keep-overhangs + # So use Ns to represent missing segments rather than gaps + # https://docs.nextstrain.org/en/latest/guides/bioinformatics/missing-sequence-data.html + return "N" * segment_lengths[segment] + + def atgc_perc(seq): + atgc = set(['A','T','G','C']) + len([nt for nt in seq if nt in atgc]) + + node_data: dict[str,dict] = {'nodes': {}} with open(args.output, 'w') as fh: print("writing genome to ", args.output) for name,count in strain_counts.items(): - if count!=len(args.segments): - print(f"Excluding {name} as it only appears in {count} segments") - continue - genome = "".join([records[seg][name] for seg in args.segments]) + if count<7: + if name in args.force_include: + print(f"Force including {name} which would otherwise be dropped as it only appears in {count} segments") + else: + print(f"Excluding {name} as it only appears in {count} segments") + continue + genome = "".join([sequence(seg, name) for seg in args.segments]) + node_data['nodes'][name] = { + "ATGC_perc": int( len([nt for nt in genome if nt in atgc])/len(genome) * 100), + "num_segments": strain_counts[name] + } print(f">{name}\n{genome}", file=fh) + + if args.output_node_data: + with open(args.output_node_data, 'w') as fh: + json.dump(node_data, fh) \ No newline at end of file