Allow same strains for all segments
See added content in README.md for how to use.

This is useful in its own right, but also paves the way for future work
which will attempt to analyse whole genomes.
jameshadfield committed Apr 18, 2024
1 parent 018f454 commit 71cd572
Showing 3 changed files with 154 additions and 22 deletions.
15 changes: 15 additions & 0 deletions README.md
@@ -35,6 +35,21 @@ To string these together and update the `clades.tsv` file for new sequences and

`snakemake -s Snakefile.clades -p --cores 1 && snakemake -p --cores 1`

### Using the same strains for all segments

By default we subsample data for each segment independently.
Alternatively, you can ask the pipeline to use the same strains for every segment.
This modifies the pipeline in a few ways:
1. An additional metadata processing step counts, for each strain, the number of segments with sequence data.
2. Subsampling is performed as normal for the HA segment, with the additional condition that each sample has sequence data for all segments.
3. All other segments are subsampled to contain the same strains as were chosen for HA in step 2.

To enable this, set the config parameter `same_strains_per_segment` to a truthy value. If you are calling `snakemake` directly, you can add

--config 'same_strains_per_segment=True'

If you are using `nextstrain build`, add that to the end of the command (i.e. as a parameter that will be passed through to Snakemake).
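
For example, a full invocation might look like this (illustrative; substitute your usual build arguments):

`nextstrain build . --config 'same_strains_per_segment=True'`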

## To modify this build to work with your own data
Although the simplest way to generate a custom build is via the quickstart build, you are welcome to clone this repo and use it as a starting point for running your own local builds. The [Nextstrain docs](https://docs.nextstrain.org/en/latest/index.html) are a fantastic resource for getting started with the Nextstrain pipeline and include some [great tutorials](https://docs.nextstrain.org/en/latest/install.html). This build is slightly more complicated than most: it contains a few custom functions to accommodate all the features on [nextstrain.org](https://nextstrain.org/flu/avian), and it makes use of wildcards for both subtypes and gene segments. If you'd like to adapt this full, non-simplified pipeline to your own data (which you may want to do if you also want to annotate clades), you would need to make a few changes and additions:

96 changes: 74 additions & 22 deletions Snakefile
@@ -1,6 +1,15 @@
SUBTYPES = ["h5nx","h5n1","h9n2","h7n9"]
# SUBTYPES = ["h5nx"]
# SUBTYPES = ['h7n9']
SEGMENTS = ["pb2", "pb1", "pa", "ha", "np", "na", "mp", "ns"]

# The config option `same_strains_per_segment=True` (e.g. supplied to snakemake via the --config command line argument)
# will change the behaviour of the workflow to use the same strains for each segment. This is achieved via three steps:
# (1) Add a metadata processing step to add the segment count per strain to the HA metadata file
# (2) Filter the HA segment as normal plus filter to those strains with 8 segments
# (3) Filter the other segments by simply force-including the same strains as (2)
SAME_STRAINS = bool(config.get('same_strains_per_segment', False))

path_to_fauna = '../fauna'

rule all:
@@ -24,6 +33,8 @@ def download_by(w):
    return db[w.subtype]

def metadata_by_wildcards(w):
    if SAME_STRAINS and w.segment == 'ha':
        return "results/metadata-segments_{subtype}_ha.tsv"
    md = {"h5n1": rules.add_h5_clade.output.metadata, "h5nx": rules.add_h5_clade.output.metadata, "h7n9": rules.parse.output.metadata, "h9n2": rules.parse.output.metadata}
    return md[w.subtype]
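
# e.g. with SAME_STRAINS set, metadata for (subtype=h5n1, segment=ha) resolves to
# results/metadata-segments_h5n1_ha.tsv, while every other segment falls through to
# the md lookup (an illustrative reading of the rules defined in this file)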

@@ -101,41 +112,82 @@ rule add_h5_clade:
            --clades {input.clades_file}
        """

# For the HA metadata file (for each subtype) add a column "n_segments"
# which reports how many segments have sequence data (no QC performed).
# This will force the download & parsing of all segments for a given subtype.
rule add_segment_sequence_counts:
    input:
        segments = lambda w: expand("results/metadata-with-clade_{{subtype}}_{segment}.tsv", segment=SEGMENTS) \
            if w.subtype in ['h5n1', 'h5nx'] \
            else expand("results/metadata_{{subtype}}_{segment}.tsv", segment=SEGMENTS),
        metadata = lambda w: "results/metadata-with-clade_{subtype}_ha.tsv" \
            if w.subtype in ['h5n1', 'h5nx'] \
            else "results/metadata_{subtype}_ha.tsv",
    output:
        metadata = "results/metadata-segments_{subtype}_ha.tsv"
    shell:
        """
        python scripts/add_segment_counts.py \
            --segments {input.segments} \
            --metadata {input.metadata} \
            --output {output.metadata}
        """


def _filter_params(wildcards, input, output, threads, resources):
    """
    Generate the arguments to `augur filter`. When we are running independent analyses
    (i.e. not using the same strains for each segment), then we generate a full set of
    filter parameters here.
    When we are using the same strains for each segment, then for HA we use a full
    filter call and for the rest of the segments we filter to the strains chosen for HA.
    NOTE: we could move the functions `group_by` etc into this function if that was
    clearer.
    """
    # if a strains file is provided as input we just filter to those strains
    if input.strains:
        return f"--exclude-all --include {input.strains}"

    # if SAME_STRAINS then add a query to also filter to strains with all 8 segments
    segments = f"n_segments!={len(SEGMENTS)}" if SAME_STRAINS else ''

    # formulate our typical filtering parameters
    cmd  = f" --group-by {group_by(wildcards)}"
    cmd += f" --sequences-per-group {sequences_per_group(wildcards)}"
    cmd += f" --min-date {min_date(wildcards)}"
    cmd += f" --exclude-where host=laboratoryderived host=ferret host=unknown host=other country=? region=? {segments}"
    cmd += f" --min-length {min_length(wildcards)}"
    cmd += " --non-nucleotide"
    return cmd
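
# Illustrative expansion for the HA segment when SAME_STRAINS is set. The group-by,
# sequences-per-group, min-date and min-length values below are assumptions, as they
# come from helper functions defined elsewhere in this Snakefile:
#
#   --group-by region --sequences-per-group 5 --min-date 2015 \
#   --exclude-where host=laboratoryderived host=ferret host=unknown host=other country=? region=? n_segments!=8 \
#   --min-length 1000 --non-nucleotide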

rule filter:
    """
    Filtering to
      - {params.sequences_per_group} sequence(s) per {params.group_by!s}
      - excluding strains in {input.exclude}
      - samples with missing region and country metadata
      - excluding strains prior to {params.min_date}
    """
    input:
        sequences = rules.parse.output.sequences,
        metadata = metadata_by_wildcards,
        exclude = files.dropped_strains,
        strains = lambda w: f"results/filtered_{w.subtype}_ha.txt" if (SAME_STRAINS and w.segment != 'ha') else [],
    output:
        sequences = "results/filtered_{subtype}_{segment}.fasta",
        strains = "results/filtered_{subtype}_{segment}.txt",
    params:
        args = _filter_params,
    shell:
        """
        augur filter \
            --sequences {input.sequences} \
            --metadata {input.metadata} \
            --exclude {input.exclude} \
            --output-sequences {output.sequences} \
            --output-strains {output.strains} \
            {params.args}
        """

rule align:
65 changes: 65 additions & 0 deletions scripts/add_segment_counts.py
@@ -0,0 +1,65 @@
"""
Takes in a set of metadata TSVs corresponding to segments (i.e. typically 8 TSVs)
and adds a column to the input `--metadata` TSV with the number of segments
that strain appears in.
"""

import argparse
import csv
from collections import defaultdict

def collect_args():
parser = argparse.ArgumentParser()
parser.add_argument('--segments', type=str, nargs='+', help='Metadata TSVs for all segments')
parser.add_argument('--metadata', type=str, help='Metadata TSV which will be amended for output. Must also appear in --segments.')
parser.add_argument('--output', type=str, help='metadata file name')
return parser.parse_args()

def read_metadata(fname, strains_only=False):
strains = set()
rows = []
with open(fname) as csvfile:
reader = csv.DictReader(csvfile, delimiter='\t')
for row in reader:
strains.add(row['strain'])
if not strains_only:
rows.append(row)
if strains_only:
return strains
return (strains, reader.fieldnames, rows)

def summary(strain_count):
## Print some basic stats!
print("Num strains observed (across all segments): ", len(strain_count.keys()))
counts = [0]*9 # 1-indexed
for n in strain_count.values():
counts[n]+=1
for i in range(1,9):
print(f"Num strains observed in {i} segments: ", counts[i])


if __name__=="__main__":
args = collect_args()
# strains_per_segment = []
strain_count = defaultdict(int)
for fname in args.segments:
if fname==args.metadata:
_strains, fieldnames, rows = read_metadata(fname)
else:
_strains = read_metadata(fname, strains_only=True)
for s in _strains:
strain_count[s]+=1
summary(strain_count)

# append count to data for output
column = "n_segments"
fieldnames.append(column)
for row in rows:
row[column]=strain_count[row['strain']]

with open(args.output, 'w') as fh:
writer = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t')
writer.writeheader()
for row in rows:
writer.writerow(row)
print("Output written to", args.output)
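
# Example invocation, matching the `add_segment_sequence_counts` rule in the Snakefile
# (paths are illustrative, for the h9n2 subtype):
#
#   python scripts/add_segment_counts.py \
#       --segments results/metadata_h9n2_pb2.tsv [...] results/metadata_h9n2_ns.tsv \
#       --metadata results/metadata_h9n2_ha.tsv \
#       --output results/metadata-segments_h9n2_ha.tsv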
