Allow same strains for all segments
See added content in README.md for how to use.

This is useful in its own right, but also paves the way for future work
which will attempt to analyse whole genomes.
jameshadfield committed Apr 18, 2024
1 parent 018f454 commit 71cd572
Showing 3 changed files with 154 additions and 22 deletions.
15 changes: 15 additions & 0 deletions README.md
@@ -35,6 +35,21 @@ To string these together and update the `clades.tsv` file for new sequences and

`snakemake -s Snakefile.clades -p --cores 1 && snakemake -p --cores 1`

### Using the same strains for all segments

By default we subsample data for each segment independently.
Alternatively, you can ask the pipeline to use the same strains for every segment.
This modifies the pipeline in a few ways:
1. An additional metadata processing step counts, for each strain, the number of segments with sequence data.
2. Subsampling is performed as normal for the HA segment, with the additional condition that each sample has sequence data for all segments.
3. All other segments are subsampled to contain the same strains as were chosen for HA in step 2.

To enable this, set the config parameter `same_strains_per_segment` to a truthy value. If you are calling `snakemake` directly, you can add

--config 'same_strains_per_segment=True'

If you are using `nextstrain build`, add that to the end of the command (i.e. as a parameter that will be passed through to Snakemake).
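
For example, a full invocation might look like this (illustrative; substitute your usual build arguments):

`nextstrain build . --config 'same_strains_per_segment=True'`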

## To modify this build to work with your own data
Although the simplest way to generate a custom build is via the quickstart build, you are welcome to clone this repo and use it as a starting point for running your own local builds. The [Nextstrain docs](https://docs.nextstrain.org/en/latest/index.html) are a fantastic resource for getting started with the Nextstrain pipeline and include some [great tutorials](https://docs.nextstrain.org/en/latest/install.html). This build is slightly more complicated than most: it contains a few custom functions to accommodate all the features on [nextstrain.org](https://nextstrain.org/flu/avian), and it makes use of wildcards for both subtypes and gene segments. If you'd like to adapt this full, non-simplified pipeline to your own data (which you may want to do if you also want to annotate clades), you would need to make a few changes and additions:

96 changes: 74 additions & 22 deletions Snakefile
@@ -1,6 +1,15 @@
SUBTYPES = ["h5nx","h5n1","h9n2","h7n9"]
# SUBTYPES = ["h5nx"]
# SUBTYPES = ['h7n9']
SEGMENTS = ["pb2", "pb1", "pa", "ha", "np", "na", "mp", "ns"]

# The config option `same_strains_per_segment=True` (e.g. supplied to snakemake via the --config command line argument)
# will change the behaviour of the workflow to use the same strains for each segment. This is achieved via three steps:
# (1) Add a metadata processing step to add the segment count per strain to the HA metadata file
# (2) Filter the HA segment as normal plus filter to those strains with 8 segments
# (3) Filter the other segments by simply force-including the same strains as (2)
SAME_STRAINS = bool(config.get('same_strains_per_segment', False))

path_to_fauna = '../fauna'

rule all:
@@ -24,6 +33,8 @@ def download_by(w):
    return db[w.subtype]

def metadata_by_wildcards(w):
    if SAME_STRAINS and w.segment == 'ha':
        return "results/metadata-segments_{subtype}_ha.tsv"
    md = {"h5n1": rules.add_h5_clade.output.metadata, "h5nx": rules.add_h5_clade.output.metadata, "h7n9": rules.parse.output.metadata, "h9n2": rules.parse.output.metadata}
    return md[w.subtype]
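
# e.g. with SAME_STRAINS set, metadata for (subtype=h5n1, segment=ha) resolves to
# results/metadata-segments_h5n1_ha.tsv, while every other segment falls through to
# the md lookup (an illustrative reading of the rules defined in this file)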

@@ -101,41 +112,82 @@ rule add_h5_clade:
            --clades {input.clades_file}
        """

# For the HA metadata file (for each subtype) add a column "n_segments"
# which reports how many segments have sequence data (no QC performed).
# This will force the download & parsing of all segments for a given subtype.
rule add_segment_sequence_counts:
    input:
        segments = lambda w: expand("results/metadata-with-clade_{{subtype}}_{segment}.tsv", segment=SEGMENTS) \
            if w.subtype in ['h5n1', 'h5nx'] \
            else expand("results/metadata_{{subtype}}_{segment}.tsv", segment=SEGMENTS),
        metadata = lambda w: "results/metadata-with-clade_{subtype}_ha.tsv" \
            if w.subtype in ['h5n1', 'h5nx'] \
            else "results/metadata_{subtype}_ha.tsv",
    output:
        metadata = "results/metadata-segments_{subtype}_ha.tsv"
    shell:
        """
        python scripts/add_segment_counts.py \
            --segments {input.segments} \
            --metadata {input.metadata} \
            --output {output.metadata}
        """


def _filter_params(wildcards, input, output, threads, resources):
    """
    Generate the arguments to `augur filter`. When we are running independent analyses
    (i.e. not using the same strains for each segment), then we generate a full set of
    filter parameters here.
    When we are using the same strains for each segment, then for HA we use a full
    filter call and for the rest of the segments we filter to the strains chosen for HA.
    NOTE: we could move the functions `group_by` etc into this function if that was
    clearer.
    """
    # if a strains file is provided as input we just filter to those strains
    if input.strains:
        return f"--exclude-all --include {input.strains}"

    # if SAME_STRAINS then add a query to also filter to strains with all 8 segments
    segments = f"n_segments!={len(SEGMENTS)}" if SAME_STRAINS else ''

    # formulate our typical filtering parameters
    cmd  = f" --group-by {group_by(wildcards)}"
    cmd += f" --sequences-per-group {sequences_per_group(wildcards)}"
    cmd += f" --min-date {min_date(wildcards)}"
    cmd += f" --exclude-where host=laboratoryderived host=ferret host=unknown host=other country=? region=? {segments}"
    cmd += f" --min-length {min_length(wildcards)}"
    cmd += " --non-nucleotide"
    return cmd
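
# Illustrative expansion for the HA segment when SAME_STRAINS is set. The group-by,
# sequences-per-group, min-date and min-length values below are assumptions, as they
# come from helper functions defined elsewhere in this Snakefile:
#
#   --group-by region --sequences-per-group 5 --min-date 2015 \
#   --exclude-where host=laboratoryderived host=ferret host=unknown host=other country=? region=? n_segments!=8 \
#   --min-length 1000 --non-nucleotide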

rule filter:
    """
    Filtering to
      - {params.sequences_per_group} sequence(s) per {params.group_by!s}
      - excluding strains in {input.exclude}
      - samples with missing region and country metadata
      - excluding strains prior to {params.min_date}
    """
    input:
        sequences = rules.parse.output.sequences,
        metadata = metadata_by_wildcards,
        exclude = files.dropped_strains,
        strains = lambda w: f"results/filtered_{w.subtype}_ha.txt" if (SAME_STRAINS and w.segment != 'ha') else [],
    output:
        sequences = "results/filtered_{subtype}_{segment}.fasta",
        strains = "results/filtered_{subtype}_{segment}.txt",
    params:
        args = _filter_params,
    shell:
        """
        augur filter \
            --sequences {input.sequences} \
            --metadata {input.metadata} \
            --exclude {input.exclude} \
            --output-sequences {output.sequences} \
            --output-strains {output.strains} \
            {params.args}
        """

rule align:
65 changes: 65 additions & 0 deletions scripts/add_segment_counts.py
@@ -0,0 +1,65 @@
"""
Takes in a set of metadata TSVs corresponding to segments (i.e. typically 8 TSVs)
and adds a column to the input `--metadata` TSV with the number of segments
that strain appears in.
"""

import argparse
import csv
from collections import defaultdict

def collect_args():
parser = argparse.ArgumentParser()
parser.add_argument('--segments', type=str, nargs='+', help='Metadata TSVs for all segments')
parser.add_argument('--metadata', type=str, help='Metadata TSV which will be amended for output. Must also appear in --segments.')
parser.add_argument('--output', type=str, help='metadata file name')
return parser.parse_args()

def read_metadata(fname, strains_only=False):
strains = set()
rows = []
with open(fname) as csvfile:
reader = csv.DictReader(csvfile, delimiter='\t')
for row in reader:
strains.add(row['strain'])
if not strains_only:
rows.append(row)
if strains_only:
return strains
return (strains, reader.fieldnames, rows)

def summary(strain_count):
## Print some basic stats!
print("Num strains observed (across all segments): ", len(strain_count.keys()))
counts = [0]*9 # 1-indexed
for n in strain_count.values():
counts[n]+=1
for i in range(1,9):
print(f"Num strains observed in {i} segments: ", counts[i])


if __name__=="__main__":
args = collect_args()
# strains_per_segment = []
strain_count = defaultdict(int)
for fname in args.segments:
if fname==args.metadata:
_strains, fieldnames, rows = read_metadata(fname)
else:
_strains = read_metadata(fname, strains_only=True)
for s in _strains:
strain_count[s]+=1
summary(strain_count)

# append count to data for output
column = "n_segments"
fieldnames.append(column)
for row in rows:
row[column]=strain_count[row['strain']]

with open(args.output, 'w') as fh:
writer = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t')
writer.writeheader()
for row in rows:
writer.writerow(row)
print("Output written to", args.output)
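
# Example invocation, matching the `add_segment_sequence_counts` rule in the Snakefile
# (paths are illustrative, for the h9n2 subtype):
#
#   python scripts/add_segment_counts.py \
#       --segments results/metadata_h9n2_pb2.tsv [...] results/metadata_h9n2_ns.tsv \
#       --metadata results/metadata_h9n2_ha.tsv \
#       --output results/metadata-segments_h9n2_ha.tsv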
