Allow filtering by metadata only

Allows users to filter strains using only metadata information and output the strains that pass filters as either a metadata table or a list of names. Additionally, allows users to select only those sequences whose strain names are listed in one or more include files. To enable this functionality, this commit makes the following major changes to the filter interface: - sequence input and output are optional - non-sequence-based filters work without sequence input or index - the include argument accepts multiple inputs whose contents are unioned - a new `--exclude-all` flag excludes all strains by default, allowing only those in the include flags to be added back - new metadata and strain output arguments are supported - the initial complete list of strain names to consider for filtering originates from the metadata instead of the sequences Collectively, these changes allow multiple filter commands to be run in parallel without accessing the original sequence data and then the outputs of these separate commands can be joined by a single final filter command to output the sequences corresponding to all desired strain names. For workflows with large sequence inputs, like ncov, these changes should speed up analyses considerably.
nextstrain · Feb 25, 2021 · f80ae36 · f80ae36
1 parent 698df3d
commit f80ae36
Show file tree

Hide file tree

Showing 3 changed files with 186 additions and 30 deletions.
diff --git a/augur/filter.py b/augur/filter.py
@@ -92,15 +92,16 @@ def filter_by_query(sequences, metadata_file, query):
     return [seq for seq in sequences if seq in filtered_meta_dict]
 
 def register_arguments(parser):
-    parser.add_argument('--sequences', '-s', required=True, help="sequences in fasta or VCF format")
+    parser.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format")
     parser.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata, as CSV or TSV")
     parser.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.")
     parser.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
     parser.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
     parser.add_argument('--min-length', type=int, help="minimal length of the sequences")
     parser.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")
     parser.add_argument('--exclude', type=str, help="file with list of strains that are to be excluded")
-    parser.add_argument('--include', type=str, help="file with list of strains that are to be included regardless of priorities or subsampling")
+    parser.add_argument('--exclude-all', action="store_true", help="exclude all strains by default. Use this with the include arguments to select a specific subset of strains.")
+    parser.add_argument('--include', type=str, nargs="+", help="file(s) with list of strains that are to be included regardless of priorities or subsampling")
     parser.add_argument('--priority', type=str, help="file with list of priority scores for sequences (strain\tpriority)")
     subsample_group = parser.add_mutually_exclusive_group()
     subsample_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
@@ -117,18 +118,59 @@ def register_arguments(parser):
     parser.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
                                 help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
     parser.add_argument('--query', help="Filter samples by attribute. Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.")
-    parser.add_argument('--output', '-o', help="output file", required=True)
+    parser.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format")
+    parser.add_argument('--output-metadata', help="metadata for strains that passed filters")
+    parser.add_argument('--output-strains', help="list of strains that passed filters (no header)")
 
     parser.set_defaults(probabilistic_sampling=True)
 
 def run(args):
     '''
     filter and subsample a set of sequences into an analysis set
     '''
+    # Validate arguments before attempting any I/O.
+    # Don't allow sequence output when no sequence input is provided.
+    if args.output and not args.sequences:
+        print(
+            "ERROR: You need to provide sequences to output sequences.",
+            file=sys.stderr)
+        return 1
+
+    # Confirm that at least one output was requested.
+    if not any((args.output, args.output_metadata, args.output_strains)):
+        print(
+            "ERROR: You need to select at least one output.",
+            file=sys.stderr)
+        return 1
+
+    # Don't allow filtering on sequence-based information, if no sequences or
+    # sequence index is provided.
+    SEQUENCE_ONLY_FILTERS = [
+        args.min_length,
+        args.non_nucleotide
+    ]
+    if not args.sequences and not args.sequence_index and any(SEQUENCE_ONLY_FILTERS):
+        print(
+            "ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.",
+            file=sys.stderr)
+        return 1
+
+    # Load inputs, starting with metadata.
+    try:
+        # Metadata are the source of truth for which sequences we want to keep
+        # in filtered output.
+        meta_dict, meta_columns = read_metadata(args.metadata)
+        seq_keep = sorted(meta_dict.keys())
+        all_seq = seq_keep.copy()
+    except ValueError as error:
+        print("ERROR: Problem reading in {}:".format(args.metadata))
+        print(error)
+        return 1
+
     #Set flags if VCF
     is_vcf = False
     is_compressed = False
-    if any([args.sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
+    if args.sequences and any([args.sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
         is_vcf = True
         if args.sequences.lower().endswith('.gz'):
             is_compressed = True
@@ -147,7 +189,8 @@ def run(args):
     #If VCF, open and get sequence names
     if is_vcf:
         seq_keep, all_seq = read_vcf(args.sequences)
-    else:
+        # TODO: How to best handle "source of truth" for VCF files vs. metadata?
+    elif args.sequences or args.sequence_index:
         # If FASTA, try to load the sequence composition details and strain
         # names to be filtered.
         index_is_autogenerated = False
@@ -180,21 +223,15 @@ def run(args):
 
         # Calculate summary statistics needed for filtering.
         sequence_index["ACGT"] = sequence_index.loc[:, ["A", "C", "G", "T"]].sum(axis=1)
-        seq_keep = sequence_index["strain"].values
-        all_seq = seq_keep.copy()
-
-    try:
-        meta_dict, meta_columns = read_metadata(args.metadata)
-    except ValueError as error:
-        print("ERROR: Problem reading in {}:".format(args.metadata))
-        print(error)
-        return 1
-
 
     #####################################
     #Filtering steps
     #####################################
 
+    # Exclude all strains by default.
+    if args.exclude_all:
+        seq_keep = []
+
     # remove sequences without meta data
     tmp = [ ]
     for seq_name in seq_keep:
@@ -426,15 +463,20 @@ def run(args):
     # Note that this might re-add previously excluded sequences
     # Note that we are also not checking for existing meta data here
     num_included_by_name = 0
-    if args.include and os.path.isfile(args.include):
-        with open(args.include, 'r', encoding='utf-8') as ifile:
-            to_include = set(
-                [
-                    line.strip()
-                    for line in ifile
-                    if line[0]!=comment_char and len(line.strip()) > 0
-                ]
-            )
+    if args.include:
+        # Collect the union of all given strains to include.
+        to_include = set()
+        for include_file in args.include:
+            with open(include_file, 'r', encoding='utf-8') as ifile:
+                to_include.update(
+                    set(
+                        [
+                            line.strip()
+                            for line in ifile
+                            if line[0]!=comment_char and len(line.strip()) > 0
+                        ]
+                    )
+                )
 
         for s in to_include:
             if s not in seq_keep:
@@ -476,7 +518,7 @@ def run(args):
             return 1
         write_vcf(args.sequences, args.output, dropped_samps)
 
-    else:
+    elif args.sequences and args.output:
         # It should not be possible to have ids in the list of sequences to keep
         # that do not exist in the original input sequences, since we built this
         # list of ids from the sequence index. Just to be safe though, we find
@@ -496,6 +538,19 @@ def run(args):
             print("ERROR: All samples have been dropped! Check filter rules and metadata file format.", file=sys.stderr)
             return 1
 
+    if args.output_metadata:
+        metadata_df = pd.DataFrame([meta_dict[strain] for strain in seq_keep])
+        metadata_df.to_csv(
+            args.output_metadata,
+            sep="\t",
+            index=False
+        )
+
+    if args.output_strains:
+        with open(args.output_strains, "w") as oh:
+            for strain in sorted(seq_keep):
+                oh.write(f"{strain}\n")
+
     print("\n%i sequences were dropped during filtering" % (len(all_seq) - len(seq_keep),))
     if args.exclude:
         print("\t%i of these were dropped because they were in %s" % (num_excluded_by_name, args.exclude))
@@ -516,12 +571,12 @@ def run(args):
         seed_txt = ", using seed {}".format(args.subsample_seed) if args.subsample_seed else ""
         print("\t%i of these were dropped because of subsampling criteria%s" % (num_excluded_subsamp, seed_txt))
 
-    if args.include and os.path.isfile(args.include):
-        print("\n\t%i sequences were added back because they were in %s" % (num_included_by_name, args.include))
+    if args.include:
+        print("\n\t%i sequences were added back because they were in %s" % (num_included_by_name, " and ".join(args.include)))
     if args.include_where:
         print("\t%i sequences were added back because of '%s'" % (num_included_by_metadata, args.include_where))
 
-    print("%i sequences have been written out to %s" % (len(seq_keep), args.output))
+    print("%i strains have been passed all filters" % (len(seq_keep),))
 
 
 def _filename_gz(filename):

diff --git a/tests/builds/zika.t b/tests/builds/zika.t
@@ -43,8 +43,8 @@ Filter sequences by a minimum date and an exclusion list and only keep one seque
   >   --subsample-seed 314159 \
   >   --no-probabilistic-sampling \
   >   --min-date 2012 > /dev/null
-
-  $ diff -u "results/filtered.fasta" "$TMP/out/filtered.fasta"
+  $ grep "^>" "$TMP/out/filtered.fasta" | wc -l
+  \s*10 (re)
 
 Align filtered sequences to a specific reference sequence and fill any gaps.
 

diff --git a/tests/functional/filter.t b/tests/functional/filter.t
@@ -63,3 +63,104 @@ Using the default probabilistic subsampling, should work the same as the previou
   >  --subsample-seed 314159 \
   >  --output "$TMP/filtered.fasta" > /dev/null
   $ rm -f "$TMP/filtered.fasta"
+
+Filter using only metadata without sequence input or output and save results as filtered metadata.
+
+  $ ${AUGUR} filter \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --min-date 2012 \
+  >  --min-length 10500 \
+  >  --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null
+
+Output should include the 9 sequences matching the filters and a header line.
+
+  $ wc -l "$TMP/filtered_metadata.tsv"
+  \s*10 .* (re)
+  $ rm -f "$TMP/filtered_metadata.tsv"
+
+Filter using only metadata and save results as a list of filtered strains.
+
+  $ ${AUGUR} filter \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --min-date 2012 \
+  >  --min-length 10500 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+
+Output should include only the 9 sequences matching the filters (without a header line).
+
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*9 .* (re)
+  $ rm -f "$TMP/filtered_strains.txt"
+
+Filter using only metadata without a sequence index.
+This should work because the requested filters don't rely on sequence information.
+
+  $ ${AUGUR} filter \
+  >  --metadata filter/metadata.tsv \
+  >  --min-date 2012 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  $ rm -f "$TMP/filtered_strains.txt"
+
+Try to filter using only metadata without a sequence index.
+This should fail because the requested filters rely on sequence information.
+
+  $ ${AUGUR} filter \
+  >  --metadata filter/metadata.tsv \
+  >  --min-length 10000 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
+  [1]
+
+Try to filter with sequence outputs and no sequence inputs.
+This should fail.
+
+  $ ${AUGUR} filter \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --min-length 10000 \
+  >  --output "$TMP/filtered.fasta" > /dev/null
+  ERROR: You need to provide sequences to output sequences.
+  [1]
+
+Try to filter without any outputs.
+
+  $ ${AUGUR} filter \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --min-length 10000 > /dev/null
+  ERROR: You need to select at least one output.
+  [1]
+
+Filter into two separate sets and then select sequences from the union of those sets.
+First, select strains from Brazil (there should be 1).
+
+  $ ${AUGUR} filter \
+  >  --metadata filter/metadata.tsv \
+  >  --query "country == 'Brazil'" \
+  >  --output-strains "$TMP/filtered_strains.brazil.txt" > /dev/null
+  $ wc -l "$TMP/filtered_strains.brazil.txt"
+  \s*1 .* (re)
+
+Then, select strains from Colombia (there should be 3).
+
+  $ ${AUGUR} filter \
+  >  --metadata filter/metadata.tsv \
+  >  --query "country == 'Colombia'" \
+  >  --output-strains "$TMP/filtered_strains.colombia.txt" > /dev/null
+  $ wc -l "$TMP/filtered_strains.colombia.txt"
+  \s*3 .* (re)
+
+Finally, exclude all sequences except those from the two sets of strains (there should be 4).
+
+  $ ${AUGUR} filter \
+  >  --sequences filter/sequences.fasta \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --exclude-all \
+  >  --include "$TMP/filtered_strains.brazil.txt" "$TMP/filtered_strains.colombia.txt" \
+  >  --output "$TMP/filtered.fasta" > /dev/null
+  $ grep "^>" "$TMP/filtered.fasta" | wc -l
+  \s*4 (re)
+  $ rm -f "$TMP/filtered.fasta"