Skip to content

Commit

Permalink
Allow filtering by metadata only
Browse files Browse the repository at this point in the history
Allows users to filter strains using only metadata information and
output the strains that pass filters as either a metadata table or a
list of names. Additionally, allows users to select only those sequences
whose strain names are listed in one or more include files. To enable
this functionality, this commit makes the following major changes to the
filter interface:

 - sequence input and output are optional
 - non-sequence-based filters work without sequence input or index
 - the include argument accepts multiple inputs whose contents are unioned
 - a new `--exclude-all` flag excludes all strains by default, allowing
   only those in the include flags to be added back
 - new metadata and strain output arguments are supported
 - the initial complete list of strain names to consider for filtering
   originates from the metadata instead of the sequences

Collectively, these changes allow multiple filter commands to be run in
parallel without accessing the original sequence data and then the
outputs of these separate commands can be joined by a single final
filter command to output the sequences corresponding to all desired
strain names. For workflows with large sequence inputs, like ncov, these
changes should speed up analyses considerably.
  • Loading branch information
huddlej committed Feb 25, 2021
1 parent 698df3d commit f80ae36
Show file tree
Hide file tree
Showing 3 changed files with 186 additions and 30 deletions.
111 changes: 83 additions & 28 deletions augur/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,15 +92,16 @@ def filter_by_query(sequences, metadata_file, query):
return [seq for seq in sequences if seq in filtered_meta_dict]

def register_arguments(parser):
parser.add_argument('--sequences', '-s', required=True, help="sequences in fasta or VCF format")
parser.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format")
parser.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata, as CSV or TSV")
parser.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.")
parser.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
parser.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
parser.add_argument('--min-length', type=int, help="minimal length of the sequences")
parser.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")
parser.add_argument('--exclude', type=str, help="file with list of strains that are to be excluded")
parser.add_argument('--include', type=str, help="file with list of strains that are to be included regardless of priorities or subsampling")
parser.add_argument('--exclude-all', action="store_true", help="exclude all strains by default. Use this with the include arguments to select a specific subset of strains.")
parser.add_argument('--include', type=str, nargs="+", help="file(s) with list of strains that are to be included regardless of priorities or subsampling")
parser.add_argument('--priority', type=str, help="file with list of priority scores for sequences (strain\tpriority)")
subsample_group = parser.add_mutually_exclusive_group()
subsample_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
Expand All @@ -117,18 +118,59 @@ def register_arguments(parser):
parser.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
parser.add_argument('--query', help="Filter samples by attribute. Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.")
parser.add_argument('--output', '-o', help="output file", required=True)
parser.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format")
parser.add_argument('--output-metadata', help="metadata for strains that passed filters")
parser.add_argument('--output-strains', help="list of strains that passed filters (no header)")

parser.set_defaults(probabilistic_sampling=True)

def run(args):
'''
filter and subsample a set of sequences into an analysis set
'''
# Validate arguments before attempting any I/O.
# Don't allow sequence output when no sequence input is provided.
if args.output and not args.sequences:
print(
"ERROR: You need to provide sequences to output sequences.",
file=sys.stderr)
return 1

# Confirm that at least one output was requested.
if not any((args.output, args.output_metadata, args.output_strains)):
print(
"ERROR: You need to select at least one output.",
file=sys.stderr)
return 1

# Don't allow filtering on sequence-based information, if no sequences or
# sequence index is provided.
SEQUENCE_ONLY_FILTERS = [
args.min_length,
args.non_nucleotide
]
if not args.sequences and not args.sequence_index and any(SEQUENCE_ONLY_FILTERS):
print(
"ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.",
file=sys.stderr)
return 1

# Load inputs, starting with metadata.
try:
# Metadata are the source of truth for which sequences we want to keep
# in filtered output.
meta_dict, meta_columns = read_metadata(args.metadata)
seq_keep = sorted(meta_dict.keys())
all_seq = seq_keep.copy()
except ValueError as error:
print("ERROR: Problem reading in {}:".format(args.metadata))
print(error)
return 1

#Set flags if VCF
is_vcf = False
is_compressed = False
if any([args.sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
if args.sequences and any([args.sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
is_vcf = True
if args.sequences.lower().endswith('.gz'):
is_compressed = True
Expand All @@ -147,7 +189,8 @@ def run(args):
#If VCF, open and get sequence names
if is_vcf:
seq_keep, all_seq = read_vcf(args.sequences)
else:
# TODO: How to best handle "source of truth" for VCF files vs. metadata?
elif args.sequences or args.sequence_index:
# If FASTA, try to load the sequence composition details and strain
# names to be filtered.
index_is_autogenerated = False
Expand Down Expand Up @@ -180,21 +223,15 @@ def run(args):

# Calculate summary statistics needed for filtering.
sequence_index["ACGT"] = sequence_index.loc[:, ["A", "C", "G", "T"]].sum(axis=1)
seq_keep = sequence_index["strain"].values
all_seq = seq_keep.copy()

try:
meta_dict, meta_columns = read_metadata(args.metadata)
except ValueError as error:
print("ERROR: Problem reading in {}:".format(args.metadata))
print(error)
return 1


#####################################
#Filtering steps
#####################################

# Exclude all strains by default.
if args.exclude_all:
seq_keep = []

# remove sequences without meta data
tmp = [ ]
for seq_name in seq_keep:
Expand Down Expand Up @@ -426,15 +463,20 @@ def run(args):
# Note that this might re-add previously excluded sequences
# Note that we are also not checking for existing meta data here
num_included_by_name = 0
if args.include and os.path.isfile(args.include):
with open(args.include, 'r', encoding='utf-8') as ifile:
to_include = set(
[
line.strip()
for line in ifile
if line[0]!=comment_char and len(line.strip()) > 0
]
)
if args.include:
# Collect the union of all given strains to include.
to_include = set()
for include_file in args.include:
with open(include_file, 'r', encoding='utf-8') as ifile:
to_include.update(
set(
[
line.strip()
for line in ifile
if line[0]!=comment_char and len(line.strip()) > 0
]
)
)

for s in to_include:
if s not in seq_keep:
Expand Down Expand Up @@ -476,7 +518,7 @@ def run(args):
return 1
write_vcf(args.sequences, args.output, dropped_samps)

else:
elif args.sequences and args.output:
# It should not be possible to have ids in the list of sequences to keep
# that do not exist in the original input sequences, since we built this
# list of ids from the sequence index. Just to be safe though, we find
Expand All @@ -496,6 +538,19 @@ def run(args):
print("ERROR: All samples have been dropped! Check filter rules and metadata file format.", file=sys.stderr)
return 1

if args.output_metadata:
metadata_df = pd.DataFrame([meta_dict[strain] for strain in seq_keep])
metadata_df.to_csv(
args.output_metadata,
sep="\t",
index=False
)

if args.output_strains:
with open(args.output_strains, "w") as oh:
for strain in sorted(seq_keep):
oh.write(f"{strain}\n")

print("\n%i sequences were dropped during filtering" % (len(all_seq) - len(seq_keep),))
if args.exclude:
print("\t%i of these were dropped because they were in %s" % (num_excluded_by_name, args.exclude))
Expand All @@ -516,12 +571,12 @@ def run(args):
seed_txt = ", using seed {}".format(args.subsample_seed) if args.subsample_seed else ""
print("\t%i of these were dropped because of subsampling criteria%s" % (num_excluded_subsamp, seed_txt))

if args.include and os.path.isfile(args.include):
print("\n\t%i sequences were added back because they were in %s" % (num_included_by_name, args.include))
if args.include:
print("\n\t%i sequences were added back because they were in %s" % (num_included_by_name, " and ".join(args.include)))
if args.include_where:
print("\t%i sequences were added back because of '%s'" % (num_included_by_metadata, args.include_where))

print("%i sequences have been written out to %s" % (len(seq_keep), args.output))
print("%i strains have been passed all filters" % (len(seq_keep),))


def _filename_gz(filename):
Expand Down
4 changes: 2 additions & 2 deletions tests/builds/zika.t
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ Filter sequences by a minimum date and an exclusion list and only keep one seque
> --subsample-seed 314159 \
> --no-probabilistic-sampling \
> --min-date 2012 > /dev/null

$ diff -u "results/filtered.fasta" "$TMP/out/filtered.fasta"
$ grep "^>" "$TMP/out/filtered.fasta" | wc -l
\s*10 (re)

Align filtered sequences to a specific reference sequence and fill any gaps.

Expand Down
101 changes: 101 additions & 0 deletions tests/functional/filter.t
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,104 @@ Using the default probabilistic subsampling, should work the same as the previou
> --subsample-seed 314159 \
> --output "$TMP/filtered.fasta" > /dev/null
$ rm -f "$TMP/filtered.fasta"

Filter using only metadata without sequence input or output and save results as filtered metadata.

$ ${AUGUR} filter \
> --sequence-index filter/sequence_index.tsv \
> --metadata filter/metadata.tsv \
> --min-date 2012 \
> --min-length 10500 \
> --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null

Output should include the 9 sequences matching the filters and a header line.

$ wc -l "$TMP/filtered_metadata.tsv"
\s*10 .* (re)
$ rm -f "$TMP/filtered_metadata.tsv"

Filter using only metadata and save results as a list of filtered strains.

$ ${AUGUR} filter \
> --sequence-index filter/sequence_index.tsv \
> --metadata filter/metadata.tsv \
> --min-date 2012 \
> --min-length 10500 \
> --output-strains "$TMP/filtered_strains.txt" > /dev/null

Output should include only the 9 sequences matching the filters (without a header line).

$ wc -l "$TMP/filtered_strains.txt"
\s*9 .* (re)
$ rm -f "$TMP/filtered_strains.txt"

Filter using only metadata without a sequence index.
This should work because the requested filters don't rely on sequence information.

$ ${AUGUR} filter \
> --metadata filter/metadata.tsv \
> --min-date 2012 \
> --output-strains "$TMP/filtered_strains.txt" > /dev/null
$ rm -f "$TMP/filtered_strains.txt"

Try to filter using only metadata without a sequence index.
This should fail because the requested filters rely on sequence information.

$ ${AUGUR} filter \
> --metadata filter/metadata.tsv \
> --min-length 10000 \
> --output-strains "$TMP/filtered_strains.txt" > /dev/null
ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
[1]

Try to filter with sequence outputs and no sequence inputs.
This should fail.

$ ${AUGUR} filter \
> --sequence-index filter/sequence_index.tsv \
> --metadata filter/metadata.tsv \
> --min-length 10000 \
> --output "$TMP/filtered.fasta" > /dev/null
ERROR: You need to provide sequences to output sequences.
[1]

Try to filter without any outputs.

$ ${AUGUR} filter \
> --sequence-index filter/sequence_index.tsv \
> --metadata filter/metadata.tsv \
> --min-length 10000 > /dev/null
ERROR: You need to select at least one output.
[1]

Filter into two separate sets and then select sequences from the union of those sets.
First, select strains from Brazil (there should be 1).

$ ${AUGUR} filter \
> --metadata filter/metadata.tsv \
> --query "country == 'Brazil'" \
> --output-strains "$TMP/filtered_strains.brazil.txt" > /dev/null
$ wc -l "$TMP/filtered_strains.brazil.txt"
\s*1 .* (re)

Then, select strains from Colombia (there should be 3).

$ ${AUGUR} filter \
> --metadata filter/metadata.tsv \
> --query "country == 'Colombia'" \
> --output-strains "$TMP/filtered_strains.colombia.txt" > /dev/null
$ wc -l "$TMP/filtered_strains.colombia.txt"
\s*3 .* (re)

Finally, exclude all sequences except those from the two sets of strains (there should be 4).

$ ${AUGUR} filter \
> --sequences filter/sequences.fasta \
> --sequence-index filter/sequence_index.tsv \
> --metadata filter/metadata.tsv \
> --exclude-all \
> --include "$TMP/filtered_strains.brazil.txt" "$TMP/filtered_strains.colombia.txt" \
> --output "$TMP/filtered.fasta" > /dev/null
$ grep "^>" "$TMP/filtered.fasta" | wc -l
\s*4 (re)
$ rm -f "$TMP/filtered.fasta"

0 comments on commit f80ae36

Please sign in to comment.